# setup

In [227]:
#from elasticsearch_dsl import Search, Q, Range, connections
#from elasticsearch.client import Elasticsearch
from tqdm.notebook import tqdm
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import re
from openai import OpenAI
import numpy as np
import ast

In [135]:
# Base folder of the event pickles; the trailing slash lets the cells below
# build paths by plain string concatenation.
EVENTS_FOLDER = "../data/events/"

In [136]:
# Notebook display settings: no limit on shown columns, at most 20 printed
# rows, and no truncation of cell contents.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 20),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [137]:
# Load the events frame that the meslis events are appended to further down.
# NOTE: unpickling can execute arbitrary code - only load pickles from a trusted source.
df_events = pd.read_pickle(f"{EVENTS_FOLDER}df_events have to add meslis 2025-01-07_22-23-01.pkl")

In [None]:
# Display the events in random row order.
# NOTE(review): the shuffled frame is not assigned back, so df_events itself
# keeps its original order - confirm that a preview is all that is intended.
df_events.sample(frac=1).reset_index(drop=True)

# meslis

In [139]:
# Load the meslis events frame (pickle snapshot named 2024-12-19_12-41-21).
df_meslis = pd.read_pickle(f"{EVENTS_FOLDER}meslis/df_meslis 2024-12-19_12-41-21.pkl")

In [None]:
df_meslis.isCancelled.value_counts()

In [141]:
# Discard cancelled events: `flt` marks the rows whose isCancelled is True.
flt = df_meslis["isCancelled"].eq(True)
df_meslis = df_meslis.loc[~flt]

### standardize columns

In [142]:
# Mapping from the meslis field names to the standardized column names.
meslis_column_names = {
    "id": "origin_id",
    "startDate": "StartDateTime",
    "endDate": "EndDateTime",
    "mergedLocation_name": "LocationName",
    "mergedLocation_address_city": "City",
    "mergedLocation_address_street": "Street",
    "mergedLocation_address_houseNumber": "HouseNumber",
    "mergedLocation_address_streetHouseNumber": "StreetHouseNumber",
    "mergedLocation_coordinate_lat": "lat",
    "mergedLocation_coordinate_lon": "lon",
}
# Renamed in place so df_meslis stays the same object.
df_meslis.rename(columns=meslis_column_names, inplace=True)

In [None]:
df_meslis.isDeleted.value_counts()

In [144]:
# Keep only the rows whose isDeleted flag equals False.
# The explicit .copy() makes df_meslis an independent frame, so the column
# assignments in the following cells do not write into a filtered slice of the
# previous frame (which triggers pandas' SettingWithCopyWarning).
df_meslis = df_meslis[df_meslis.isDeleted == False].copy()

In [145]:
# for col in df_meslis.columns:
#     if "cate" in col.lower():
#         print(col)
        

In [146]:
# df_meslis.mergedLocation_category.unique()

In [None]:
df_meslis.head()

In [148]:
# Flag column, set to False for every meslis row.
df_meslis['StartTimeIsAssumed']=False

In [149]:
# Create empty (None) placeholder columns on df_meslis.
placeholder_columns = (
    "StartDate",
    "StartTime",
    "EndTime",
    "EndDate",
    "LocationDetails",
    "EventCategory",
)
for col in placeholder_columns:
    df_meslis[col] = None

In [150]:
def define_meslis_source(ad):
    """Classify an advertisement text by the first known site it mentions.

    Args:
        ad: Advertisement text. ``None``, a float NaN and the empty string
            are all treated as "no advertisement".

    Returns:
        ``"unknown"`` when there is no advertisement, the first entry of the
        known-site list that occurs in ``ad`` (list order decides when several
        match), or ``"other"`` when none of them occurs.
    """
    # Missing values would otherwise fail with a TypeError in the `in` test below.
    if ad is None or (isinstance(ad, float) and ad != ad) or ad == "":
        return "unknown"
    known_links = (
        "songkick.com",
        "partyflock",
        "livenation",
        "eventbrite.com",
        "setlist.fm",
        "bandsintown.com",
        "wegow.com",
        "concertful.com",
        "jambase.com",
        "facebook.com",
    )
    return next((link for link in known_links if link in ad), "other")

In [151]:
# Replace missing advertisements with '' first, then classify each
# advertisement text into the Source column.
advertisements = df_meslis['eventAdvertisements'].fillna('')
df_meslis['eventAdvertisements'] = advertisements
df_meslis['Source'] = advertisements.map(define_meslis_source)

In [None]:
df_meslis['Source'].value_counts()

In [None]:
flt = df_meslis['Source']=="unknown"
df_meslis.loc[flt]

In [None]:
df_meslis.columns

In [155]:
# Rename eventAdvertisements to EventDescription (done after Source was derived from it).
df_meslis.rename(columns = {"eventAdvertisements": "EventDescription"}, inplace=True)

In [None]:
for col in df_events.columns:
    if col not in df_meslis.columns:
        print(col)

In [157]:
# Placeholder so the column exists on df_meslis; it is overwritten further
# down, once Duration has been computed.
df_meslis['isOnMultipleDays']=None # TODO add this information

In [158]:
# Drop, in one call, every df_meslis column that df_events does not have.
extra_columns = [name for name in df_meslis.columns if name not in df_events.columns]
df_meslis.drop(columns=extra_columns, inplace=True, errors='ignore')
        


### date field transformations

In [159]:
# The data carry incorrect time zone offsets - strip them all before the datetime conversion.

In [160]:
# '0001-01-01T00:00:00Z' is treated as "no end date": blank those cells.
flt = df_meslis["EndDateTime"].eq('0001-01-01T00:00:00Z')
df_meslis.loc[flt, "EndDateTime"] = None

In [161]:
# Cast both timestamp columns to str so the .str operations below can be used.
df_meslis["StartDateTime"] = df_meslis["StartDateTime"].astype(str)
df_meslis["EndDateTime"] = df_meslis["EndDateTime"].astype(str)

In [None]:
df_meslis.StartDateTime

In [163]:
# Remove the time zone suffixes from both timestamp columns (literal,
# non-regex replacement; the suffixes are applied in the listed order).
for colname in ['StartDateTime', "EndDateTime"]:
    cleaned = df_meslis[colname]
    for str_part in ["+02:00", "+01:00", "+00:00", "Z"]:
        cleaned = cleaned.str.replace(str_part, "", regex=False)
    df_meslis[colname] = cleaned

In [164]:
# Helper columns with the string length of each timestamp, used to inspect
# which formats occur; falsy end values count as length 0.
df_meslis["StartDateTime_len"] = df_meslis["StartDateTime"].apply(len)
df_meslis["EndDateTime_len"] = df_meslis["EndDateTime"].apply(
    lambda value: 0 if not value else len(value)
)

In [None]:
df_meslis.StartDateTime_len.value_counts()

In [None]:
df_meslis.EndDateTime_len.value_counts()

In [167]:
# Remove the literal text "None" from the end timestamps (presumably produced
# when astype(str) stringified the blanked-out end dates - verify).
df_meslis.EndDateTime = df_meslis.EndDateTime.str.replace("None", "", regex=False)

In [168]:
# add 1 day to end date when the interval is negative 

In [169]:
# Parse the cleaned start strings as ISO 8601 timestamps.
df_meslis.StartDateTime = pd.to_datetime(df_meslis.StartDateTime, format='ISO8601')

In [170]:
# sorted(df_meslis.EndDateTime.dropna().unique().tolist())

In [171]:
# Parse the cleaned end strings as ISO 8601 timestamps.
# NOTE(review): yearfirst=True is passed together with an explicit format -
# presumably redundant; confirm before removing.
df_meslis.EndDateTime = pd.to_datetime(df_meslis.EndDateTime, format='ISO8601', yearfirst=True)

In [172]:
# Remove the *_len helper columns; errors="ignore" keeps the cell re-runnable
# when they are already gone.
df_meslis.drop(columns=["StartDateTime_len", "EndDateTime_len"], inplace=True, errors="ignore")

In [173]:

# Event length: end timestamp minus start timestamp.
df_meslis['Duration'] =  df_meslis.EndDateTime - df_meslis.StartDateTime

In [174]:
# sorted(df_meslis.Duration.dropna().unique().tolist())

In [None]:
df_meslis.EndDateTime.dt.hour

In [176]:
# An end before the start is taken to mean "ends on the following day":
# push EndDateTime one day forward for every negative interval.
flt = df_meslis["Duration"] < pd.Timedelta(0)
df_meslis.loc[flt, "EndDateTime"] += pd.Timedelta(days=1)


In [177]:
# Recompute Duration now that negative intervals had their end shifted by one day.
df_meslis['Duration'] =  df_meslis.EndDateTime - df_meslis.StartDateTime

In [178]:
# An end identical to the start carries no information: clear EndDateTime there.
flt = df_meslis["StartDateTime"].eq(df_meslis["EndDateTime"])
df_meslis.loc[flt, "EndDateTime"] = None

In [179]:
# Recompute Duration after the zero-length events had their EndDateTime cleared.
df_meslis['Duration'] =  df_meslis.EndDateTime - df_meslis.StartDateTime

In [None]:
df_meslis['Duration'].value_counts().sort_index()

In [181]:
# Default every event to single-day; events longer than one day are flagged afterwards.
df_meslis['isOnMultipleDays']=False


In [182]:
# Flag events whose duration is strictly longer than one day (exactly 24 h does not count).
flt = df_meslis["Duration"].gt(pd.Timedelta(days=1))
df_meslis.loc[flt, "isOnMultipleDays"] = True


In [183]:
# Events not flagged as multi-day can be appended to df_events unchanged;
# take an independent copy of them.
flt = df_meslis["isOnMultipleDays"].eq(False)
df_meslis_to_add = df_meslis.loc[flt].copy()

In [184]:
# The remaining rows (isOnMultipleDays is not False) still have to be expanded
# into per-day rows; relies on `flt` from the previous cell.
df_meslis_left = df_meslis[~flt]

In [None]:
flt = df_meslis_left.StartDateTime.dt.hour < df_meslis_left.EndDateTime.dt.hour 
print(len(df_meslis_left))
print(len(df_meslis_left[flt]))

In [None]:
df_events.columns

In [None]:
# Multi-day events whose start hour is earlier than their end hour: emit one
# row per calendar day from the start date to the end date, each carrying the
# same StartTime / EndTime.
rows_to_append = []
flt = df_meslis_left.StartDateTime.dt.hour < df_meslis_left.EndDateTime.dt.hour
for _idx, row in df_meslis_left.loc[flt].iterrows():
    start_date_time, end_date_time = row["StartDateTime"], row["EndDateTime"]
    print(f"{start_date_time=}")
    print(f"{end_date_time=}")
    date_range = pd.date_range(start_date_time.date(), end_date_time.date(), freq='D')
    print(f"{date_range=}")

    # The clock times are identical for every generated day: format them once.
    start_time = start_date_time.strftime('%H:%M')
    end_time = end_date_time.strftime('%H:%M')
    for date in date_range:
        row_to_append = row.copy()
        row_to_append["StartDate"] = date.strftime('%Y-%m-%d')
        # Per-day rows carry no combined timestamps.
        row_to_append["StartDateTime"] = None
        row_to_append["EndDateTime"] = None
        row_to_append["StartTime"] = start_time
        row_to_append["EndTime"] = end_time
        rows_to_append.append(row_to_append)

In [None]:
len(rows_to_append)

In [190]:
# Remove the events that were just expanded; `flt` is still the
# start-hour < end-hour mask computed in the expansion cell above.
df_meslis_left = df_meslis_left[~flt]

In [None]:
# Expand every remaining multi-day event into one row per calendar day.
# The first day starts at the event's start time and the last day ends at the
# event's end time; all other day boundaries are filled with 00:00 / 23:59.
# (A leftover debug print/break used to stop this loop after the first event,
# so all other remaining events were lost.)
# NOTE(review): unlike the previous expansion loop, StartDateTime/EndDateTime
# are kept on the generated rows - confirm that this difference is intended.
rows_to_append_2 = []
for _idx, row in df_meslis_left.sort_values("Duration").iterrows():
    start_date_time = row["StartDateTime"]
    end_date_time = row["EndDateTime"]
    date_range = pd.date_range(start_date_time.date(), end_date_time.date(), freq='D')
    print(f"{date_range=}")

    # Loop-invariant values, computed once per event.
    first_day = start_date_time.date()
    last_day = end_date_time.date()
    start_time = start_date_time.strftime('%H:%M')
    end_time = end_date_time.strftime('%H:%M')

    for date in date_range:
        row_to_append = row.copy()
        row_to_append["StartDate"] = date.strftime('%Y-%m-%d')
        row_to_append["StartTime"] = start_time if date.date() == first_day else "00:00"
        row_to_append["EndTime"] = end_time if date.date() == last_day else "23:59"
        rows_to_append_2.append(row_to_append)

In [193]:
# pd.DataFrame(rows_to_append_2)

In [None]:
# Append to df_events: the single-day meslis events plus the per-day rows
# built from the multi-day events; the index is renumbered.
df_events = pd.concat([df_events, df_meslis_to_add, pd.DataFrame(rows_to_append), pd.DataFrame(rows_to_append_2)], ignore_index=True)

In [None]:
df_events

In [None]:
# flt = df_meslis.Duration.isna()
# df_meslis.loc[~flt].sort_values("Duration").tail(20)


### remove matched meslis events from gema dataset

In [263]:
# Reload the unmodified meslis pickle; the `id` and `customers` columns used
# below are read from this raw frame.
df_meslis_original = pd.read_pickle(f"{EVENTS_FOLDER}meslis/df_meslis 2024-12-19_12-41-21.pkl")

In [None]:
len(df_meslis_original)

In [None]:
# Print the last 30 meslis ids on one line, space-separated, for spot checks.
# The loop variable is deliberately not called `id`, so the builtin id()
# stays usable for the rest of the session.
for meslis_id in df_meslis_original.id.tail(30):
    print(meslis_id, end=" ")

In [None]:
flt = df_events.origin_id.astype(str).str.contains("1006873109")
df_events[flt]

In [None]:
df_meslis_original.mergedLocation_customers.value_counts()

In [268]:
# Work on an independent copy holding just the id and customers columns.
df_subset = df_meslis_original[['id', "customers"]].copy()

In [269]:
# Rows without a customers value cannot contribute matched ids.
df_subset = df_subset.dropna(subset=["customers"])

In [270]:
# Empty strings cannot be parsed as a literal: drop those rows as well.
flt = df_subset["customers"].eq("")
df_subset = df_subset.loc[~flt]

In [271]:
# Turn the stored customers text into Python objects; literal_eval only
# accepts literals, so no arbitrary code is executed.
df_subset.customers = df_subset.customers.map(ast.literal_eval)

In [272]:
# len() of each parsed customers value.
df_subset['customer_len'] = df_subset["customers"].map(len)

In [None]:
df_subset.customer_len.value_counts()

In [274]:
# Rows with customer_len == 2 are taken to hold a two-element sequence of
# customers (assumption - TODO confirm); split it into one column per customer.
flt = df_subset.customer_len==2
df_subset.loc[flt, "customer_1"] = df_subset.loc[flt].customers.apply(lambda x: x[0])
# Second customer is at index 1 (index 0 here would merely duplicate
# customer_1 and lose the second customer's matched ids).
df_subset.loc[flt, "customer_2"] = df_subset.loc[flt].customers.apply(lambda x: x[1])

In [275]:
# These rows were split into customer_1 / customer_2; clear `customers` so
# the dropna() on that column further down skips them.
df_subset.loc[df_subset.customer_len==2, "customers"]=None

In [276]:
# Accumulates the matchedEventIds values found on the meslis customers;
# filled by the cells that follow.
gema_matched_ids_to_exclude = []

In [277]:
# matchedEventIds of the rows that still have a `customers` value; entries
# without that key (None) are dropped.
matched_ids = df_subset["customers"].dropna().apply(lambda customer: customer.get("matchedEventIds"))
gema_matched_ids_to_exclude.extend(matched_ids.dropna().tolist())

In [278]:
# matchedEventIds taken from the customer_1 column; entries without that key
# (None) are dropped.
matched_ids_1 = df_subset["customer_1"].dropna().apply(lambda customer: customer.get("matchedEventIds"))
gema_matched_ids_to_exclude.extend(matched_ids_1.dropna().tolist())

In [279]:
# matchedEventIds taken from the customer_2 column; entries without that key
# (None) are dropped.
matched_ids_2 = df_subset["customer_2"].dropna().apply(lambda customer: customer.get("matchedEventIds"))
gema_matched_ids_to_exclude.extend(matched_ids_2.dropna().tolist())

In [280]:
# Discard the entries that are empty lists.
gema_matched_ids_to_exclude = list(
    filter(lambda id_group: id_group != [], gema_matched_ids_to_exclude)
)

In [282]:
# Flatten the list of id lists into one flat list of ids.
flattened_ids = []
for id_group in gema_matched_ids_to_exclude:
    flattened_ids.extend(id_group)
gema_matched_ids_to_exclude = flattened_ids

In [None]:
len(gema_matched_ids_to_exclude)

In [287]:
# Compare ids as strings: convert every collected id with str().
gema_matched_ids_to_exclude = list(map(str, gema_matched_ids_to_exclude))

In [None]:
len(df_events)

In [None]:
# Drop the events whose origin_id (compared as a string) is one of the matched
# ids. Prints the number of removed rows, then the number of rows left.
flt = df_events["origin_id"].astype(str).isin(gema_matched_ids_to_exclude)
print(int(flt.sum()))
df_events = df_events.loc[~flt]
print(len(df_events))

In [52]:
# Save the combined events under a timestamped file name.
# EVENTS_FOLDER already ends with "/", so no extra separator is inserted
# (the path previously contained a double slash).
df_events.to_pickle(f"{EVENTS_FOLDER}df_events {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pkl")