# Imports

In [201]:
from elasticsearch_dsl import Search, Q, Range, connections
from elasticsearch.client import Elasticsearch
from tqdm.notebook import tqdm
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import re
from openai import OpenAI

In [202]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option('display.max_colwidth', None)

In [203]:
_ES_ENDPOINTS = {
    "prod": {"host": "ha-proxy-elasticsearch", "port": 9200},
    "test": {"host": "test-elasticsearch", "port": 9200},
}
ES_INDEX = "merged_events"

In [204]:
connections.create_connection(alias="ProductionEnvironment", hosts="http://ha-proxy-elasticsearch:9200", timeout= None)

<Elasticsearch([{'host': 'ha-proxy-elasticsearch', 'port': 9200}])>

# meslis

In [4]:
event_search = Search(using="ProductionEnvironment", index="merged_events")

In [5]:
city_query1 = (Q("match", mergedLocation__address__city="Dresden")   )
city_query2 = (Q("match", mergedLocation__address__city="Freiburg")   )
date_query1  =  Q({"range": {"startDate": { "gte": "2024-01-01","lt": "2024-04-01"}}})
date_query2  =  Q({"range": {"startDate": { "gte": "2024-09-03","lt": "2024-11-01"}}})

In [6]:

event_filter = event_search.query(
    (city_query1 | city_query2) & (date_query1 | date_query2)
)
event_filter.count()

3278

In [7]:
# names of postal codes
# gemaNo_address_postalCode
# mergedLocation_address_postalCode

In [7]:
meslis_events = []

for hit in tqdm(event_filter.scan(), total = event_filter.count() ):
    meslis_events.append(hit.to_dict())
    

  0%|          | 0/3276 [00:00<?, ?it/s]

In [8]:
len(meslis_events)

3276

In [9]:
def flatten_dict(d, parent_key='', sep='_'):
    """
    Recursively flatten a nested dictionary.
    """
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            # Recursively flatten nested dictionaries
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            # Handle lists: convert to string (comma-separated)
            items.append((new_key, ', '.join(map(str, v))))
        else:
            items.append((new_key, v))
    return dict(items)


In [10]:
flat_meslis_events= [flatten_dict(event) for event in meslis_events]

In [11]:
df_meslis = pd.DataFrame(flat_meslis_events)

In [12]:
df_meslis.mergedLocation_address_city.unique()

array(['Dresden', 'Freiburg im Breisgau', 'Freiburg im Breisgau, Germany',
       'Dresden, Germany', 'Freiburg', 'Freiburg/Elbe', 'Freiburg (Elbe)',
       'Dresden-Neustadt'], dtype=object)

In [17]:
df_meslis.columns

Index(['id', 'oldIds', 'isDeleted', 'eventName', 'startDate', 'endDate',
       'isCancelled', 'attendingCount', 'isLocationSameAsOrganizer',
       'mergedLocation_customers', 'mergedLocation_id',
       'mergedLocation_oldIds', 'mergedLocation_isDeleted',
       'mergedLocation_allPlaces', 'mergedLocation_statistic',
       'mergedLocation_hiddenForCustomers', 'mergedLocation_onlyForCustomer',
       'mergedLocation_name', 'mergedLocation_address_country_shortName',
       'mergedLocation_address_country_longName',
       'mergedLocation_address_region', 'mergedLocation_address_county',
       'mergedLocation_address_city', 'mergedLocation_address_district',
       'mergedLocation_address_postalCode', 'mergedLocation_address_street',
       'mergedLocation_address_houseNumber',
       'mergedLocation_address_streetHouseNumber',
       'mergedLocation_address_coordinate',
       'mergedLocation_address_addressDescription', 'mergedLocation_category',
       'mergedLocation_commercialId

In [18]:
len(df_meslis)

3276

In [19]:
df_meslis[['startDate', 'id']]

Unnamed: 0,startDate,id
0,2024-02-29T19:30:00,86779378
1,2024-02-20T15:00:00,86779372
2,2024-01-27T09:00:00,79479512
3,2024-01-13T14:00:00+01:00,72481218
4,2024-03-01T10:00:00,83603742
...,...,...
3271,2024-09-17T20:00:00,98434404
3272,2024-09-13T09:00:00+02:00,81225144
3273,2024-10-03T12:00:00,98462848
3274,2024-09-28T20:00:00,98425471


In [20]:
df_meslis[['startDate', 'id']] [df_meslis.startDate.str.endswith("30:00Z")]

Unnamed: 0,startDate,id
35,2024-09-28T15:30:00Z,94461007
36,2024-09-12T19:30:00Z,94461026
37,2024-10-23T20:30:00Z,94470989
558,2024-10-31T19:30:00Z,92446572
1583,2024-10-26T19:30:00Z,92473613
1728,2024-10-04T19:30:00Z,96686263
1878,2024-09-20T19:30:00Z,94461104
1881,2024-09-27T19:30:00Z,94460966
1882,2024-09-19T19:30:00Z,94460977
2106,2024-10-25T19:30:00Z,92463065


In [21]:
# make sure the date filter is correct
# sorted(df_meslis.startDate.str[:10].unique().tolist())

In [77]:
df_meslis.mergedLocation_address_city.unique()

array(['Dresden', 'Freiburg im Breisgau', 'Freiburg im Breisgau, Germany',
       'Dresden, Germany', 'Freiburg', 'Freiburg/Elbe', 'Freiburg (Elbe)',
       'Dresden-Neustadt'], dtype=object)

In [80]:
df_meslis = df_meslis[~df_meslis.mergedLocation_address_city.isin(["Freiburg/Elbe", "Freiburg (Elbe)"])]

In [86]:
sorted(df_meslis.mergedLocation_address_postalCode.unique().tolist())

['01056',
 '01067',
 '01069',
 '01097',
 '01099',
 '01108',
 '01109',
 '01127',
 '01129',
 '01139',
 '01156',
 '01157',
 '01159',
 '01169',
 '01187',
 '01189',
 '01217',
 '01219',
 '01237',
 '01239',
 '01257',
 '01259',
 '01277',
 '01279',
 '01307',
 '01309',
 '01324',
 '01326',
 '01328',
 '01445',
 '01454',
 '01465',
 '01705',
 '01737',
 '01833',
 '01844',
 '1307',
 '1705',
 '21729',
 '78727',
 '79098',
 '79100',
 '79102',
 '79104',
 '79106',
 '79108',
 '79110',
 '79111',
 '79112',
 '79114',
 '79115',
 '79117',
 '79211']

In [89]:
df_meslis = df_meslis[~(df_meslis.mergedLocation_address_postalCode=="21729")]

In [90]:
len(df_meslis)

3273

In [103]:
df_meslis.to_pickle(f"df_meslis {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pkl")

In [99]:
df_meslis.mergedLocation_address_city.unique()

array(['Dresden', 'Freiburg im Breisgau', 'Freiburg im Breisgau, Germany',
       'Dresden, Germany', 'Freiburg', 'Dresden-Neustadt'], dtype=object)

In [102]:
df_meslis.mergedLocation_address_city = df_meslis.mergedLocation_address_city.str.replace(", Germany", "").str.replace("-Neustadt", "")

# mns

In [22]:
event_search = Search(using="ProductionEnvironment", index="gema_nutzungsfaelle_v5")

In [156]:
# exclude: 4,5,6,7,8 incl

In [23]:
city_query1 = (Q("match", gemaNo__address__city="Dresden")   )
city_query2 = (Q("match", gemaNo__address__city="Freiburg")   )
date_query1  =  Q({"range": {"startDate": { "gte": "2024-01-01","lt": "2024-04-01"}}})
date_query2  =  Q({"range": {"startDate": { "gte": "2024-09-03","lt": "2024-11-01"}}})

In [25]:

event_filter = event_search.query(
    (city_query1 | city_query2) & (date_query1 | date_query2)
)
event_filter.count()

6605

In [26]:
gema_events = []

for hit in tqdm(event_filter.scan(), total = event_filter.count() ):
    gema_events.append(hit.to_dict())
    

  0%|          | 0/6605 [00:00<?, ?it/s]

In [27]:
len(gema_events)

6605

In [28]:
flat_gema_events= [flatten_dict(event) for event in gema_events]

In [29]:
df_gema = pd.DataFrame(flat_gema_events)

In [30]:
df_gema.gemaNo_address_city.unique()

array(['Freiburg im Breisgau', 'Dresden', 'Freiburg (Elbe)', 'Freiburg',
       'Kreischa b Dresden', 'Arnsdorf b Dresden', 'Fischbach b Dresden',
       'Coswig b Dresden', 'Moritzburg b Dresden', 'Wallroda b Dresden'],
      dtype=object)

In [69]:
df_gema = df_gema[~ df_gema.gemaNo_address_city.isin(["Freiburg (Elbe)"])]

In [76]:
# check that postal code is of Freiburg im Breisgau
# df_gema[df_gema.gemaNo_address_city.isin(["Freiburg im Breisgau"])].gemaNo_address_postalCode.sort_values()

In [135]:
#make sure the date filter is correct
#sorted(df_gema.startDate.str[:10].unique().tolist())

In [35]:
len(df_gema)

6605

In [38]:
df_gema.dropna(subset="gemaNo_address_coordinate_lat").gemaNo_address_coordinate_lat

0       47.983291
1       51.050214
2       48.001077
3       51.139809
4       51.051018
          ...    
6600    51.052449
6601    51.052449
6602    51.052449
6603    51.060531
6604    51.060531
Name: gemaNo_address_coordinate_lat, Length: 6597, dtype: float64

In [45]:
df_gema.dropna(subset="gemaNo_address_coordinate_lat")[["gemaNo_address_street", "gemaNo_address_streetHouseNumber"]]

Unnamed: 0,gemaNo_address_street,gemaNo_address_streetHouseNumber
0,Mercystr.,Mercystr. 6-14
1,Wilsdruffer Str.,Wilsdruffer Str. 2
2,Lehener Str.,Lehener Str. 47
3,Platz des Friedens,Platz des Friedens 107
4,Schloßstr.,Schloßstr. 2
...,...,...
6600,Brühlscher Garten,Brühlscher Garten 1
6601,Brühlscher Garten,Brühlscher Garten 1
6602,Brühlscher Garten,Brühlscher Garten 1
6603,An der Dreikönigskirche,An der Dreikönigskirche 1a


In [41]:
#df_gema.gemaNo_address_coordinate_lat.unique()

In [98]:
#check postal codes
#sorted(df_gema.gemaNo_address_postalCode.unique().tolist())

In [152]:
df_gema.to_pickle(f"df_gema {datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pkl")

# Apify

In [8]:
os.walk("Downloads")

<generator object walk at 0x0000024D702469F0>

In [14]:
os.getcwd()

'C:\\Users\\valentyna.sinichenko\\Jupyter\\Thesis'

In [15]:
json_files = []
for root, dirs, files in os.walk('C:\\Users\\valentyna.sinichenko\\Downloads'):
    for file in files:
        if file.endswith(".json"):
            json_files.append(os.path.join(root, file))


In [43]:
all_json_data = []
known_source_urls = set()

In [87]:
processed_files = []

for file_path in tqdm(json_files, total = len(json_files)):
    try:
        with open(file_path, "r", encoding="utf8") as f:
            data = json.load(f)
        if "apifyParsingAddressLogic" in data[0].keys():
            continue
        if "event" not in data[0].keys():
            continue
    
        for record in tqdm(data, total = len(data)):
            cur_source_url = record["event"]["sourceInformation"]["sourceUrl"]
            if cur_source_url not in known_source_urls:
                all_json_data.append(record)
                known_source_urls.add(cur_source_url)
    
        print(f"{len(all_json_data)=}")
        processed_files.append(file_path)
    except Exception as e:
        print(f"{e=}")
        print(file_path.upper())
        
    

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/11512 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/11512 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/11512 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/11692 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/12174 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/12174 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/12145 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/12145 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/12145 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/12145 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/11329 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/12681 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/11664 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/115 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/115 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/1336 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/121472 [00:00<?, ?it/s]

len(all_json_data)=197785
e=JSONDecodeError('Expecting value: line 972661 column 3 (char 33031111)')
C:\USERS\VALENTYNA.SINICHENKO\DOWNLOADS\DATASET_EVENTIM_2024-10-09_12-50-07-771.JSON


  0%|          | 0/16814 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/16814 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/16814 [00:00<?, ?it/s]

len(all_json_data)=197785


  0%|          | 0/15709 [00:00<?, ?it/s]

len(all_json_data)=197785
e=JSONDecodeError('Expecting value: line 734161 column 3 (char 43933376)')
C:\USERS\VALENTYNA.SINICHENKO\DOWNLOADS\DATASET_SKIDDLE_2024-10-26_19-35-48-146.JSON
e=KeyError(0)
C:\USERS\VALENTYNA.SINICHENKO\DOWNLOADS\MAIN - 2024-08-12 08_34_30 (1).JSON
e=KeyError(0)
C:\USERS\VALENTYNA.SINICHENKO\DOWNLOADS\MAIN - 2024-08-12 08_34_30.JSON
e=JSONDecodeError('Extra data: line 110 column 2 (char 6018)')
C:\USERS\VALENTYNA.SINICHENKO\DOWNLOADS\SAMPLE TICKETMASTER DATA.JSON


In [88]:
not_processed_files = [file for file in json_files if file not in processed_files]

In [89]:
not_processed_files

['C:\\Users\\valentyna.sinichenko\\Downloads\\dataset_deecoob-address-parser_2024-10-07_12-55-32-502.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\dataset_deecoob-address-parser_2024-10-09_14-19-37-965.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\dataset_eventim_2024-10-09_12-50-07-771.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\dataset_skiddle_2024-10-26_19-35-48-146.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\Main - 2024-08-12 08_34_30 (1).json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\Main - 2024-08-12 08_34_30.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\sample ticketmaster data.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\old\\dataset_google-maps-extractor-crm-enrichment_2024-05-03_12-41-15-888.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\old 3\\dataset_deecoob-address-parser_2024-10-07_12-55-32-502.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\old 3\\dataset_deecoob-address-parser_2024-10-09_14-19-37-965.json']

In [90]:
not_processed_files = [file for file in not_processed_files if "address-parser" not in file]

In [91]:
not_processed_files = [file for file in not_processed_files if "Main" not in file]

In [92]:
not_processed_files = [file for file in not_processed_files if "google-maps" not in file]

In [93]:
for file_path in tqdm(not_processed_files, total = len(not_processed_files)):
    try:
        with open(file_path, "r") as f:
            data = json.load(f)
        if "apifyParsingAddressLogic" in data[0].keys():
            continue
        if "event" not in data[0].keys():
            continue
    
        for record in tqdm(data, total = len(data)):
            cur_source_url = record["event"]["sourceInformation"]["sourceUrl"]
            if cur_source_url not in known_source_urls:
                all_json_data.append(record)
                known_source_urls.add(cur_source_url)
    
        print(f"{len(all_json_data)=}")
        processed_files.append(file_path)
    except Exception as e:
        print(f"{e=}")
        print(file_path.upper())
        
    

  0%|          | 0/3 [00:00<?, ?it/s]

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [96]:
not_processed_files

['C:\\Users\\valentyna.sinichenko\\Downloads\\dataset_eventim_2024-10-09_12-50-07-771.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\dataset_skiddle_2024-10-26_19-35-48-146.json',
 'C:\\Users\\valentyna.sinichenko\\Downloads\\sample ticketmaster data.json']

## keep only relevant cities

In [116]:
all_json_data[0]["location"]["address"]["city"]

'Glasgow'

In [114]:
all_json_data[0]["location"]["address"]["rawAddress"]

In [128]:
{}.get("smth")

In [132]:
all_relevant_location_data = []
cities = []

for record in tqdm(all_json_data):
    try:    
        city = record.get("location", {}).get("address", {}).get("city", "")
    except Exception as e:
        print(e)

        continue

    try:
        raw_address = record.get("location", {}).get("address", {}).get("rawAddress", "")
    except Exception as e:
        print(e)

        continue
        

    if city:
        if "Dresden" in city or "Freiburg" in city:
            print(city)
            all_relevant_location_data.append(record)
            cities.append(city)
            continue
    if raw_address:
        if "Dresden" in raw_address or "Freiburg" in raw_address:
            print(raw_address)
            all_relevant_location_data.append(record)
            continue
            
    
    
    

  0%|          | 0/197785 [00:00<?, ?it/s]

'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
Dresden
'NoneType' object has no attribute 'get'
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
Dresden
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
Dresden
Dresden
Dresden
Freiburg
Freiburg
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
'NoneType' object has no attribute 'get'
Dresden
Dresden
Dresden
Dresden
Dresden
'NoneType' object has no attribute 'get'
Dresden
Dresden
Freiburg
Dresden
Dresden

In [134]:
set(cities)

{'Dresden', 'Freiburg', 'Freiburg (zähringen)', 'Freiburg im breisgau'}

In [135]:
len(all_relevant_location_data)

4276

In [159]:
pd.to_datetime(all_relevant_location_data[200].get("event").get("startDateTime")).tz_convert("UTC")

Timestamp('2024-10-05 18:30:00+0000', tz='UTC')

In [160]:
pd.to_datetime(all_relevant_location_data[200].get("event").get("startDateTime"))

Timestamp('2024-10-05 20:30:00+0200', tz='UTC+02:00')

In [155]:
print())
print(all_relevant_location_data[200].get("event").get("startDateTime"))


TypeError: Cannot localize tz-aware Timestamp, use tz_convert for conversions

In [95]:
# keep only Dresden and Freiburg

In [149]:
dt = pd.to_datetime(all_relevant_location_data[100].get("event").get("startDateTime"))

In [150]:
dt

Timestamp('2024-12-31 13:00:00+0000', tz='UTC')

In [144]:
type(dt)

pandas._libs.tslibs.timestamps.Timestamp

In [145]:
pd.to_datetime("2024-01-01")

Timestamp('2024-01-01 00:00:00')

## keep only relevant dates

In [162]:
all_relevant_location_and_date_data = []
start_date_1 = pd.to_datetime("2024-01-01").tz_localize('UTC')
end_date_1 = pd.to_datetime("2024-04-01").tz_localize('UTC')
start_date_2 = pd.to_datetime("2024-09-01").tz_localize('UTC')
end_date_2 = pd.to_datetime("2024-11-01").tz_localize('UTC')

for record in tqdm(all_relevant_location_data):   
    event_start_date = record.get("event").get("startDateTime")
    event_start_date = pd.to_datetime(event_start_date).tz_convert("UTC")

    if start_date_1 <=event_start_date <= end_date_1 or start_date_2 <=event_start_date <= end_date_2:
        all_relevant_location_and_date_data.append(record)
        
            
    
    
    

  0%|          | 0/4276 [00:00<?, ?it/s]

In [163]:
len(all_relevant_location_and_date_data)

676

In [165]:
with open("apify original DD FB.json", 'w') as json_file:
    json.dump(all_relevant_location_and_date_data, json_file, indent=4) 