# Imports

In [1]:
from elasticsearch_dsl import Search, Q, Range, connections
from elasticsearch.client import Elasticsearch
from tqdm.notebook import tqdm
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import re
#from openai import OpenAI

In [2]:
# Lift pandas' display limits: frames are shown with all columns, all rows
# and untruncated cell contents.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Host and port of the Elasticsearch endpoint, keyed by environment name.
_ES_ENDPOINTS = dict(
    prod=dict(host="ha-proxy-elasticsearch", port=9200),
    test=dict(host="test-elasticsearch", port=9200),
)
# Name of the merged-events index.
ES_INDEX = "merged_events"

In [None]:
# Register the connection alias that the Search objects in this notebook use.
# The URL is built from _ES_ENDPOINTS so host and port are defined in one place
# instead of being repeated as a literal here.
connections.create_connection(
    alias="ProductionEnvironment",
    hosts="http://{host}:{port}".format(**_ES_ENDPOINTS["prod"]),
    timeout=None,
)

# meslis

In [5]:
# Search over the merged-events index through the registered production
# connection; the index name comes from ES_INDEX rather than a second literal.
event_search = Search(using="ProductionEnvironment", index=ES_INDEX)

In [6]:
# City filters: "match" queries on the merged location's city field.
city_query_Dresden = Q("match", mergedLocation__address__city="Dresden")
# Matching "Freiburg" also returns longer variants such as "Freiburg am Breisgau".
city_query_Freiburg = Q("match", mergedLocation__address__city="Freiburg")

# Start-date windows: lower bound inclusive (gte), upper bound exclusive (lt).
date_query_DD = Q("range", startDate={"gte": "2024-01-01", "lt": "2024-04-01"})
date_query_FB = Q("range", startDate={"gte": "2023-06-01", "lt": "2023-08-01"})
date_query_both = Q("range", startDate={"gte": "2024-09-03", "lt": "2024-11-01"})

In [None]:
# An event qualifies if it is in either city during the shared window, or in
# Dresden during the Dresden window, or in Freiburg during the Freiburg window.
_either_city = city_query_Dresden | city_query_Freiburg
event_filter = event_search.query(
    (_either_city & date_query_both)
    | (city_query_Dresden & date_query_DD)
    | (city_query_Freiburg & date_query_FB)
)
# Number of matching documents (shown as the cell output).
event_filter.count()

In [8]:
# Postal-code column names in the flattened frames:
#   gema frame:   gemaNo_address_postalCode
#   meslis frame: mergedLocation_address_postalCode

In [None]:
# Stream every matching hit and keep it as a plain dict; count() is only used
# to give the progress bar its total.
meslis_events = [
    hit.to_dict()
    for hit in tqdm(event_filter.scan(), total=event_filter.count())
]
    

In [None]:
# Number of hits collected by the scan.
len(meslis_events)

In [11]:
def flatten_dict(d, parent_key='', sep='_'):
    """
    Flatten a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their parent key with ``sep``.
    List values are rendered as one string: every element is passed through
    ``str()`` and the results are joined with ", ". All other values are
    copied unchanged.

    Args:
        d: Dictionary to flatten.
        parent_key: Prefix put in front of every key; an empty prefix means
            the keys of ``d`` are used as they are.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dictionary. An empty nested dictionary contributes no key.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if isinstance(value, dict):
            # Descend, carrying the path built so far as the new prefix.
            flat.update(flatten_dict(value, path, sep=sep))
        elif isinstance(value, list):
            # Lists collapse into a single comma-separated string.
            flat[path] = ', '.join(str(element) for element in value)
        else:
            flat[path] = value
    return flat


In [12]:
# One flat dict per event, ready to become a DataFrame row.
flat_meslis_events = list(map(flatten_dict, meslis_events))

In [13]:
# Tabular view of the flattened events: one row per event, one column per flattened key.
df_meslis = pd.DataFrame(data=flat_meslis_events)

In [None]:
# Distinct city values returned by the query.
df_meslis.mergedLocation_address_city.unique()

In [None]:
# Column names produced by the flattening.
df_meslis.columns

In [None]:
# Row count before any rows are filtered out.
len(df_meslis)

In [None]:
# Peek at the first rows' start date and id.
df_meslis[['startDate', 'id']].head()

In [None]:
# Rows whose startDate string ends with "30:00Z".
df_meslis[['startDate', 'id']] [df_meslis.startDate.str.endswith("30:00Z")]

In [None]:
# Sorted distinct start days (first 10 characters of startDate), to eyeball the date filter.
sorted(df_meslis.startDate.str[:10].unique().tolist())

In [None]:
# Distinct city values, inspected before unwanted cities are removed.
df_meslis.mergedLocation_address_city.unique()

In [23]:
# Drop rows whose city is one of the "Freiburg ... Elbe" spellings.
_elbe_spellings = ["Freiburg/Elbe", "Freiburg (Elbe)"]
_is_elbe = df_meslis.mergedLocation_address_city.isin(_elbe_spellings)
df_meslis = df_meslis[~_is_elbe]

In [None]:
# Sorted distinct postal codes of the remaining rows.
sorted(df_meslis.mergedLocation_address_postalCode.unique().tolist())

In [25]:
# Keep every row whose postal code is not "21729".
df_meslis = df_meslis[df_meslis.mergedLocation_address_postalCode != "21729"]

In [None]:
# Row count after the city and postal-code filters.
len(df_meslis)

In [None]:
# Distinct city values left after filtering.
df_meslis.mergedLocation_address_city.unique()

In [29]:
# Normalise city names by removing the ", Germany" suffix and the "-Neustadt" part.
# assign() returns a new frame, so nothing is written into a frame that was
# produced by boolean filtering (avoids pandas' SettingWithCopyWarning);
# regex=False makes both replacements literal regardless of the pandas version.
df_meslis = df_meslis.assign(
    mergedLocation_address_city=df_meslis.mergedLocation_address_city
    .str.replace(", Germany", "", regex=False)
    .str.replace("-Neustadt", "", regex=False)
)

In [31]:
# Persist the frame; the timestamp in the file name keeps earlier exports from being overwritten.
df_meslis.to_pickle(f"../data/events/meslis/df_meslis {datetime.now():%Y-%m-%d_%H-%M-%S}.pkl")

# mns

In [32]:
# Search over the gema_nutzungsfaelle_v5 index through the production connection.
event_search = Search(index="gema_nutzungsfaelle_v5").using("ProductionEnvironment")

In [33]:
# City filters: "match" queries on the gemaNo address city field.
city_query_Dresden = Q("match", gemaNo__address__city="Dresden")
city_query_Freiburg = Q("match", gemaNo__address__city="Freiburg")

# Start-date windows: lower bound inclusive (gte), upper bound exclusive (lt).
date_query_DD = Q("range", startDate={"gte": "2024-01-01", "lt": "2024-04-01"})
date_query_FB = Q("range", startDate={"gte": "2023-06-01", "lt": "2023-08-01"})
date_query_both = Q("range", startDate={"gte": "2024-09-03", "lt": "2024-11-01"})

In [None]:
# An event qualifies if it is in either city during the shared window, or in
# Dresden during the Dresden window, or in Freiburg during the Freiburg window.
_either_city = city_query_Dresden | city_query_Freiburg
event_filter = event_search.query(
    (_either_city & date_query_both)
    | (city_query_Dresden & date_query_DD)
    | (city_query_Freiburg & date_query_FB)
)
# Number of matching documents (shown as the cell output).
event_filter.count()

In [None]:
# Stream every matching hit and keep it as a plain dict; count() is only used
# to give the progress bar its total.
gema_events = [
    hit.to_dict()
    for hit in tqdm(event_filter.scan(), total=event_filter.count())
]
    

In [None]:
# Number of hits collected by the scan.
len(gema_events)

In [37]:
# One flat dict per event, ready to become a DataFrame row.
flat_gema_events = list(map(flatten_dict, gema_events))

In [38]:
# Tabular view of the flattened events: one row per event, one column per flattened key.
df_gema = pd.DataFrame(data=flat_gema_events)

In [None]:
# Distinct city values returned by the query.
df_gema.gemaNo_address_city.unique()

In [40]:
# Drop rows whose city is "Freiburg (Elbe)".
_is_elbe_gema = df_gema.gemaNo_address_city.isin(["Freiburg (Elbe)"])
df_gema = df_gema[~_is_elbe_gema]

In [None]:
# Check that the postal codes of the rows whose city contains "Freiburg"
# belong to Freiburg im Breisgau.
df_gema.loc[df_gema.gemaNo_address_city.str.contains("Freiburg"), "gemaNo_address_postalCode"].unique()

In [None]:
# Make sure the date filter is correct: list the sorted distinct start days
# (first 10 characters of startDate).
sorted(df_gema.startDate.str[:10].unique().tolist())

In [None]:
# Row count after the city filter.
len(df_gema)

In [None]:
# Latitude values of the rows that have a latitude at all.
df_gema.dropna(subset="gemaNo_address_coordinate_lat").gemaNo_address_coordinate_lat

In [None]:
# Street and house number of the first rows that have a latitude.
df_gema.dropna(subset="gemaNo_address_coordinate_lat")[["gemaNo_address_street", "gemaNo_address_streetHouseNumber"]].head()

In [41]:
#df_gema.gemaNo_address_coordinate_lat.unique()

In [None]:
# Check postal codes: sorted distinct values of the remaining rows.
sorted(df_gema.gemaNo_address_postalCode.unique().tolist())

In [49]:
# Persist the frame; the timestamp in the file name keeps earlier exports from being overwritten.
df_gema.to_pickle(f"../data/events/gema/df_gema {datetime.now():%Y-%m-%d_%H-%M-%S}.pkl")

# Apify

In [None]:
# Scratch: os.walk returns a lazy generator, so this cell only displays the
# generator object and does not traverse any directory.
os.walk("Downloads")

In [None]:
# Show the current working directory (relative paths in this notebook resolve against it).
os.getcwd()

In [15]:
# Collect the full path of every *.json file below the Downloads folder,
# including all sub-folders.
json_files = []
for folder, _subfolders, filenames in os.walk('C:\\Users\\valentyna.sinichenko\\Downloads'):
    json_files.extend(
        os.path.join(folder, name) for name in filenames if name.endswith(".json")
    )


In [43]:
# Accumulators shared by the loading loops: the de-duplicated records and the
# source URLs that have already been seen.
all_json_data, known_source_urls = [], set()

In [None]:
# Load every discovered JSON file (read as UTF-8) and collect its event
# records, keeping only the first record seen for each source URL.
processed_files = []

for file_path in tqdm(json_files, total=len(json_files)):
    # Only reading and parsing are guarded here, so a failure clearly means
    # "this file could not be loaded".
    try:
        with open(file_path, "r", encoding="utf8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError and json.JSONDecodeError.
        print(f"{e=}")
        print(file_path.upper())
        continue

    # Event exports are non-empty lists of dict records; report anything else.
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        print(f"unexpected JSON layout ({type(data).__name__})")
        print(file_path.upper())
        continue

    # Skip files whose first record has an "apifyParsingAddressLogic" key or
    # has no "event" key. They are not added to processed_files.
    if "apifyParsingAddressLogic" in data[0] or "event" not in data[0]:
        continue

    skipped_records = 0
    for record in tqdm(data, total=len(data)):
        # A record without a usable source URL is skipped on its own, so it no
        # longer discards the remaining records of the same file.
        try:
            cur_source_url = record["event"]["sourceInformation"]["sourceUrl"]
            is_new = cur_source_url not in known_source_urls
        except (KeyError, TypeError):
            skipped_records += 1
            continue
        if is_new:
            all_json_data.append(record)
            known_source_urls.add(cur_source_url)

    if skipped_records:
        print(f"{skipped_records=} {file_path}")
    print(f"{len(all_json_data)=}")
    processed_files.append(file_path)
        
    

In [88]:
# Files that the loading loop did not record in processed_files (skipped or failed).
# Membership is tested against a set built once, so the check is O(1) per file.
_processed = set(processed_files)
not_processed_files = [file for file in json_files if file not in _processed]

In [None]:
# Inspect which files were not processed.
not_processed_files

In [90]:
# Ignore files whose path contains "address-parser".
not_processed_files = [path for path in not_processed_files if not "address-parser" in path]

In [91]:
# Ignore files whose path contains "Main".
not_processed_files = [path for path in not_processed_files if not "Main" in path]

In [92]:
# Ignore files whose path contains "google-maps".
not_processed_files = [path for path in not_processed_files if not "google-maps" in path]

In [None]:
# Second pass over the files that are still unprocessed. Unlike the first
# pass, the file is opened without an explicit encoding, i.e. with the
# platform default.
# NOTE(review): presumably a retry for files that failed to decode as UTF-8 — confirm.
for file_path in tqdm(not_processed_files, total=len(not_processed_files)):
    # Only reading and parsing are guarded here, so a failure clearly means
    # "this file could not be loaded".
    try:
        with open(file_path, "r") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError and json.JSONDecodeError.
        print(f"{e=}")
        print(file_path.upper())
        continue

    # Event exports are non-empty lists of dict records; report anything else.
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        print(f"unexpected JSON layout ({type(data).__name__})")
        print(file_path.upper())
        continue

    # Skip files whose first record has an "apifyParsingAddressLogic" key or
    # has no "event" key. They are not added to processed_files.
    if "apifyParsingAddressLogic" in data[0] or "event" not in data[0]:
        continue

    skipped_records = 0
    for record in tqdm(data, total=len(data)):
        # A record without a usable source URL is skipped on its own, so it no
        # longer discards the remaining records of the same file.
        try:
            cur_source_url = record["event"]["sourceInformation"]["sourceUrl"]
            is_new = cur_source_url not in known_source_urls
        except (KeyError, TypeError):
            skipped_records += 1
            continue
        if is_new:
            all_json_data.append(record)
            known_source_urls.add(cur_source_url)

    if skipped_records:
        print(f"{skipped_records=} {file_path}")
    print(f"{len(all_json_data)=}")
    processed_files.append(file_path)
        
    

In [None]:
# Files considered in the second pass (the list itself is not changed by that loop).
not_processed_files

## keep only relevant cities

In [None]:
# Peek at the city field of the first record.
all_json_data[0]["location"]["address"]["city"]

In [114]:
# Peek at the raw address field of the first record.
all_json_data[0]["location"]["address"]["rawAddress"]

In [128]:
# Scratch: dict.get on a missing key returns None instead of raising.
{}.get("smth")

In [None]:
# Keep the records located in Dresden or Freiburg, judged by the structured
# city field first and by the raw address string as a fallback.
all_relevant_location_data = []
cities = []

# The meslis and gema sections of this notebook remove the "Freiburg ... Elbe"
# rows and postal code 21729; the same exclusions are mirrored here so the
# substring test on "Freiburg" does not keep those records.
EXCLUDED_CITY_NAMES = ("Freiburg/Elbe", "Freiburg (Elbe)")
EXCLUDED_POSTAL_CODE = "21729"

for record in tqdm(all_json_data):
    try:
        address = record.get("location", {}).get("address", {})
        city = address.get("city", "")
        raw_address = address.get("rawAddress", "")
    except AttributeError as e:
        # A level that is present but not a dict (for example null) has no .get().
        print(e)
        continue

    if city in EXCLUDED_CITY_NAMES:
        continue
    if raw_address and (
        EXCLUDED_POSTAL_CODE in raw_address
        or any(name in raw_address for name in EXCLUDED_CITY_NAMES)
    ):
        continue

    if city and ("Dresden" in city or "Freiburg" in city):
        print(city)
        all_relevant_location_data.append(record)
        # cities only collects values that matched through the city field.
        cities.append(city)
    elif raw_address and ("Dresden" in raw_address or "Freiburg" in raw_address):
        print(raw_address)
        all_relevant_location_data.append(record)
            
    
    
    

In [None]:
# Distinct city values that matched through the city field.
set(cities)

In [None]:
# Number of records kept by the city filter.
len(all_relevant_location_data)

In [None]:
# Scratch: parse one record's start time and convert it to UTC.
pd.to_datetime(all_relevant_location_data[200].get("event").get("startDateTime")).tz_convert("UTC")

In [None]:
# Scratch: the same record's start time as parsed, without the UTC conversion.
pd.to_datetime(all_relevant_location_data[200].get("event").get("startDateTime"))

In [None]:

# Scratch: the raw startDateTime value of that record, before parsing.
print(all_relevant_location_data[200].get("event").get("startDateTime"))


In [95]:
# keep only Dresden and Freiburg

In [149]:
# Scratch: parsed start time of another record, kept in dt for inspection.
dt = pd.to_datetime(all_relevant_location_data[100].get("event").get("startDateTime"))

In [None]:
# Display the parsed value.
dt

In [None]:
# Check which type pd.to_datetime returned.
type(dt)

In [None]:
# Scratch: parse a bare date string, as done for the window bounds in the date filter.
pd.to_datetime("2024-01-01")

## keep only relevant dates

In [None]:
# Keep the records whose start time falls into one of two date windows.
all_relevant_location_and_date_data = []
# Window bounds as UTC-aware timestamps so they compare against the
# UTC-normalised event start times.
start_date_1 = pd.to_datetime("2024-01-01").tz_localize('UTC')
end_date_1 = pd.to_datetime("2024-04-01").tz_localize('UTC')
start_date_2 = pd.to_datetime("2024-09-01").tz_localize('UTC')
end_date_2 = pd.to_datetime("2024-11-01").tz_localize('UTC')
# NOTE(review): the Elasticsearch filters in this notebook start the second
# window at 2024-09-03, apply the first window to Dresden only and add a
# Freiburg-only window 2023-06-01..2023-08-01. None of that is mirrored
# here — confirm whether it should be.

for record in tqdm(all_relevant_location_data):
    raw_start_date = (record.get("event") or {}).get("startDateTime")
    if not raw_start_date:
        # Without a start time the record cannot be placed in any window.
        continue
    # utc=True converts offset-aware values to UTC and reads naive values as
    # UTC, instead of failing on naive values the way tz_convert does.
    event_start_date = pd.to_datetime(raw_start_date, utc=True)

    # Half-open windows [start, end), matching the gte/lt bounds of the
    # Elasticsearch range queries.
    in_window_1 = start_date_1 <= event_start_date < end_date_1
    in_window_2 = start_date_2 <= event_start_date < end_date_2
    if in_window_1 or in_window_2:
        all_relevant_location_and_date_data.append(record)
        
            
    
    
    

In [None]:
# Number of records kept by the date filter.
len(all_relevant_location_and_date_data)

In [165]:
# Write the filtered records to a JSON file in the working directory,
# pretty-printed with a 4-space indent.
_serialized_records = json.dumps(all_relevant_location_and_date_data, indent=4)
with open("apify original DD FB.json", 'w') as json_file:
    json_file.write(_serialized_records)