In [None]:
from elasticsearch_dsl import Search, Q, Range, connections
from elasticsearch.client import Elasticsearch
from tqdm.notebook import tqdm
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import re
import numpy as np
import ast

In [77]:
# Load the exported events table; the first CSV column is used as the index.
events_csv_path = "../data/events/df_events 2025-01-14_18-50-06.csv"
df = pd.read_csv(events_csv_path, index_col=0)

In [78]:
# Notebook display settings: show every column, at most 40 rows,
# and up to 400 characters per cell.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 40),
    ("display.max_colwidth", 400),
):
    pd.set_option(option_name, option_value)

In [None]:
# Number of events that have no latitude yet.
int(df["lat"].isna().sum())

In [None]:
# Distribution of the "Source" column among events without a latitude.
df.loc[df["lat"].isna(), "Source"].value_counts()

In [None]:
# Percentage of missing values per column, restricted to events without a latitude.
df_tmp = df.loc[df["lat"].isna()]

df_tmp.isna().mean().mul(100).sort_values(ascending=False)

In [None]:
# First 20 events without a latitude.
df.loc[df["lat"].isna()].head(20)

In [83]:
# Elasticsearch index name for the merged-locations lookups.
ES_INDEX = "merged_locations"

In [None]:
# Register the Elasticsearch connection under the alias the Search objects refer to.
# timeout=None is passed through unchanged.
connections.create_connection(
    alias="ProductionEnvironment",
    hosts="http://ha-proxy-elasticsearch:9200",
    timeout=None,
)

# Practice: a single request

In [115]:
# Search handles for the two Elasticsearch indices queried in this notebook.
# The merged-locations index name is taken from ES_INDEX so it is defined in one place.
merged_locations_search = Search(using="ProductionEnvironment", index=ES_INDEX)
nutzungsorte_search = Search(using="ProductionEnvironment", index="gema_nutzungsorte_v5")

In [86]:
# Example match clauses for one known venue (city, venue name, street).
city_query = Q("match", address__city="Dresden")
name_query = Q("match", name="Zwinger")
hn_query = Q("match", address__streetHouseNumber="Sophienstraße")



In [None]:
# Combine the three match clauses and count the matching documents.
# NOTE: the name `filter` shadows the builtin; it is kept because the next cell reads it.
combined_query = city_query & name_query & hn_query
filter = merged_locations_search.query(combined_query)
filter.count()

In [None]:
# Download every hit as a plain dict, with a progress bar sized by the hit count.
locs = [hit.to_dict() for hit in tqdm(filter.scan(), total=filter.count())]

In [89]:
def flatten_dict(d, parent_key='', sep='_'):
    """
    Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their parent key with ``sep``.
    List values are replaced by one string holding their items joined with
    ``', '``; every other value is copied unchanged. Nested dictionaries that
    are empty contribute no key.

    Parameters
    ----------
    d : dict
        The (possibly nested) dictionary to flatten.
    parent_key : str
        Prefix for every produced key; empty at the top level.
    sep : str
        Separator placed between a parent key and a child key.

    Returns
    -------
    dict
        The flattened mapping, in traversal order.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if isinstance(value, dict):
            # Descend and merge the already-prefixed child entries.
            flat.update(flatten_dict(value, path, sep=sep))
        elif isinstance(value, list):
            # Lists become a comma-separated string.
            flat[path] = ', '.join(str(item) for item in value)
        else:
            flat[path] = value
    return flat


In [90]:
# Flatten every hit so nested fields become top-level keys.
locs = list(map(flatten_dict, locs))

In [91]:
# Build a table from the flattened hits (one row per hit).
df_locs = pd.DataFrame(locs)

In [None]:
# Inspect which flattened fields are available.
df_locs.columns

In [93]:
# Discard hits that lack either address coordinate.
df_locs = df_locs.dropna(subset=["address_coordinate_lat", "address_coordinate_lon"])

In [None]:
# Show the address coordinates of the remaining hits.
df_locs.loc[:, ["address_coordinate_lat", "address_coordinate_lon"]]

In [95]:
def has_six_or_more_decimals(x):
    """
    Report whether any float in ``x`` is written with six or more decimal places.

    Parameters
    ----------
    x : object exposing ``.values``
        Typically a DataFrame row as passed by ``DataFrame.apply(..., axis=1)``.
        Entries that are not floats are ignored.

    Returns
    -------
    bool
        True if at least one finite float has six or more digits after the
        decimal point in its string representation, otherwise False.

    Notes
    -----
    NaN and infinite values are skipped, and floats printed in scientific
    notation (e.g. ``1e-07``) are counted correctly. Splitting the string on
    ``'.'`` raised ``IndexError`` for those values because their text has no
    fractional part after a dot.
    """
    # Function-scope imports keep this definition self-contained.
    import math
    from decimal import Decimal

    def decimal_places(val):
        # Decimal parses plain and exponent notation alike; a negative exponent
        # is the number of digits after the decimal point.
        return max(0, -Decimal(str(val)).as_tuple().exponent)

    return any(
        isinstance(val, float) and math.isfinite(val) and decimal_places(val) >= 6
        for val in x.values
    )


In [96]:
# Keep only hits with precise coordinates. The check is limited to the two
# coordinate columns so unrelated float fields of a hit (or NaN in them)
# cannot influence the result.
coordinate_cols = ["address_coordinate_lat", "address_coordinate_lon"]
df_locs = df_locs[df_locs[coordinate_cols].apply(has_six_or_more_decimals, axis=1)]

In [None]:
# Show the address coordinates that passed the precision filter.
df_locs.loc[:, ["address_coordinate_lat", "address_coordinate_lon"]]

In [98]:
# Mean position over the remaining hits (column order: latitude, longitude).
coordinate_means = df_locs[["address_coordinate_lat", "address_coordinate_lon"]].mean()
avg_lat, avg_lon = coordinate_means.tolist()

In [None]:
# Show the mean latitude computed above.
avg_lat

# Loop to extract coordinates

### 1 — `merged_locations`: match on city, location name and street

In [100]:
# Re-create the search handle for the merged-locations index.
# The index name is taken from ES_INDEX instead of repeating the literal.
merged_locations_search = Search(using="ProductionEnvironment", index=ES_INDEX)

In [None]:
# Events without coordinates that have both a street and a location name are
# looked up via `merged_locations_search`, matching on city, name and street.
flt = df.lat.isna() & df.StreetHouseNumber.notna() & df.LocationName.notna()
print(len(df[flt]))
for i, row in df[flt].iterrows():
    city = row["City"]
    name = row["LocationName"]
    street_hn = row["StreetHouseNumber"]
    city_query = Q("match", address__city=city)
    name_query = Q("match", name=name)
    street_hn_query = Q("match", address__streetHouseNumber=street_hn)
    # Not called `filter`, so the builtin of that name stays usable.
    location_search = merged_locations_search.query(city_query & name_query & street_hn_query)

    # scan() alone is enough: zero hits give an empty list, so no separate count() request.
    locs = [flatten_dict(hit.to_dict()) for hit in location_search.scan()]
    if not locs:
        print("Location not found")
        continue
    print(len(locs))

    df_locs = pd.DataFrame(locs)
    # Prefer the top-level coordinate pair and fall back to the address pair.
    # The fallback is only required when the preferred pair is missing, so hits
    # with `coordinate_*` but no `address_coordinate_*` columns are not skipped.
    available_cols = set(df_locs.columns)
    if {"coordinate_lat", "coordinate_lon"} <= available_cols:
        coordinate_cols = ["coordinate_lat", "coordinate_lon"]
    elif {"address_coordinate_lat", "address_coordinate_lon"} <= available_cols:
        coordinate_cols = ["address_coordinate_lat", "address_coordinate_lon"]
    else:
        print("No coordinate columns found")
        continue

    df_locs = df_locs.dropna(subset=coordinate_cols)
    if df_locs.empty:
        print("No coordinates found")
        continue

    # Keep only hits whose coordinates pass the decimal-places check.
    df_locs = df_locs[coordinate_cols]
    df_locs = df_locs[df_locs.apply(has_six_or_more_decimals, axis=1)]
    if df_locs.empty:
        print("No precise coordinates")
        continue

    # Several hits may remain; their mean position is stored for the event.
    avg_lat, avg_lon = df_locs.mean()
    df.at[i, "lat"] = avg_lat
    df.at[i, "lon"] = avg_lon


In [None]:
# Number of events still without a latitude.
int(df["lat"].isna().sum())

In [None]:
# Percentage of missing values per column among events without a latitude.
df.loc[df["lat"].isna()].isna().mean().mul(100).sort_values(ascending=False)

In [None]:
# Events that still lack a latitude although street and location name are present.
df.loc[df["lat"].isna() & df["StreetHouseNumber"].notna() & df["LocationName"].notna()]

### 2 — `gema_nutzungsorte_v5`: match on city, location name and street

In [None]:
# Events without coordinates that have both a street and a location name are
# looked up via `nutzungsorte_search`, matching on city, name and street.
flt = df.lat.isna() & df.StreetHouseNumber.notna() & df.LocationName.notna()
print(len(df[flt]))
for i, row in df[flt].iterrows():
    city = row["City"]
    name = row["LocationName"]
    street_hn = row["StreetHouseNumber"]
    city_query = Q("match", address__city=city)
    name_query = Q("match", name=name)
    street_hn_query = Q("match", address__streetHouseNumber=street_hn)
    # Not called `filter`, so the builtin of that name stays usable.
    location_search = nutzungsorte_search.query(city_query & name_query & street_hn_query)

    # scan() alone is enough: zero hits give an empty list, so no separate count() request.
    locs = [flatten_dict(hit.to_dict()) for hit in location_search.scan()]
    if not locs:
        print("Location not found")
        continue
    print(len(locs))

    df_locs = pd.DataFrame(locs)
    # Prefer the top-level coordinate pair and fall back to the address pair.
    # The fallback is only required when the preferred pair is missing, so hits
    # with `coordinate_*` but no `address_coordinate_*` columns are not skipped.
    available_cols = set(df_locs.columns)
    if {"coordinate_lat", "coordinate_lon"} <= available_cols:
        coordinate_cols = ["coordinate_lat", "coordinate_lon"]
    elif {"address_coordinate_lat", "address_coordinate_lon"} <= available_cols:
        coordinate_cols = ["address_coordinate_lat", "address_coordinate_lon"]
    else:
        print("No coordinate columns found")
        continue

    df_locs = df_locs.dropna(subset=coordinate_cols)
    if df_locs.empty:
        print("No coordinates found")
        continue

    # Keep only hits whose coordinates pass the decimal-places check.
    df_locs = df_locs[coordinate_cols]
    df_locs = df_locs[df_locs.apply(has_six_or_more_decimals, axis=1)]
    if df_locs.empty:
        print("No precise coordinates")
        continue

    # Several hits may remain; their mean position is stored for the event.
    avg_lat, avg_lon = df_locs.mean()
    df.at[i, "lat"] = avg_lat
    df.at[i, "lon"] = avg_lon


In [None]:
# Number of events still without a latitude.
int(df["lat"].isna().sum())

### 3 — `gema_nutzungsorte_v5`: match on city and location name only

In [None]:
# Events that still have no latitude.
df.loc[df["lat"].isna()]

In [None]:
# Events without coordinates that have a location name are looked up via
# `nutzungsorte_search`, matching on city and name only.
flt = df.lat.isna() & df.LocationName.notna()
print(len(df[flt]))
for i, row in df[flt].iterrows():
    city = row["City"]
    name = row["LocationName"]
    city_query = Q("match", address__city=city)
    name_query = Q("match", name=name)
    # Not called `filter`, so the builtin of that name stays usable.
    location_search = nutzungsorte_search.query(city_query & name_query)

    # scan() alone is enough: zero hits give an empty list, so no separate count() request.
    locs = [flatten_dict(hit.to_dict()) for hit in location_search.scan()]
    if not locs:
        print(f"Location {name} not found")
        continue
    print(len(locs))

    df_locs = pd.DataFrame(locs)
    # Prefer the top-level coordinate pair and fall back to the address pair.
    # The fallback is only required when the preferred pair is missing, so hits
    # with `coordinate_*` but no `address_coordinate_*` columns are not skipped.
    available_cols = set(df_locs.columns)
    if {"coordinate_lat", "coordinate_lon"} <= available_cols:
        coordinate_cols = ["coordinate_lat", "coordinate_lon"]
    elif {"address_coordinate_lat", "address_coordinate_lon"} <= available_cols:
        coordinate_cols = ["address_coordinate_lat", "address_coordinate_lon"]
    else:
        print("No coordinate columns found")
        continue

    df_locs = df_locs.dropna(subset=coordinate_cols)
    if df_locs.empty:
        print("No coordinates found")
        continue

    # Keep only hits whose coordinates pass the decimal-places check.
    df_locs = df_locs[coordinate_cols]
    df_locs = df_locs[df_locs.apply(has_six_or_more_decimals, axis=1)]
    if df_locs.empty:
        print("No precise coordinates")
        continue

    # Several hits may remain; their mean position is stored for the event.
    avg_lat, avg_lon = df_locs.mean()
    df.at[i, "lat"] = avg_lat
    df.at[i, "lon"] = avg_lon
    print("Success")


In [None]:
# Number of events still without a latitude.
int(df["lat"].isna().sum())

In [None]:
# Events that still have no latitude.
df.loc[df["lat"].isna()]

### 4 — `merged_locations`: match on city and location name only

In [None]:
# Events without coordinates that have a location name are looked up via
# `merged_locations_search`, matching on city and name only.
flt = df.lat.isna() & df.LocationName.notna()
print(len(df[flt]))
for i, row in df[flt].iterrows():
    city = row["City"]
    name = row["LocationName"]
    city_query = Q("match", address__city=city)
    name_query = Q("match", name=name)
    # Not called `filter`, so the builtin of that name stays usable.
    location_search = merged_locations_search.query(city_query & name_query)

    # scan() alone is enough: zero hits give an empty list, so no separate count() request.
    locs = [flatten_dict(hit.to_dict()) for hit in location_search.scan()]
    if not locs:
        print(f"Location {name} not found")
        continue
    print(len(locs))

    df_locs = pd.DataFrame(locs)
    # Prefer the top-level coordinate pair and fall back to the address pair.
    # The fallback is only required when the preferred pair is missing, so hits
    # with `coordinate_*` but no `address_coordinate_*` columns are not skipped.
    available_cols = set(df_locs.columns)
    if {"coordinate_lat", "coordinate_lon"} <= available_cols:
        coordinate_cols = ["coordinate_lat", "coordinate_lon"]
    elif {"address_coordinate_lat", "address_coordinate_lon"} <= available_cols:
        coordinate_cols = ["address_coordinate_lat", "address_coordinate_lon"]
    else:
        print("No coordinate columns found")
        continue

    df_locs = df_locs.dropna(subset=coordinate_cols)
    if df_locs.empty:
        print("No coordinates found")
        continue

    # Keep only hits whose coordinates pass the decimal-places check.
    df_locs = df_locs[coordinate_cols]
    df_locs = df_locs[df_locs.apply(has_six_or_more_decimals, axis=1)]
    if df_locs.empty:
        print("No precise coordinates")
        continue

    # Several hits may remain; their mean position is stored for the event.
    avg_lat, avg_lon = df_locs.mean()
    df.at[i, "lat"] = avg_lat
    df.at[i, "lon"] = avg_lon
    print("Success")


In [None]:
# Number of events still without a latitude.
int(df["lat"].isna().sum())

In [None]:
# Events that still have no latitude.
df.loc[df["lat"].isna()]

### 5 — Clean up street data, match on city and street, then set the remaining coordinates manually

In [132]:
# Remove every literal occurrence of "Veranstaltungsort" followed by a newline
# from the street column (plain-text replacement, no regex).
df["StreetHouseNumber"] = df["StreetHouseNumber"].str.replace("Veranstaltungsort\n", "", regex=False)

In [137]:
# Remove every literal occurrence of "Veranstaltungsort" followed by a newline
# from the location-details column (plain-text replacement, no regex).
df["LocationDetails"] = df["LocationDetails"].str.replace("Veranstaltungsort\n", "", regex=False)

In [None]:
# Distinct location details of events that have neither a latitude nor a street.
flt = df["lat"].isna() & df["StreetHouseNumber"].isna() & df["LocationDetails"].notna()
df.loc[flt, "LocationDetails"].drop_duplicates()

In [140]:
# Where street and latitude are both missing, use the location details as the street value.
flt = df["lat"].isna() & df["StreetHouseNumber"].isna() & df["LocationDetails"].notna()
df.loc[flt, "StreetHouseNumber"] = df.loc[flt, "LocationDetails"]

In [None]:
# Events without coordinates that have a street value are looked up via
# `nutzungsorte_search`, matching on city and street only.
flt = df.lat.isna() & df.StreetHouseNumber.notna()
print(len(df[flt]))
for i, row in df[flt].iterrows():
    city = row["City"]
    street_hn = row["StreetHouseNumber"]
    city_query = Q("match", address__city=city)
    street_hn_query = Q("match", address__streetHouseNumber=street_hn)
    # Not called `filter`, so the builtin of that name stays usable.
    location_search = nutzungsorte_search.query(city_query & street_hn_query)

    # scan() alone is enough: zero hits give an empty list, so no separate count() request.
    locs = [flatten_dict(hit.to_dict()) for hit in location_search.scan()]
    if not locs:
        print("Location not found")
        continue
    print(len(locs))

    df_locs = pd.DataFrame(locs)
    # Prefer the top-level coordinate pair and fall back to the address pair.
    # The fallback is only required when the preferred pair is missing, so hits
    # with `coordinate_*` but no `address_coordinate_*` columns are not skipped.
    available_cols = set(df_locs.columns)
    if {"coordinate_lat", "coordinate_lon"} <= available_cols:
        coordinate_cols = ["coordinate_lat", "coordinate_lon"]
    elif {"address_coordinate_lat", "address_coordinate_lon"} <= available_cols:
        coordinate_cols = ["address_coordinate_lat", "address_coordinate_lon"]
    else:
        print("No coordinate columns found")
        continue

    df_locs = df_locs.dropna(subset=coordinate_cols)
    if df_locs.empty:
        print("No coordinates found")
        continue

    # Keep only hits whose coordinates pass the decimal-places check.
    df_locs = df_locs[coordinate_cols]
    df_locs = df_locs[df_locs.apply(has_six_or_more_decimals, axis=1)]
    if df_locs.empty:
        print("No precise coordinates")
        continue

    # Several hits may remain; their mean position is stored for the event.
    avg_lat, avg_lon = df_locs.mean()
    df.at[i, "lat"] = avg_lat
    df.at[i, "lon"] = avg_lon


In [None]:
# Events without coordinates that have a street value are looked up via
# `merged_locations_search`, matching on city and street only.
flt = df.lat.isna() & df.StreetHouseNumber.notna()
print(len(df[flt]))
for i, row in df[flt].iterrows():
    city = row["City"]
    street_hn = row["StreetHouseNumber"]
    city_query = Q("match", address__city=city)
    street_hn_query = Q("match", address__streetHouseNumber=street_hn)
    # Not called `filter`, so the builtin of that name stays usable.
    location_search = merged_locations_search.query(city_query & street_hn_query)

    # scan() alone is enough: zero hits give an empty list, so no separate count() request.
    locs = [flatten_dict(hit.to_dict()) for hit in location_search.scan()]
    if not locs:
        print("Location not found")
        continue
    print(len(locs))

    df_locs = pd.DataFrame(locs)
    # Prefer the top-level coordinate pair and fall back to the address pair.
    # The fallback is only required when the preferred pair is missing, so hits
    # with `coordinate_*` but no `address_coordinate_*` columns are not skipped.
    available_cols = set(df_locs.columns)
    if {"coordinate_lat", "coordinate_lon"} <= available_cols:
        coordinate_cols = ["coordinate_lat", "coordinate_lon"]
    elif {"address_coordinate_lat", "address_coordinate_lon"} <= available_cols:
        coordinate_cols = ["address_coordinate_lat", "address_coordinate_lon"]
    else:
        print("No coordinate columns found")
        continue

    df_locs = df_locs.dropna(subset=coordinate_cols)
    if df_locs.empty:
        print("No coordinates found")
        continue

    # Keep only hits whose coordinates pass the decimal-places check.
    df_locs = df_locs[coordinate_cols]
    df_locs = df_locs[df_locs.apply(has_six_or_more_decimals, axis=1)]
    if df_locs.empty:
        print("No precise coordinates")
        continue

    # Several hits may remain; their mean position is stored for the event.
    avg_lat, avg_lon = df_locs.mean()
    df.at[i, "lat"] = avg_lat
    df.at[i, "lon"] = avg_lon


In [None]:
# Number of events still without a latitude.
int(df["lat"].isna().sum())

In [None]:
# Events that still have no latitude.
df.loc[df["lat"].isna()]

In [None]:
# Manual override for rows with origin_id 10430.
# NOTE(review): the source of these coordinates is not recorded in the notebook.
flt = df["origin_id"] == 10430
df.loc[flt, "lat"] = 47.9863569125
df.loc[flt, "lon"] = 7.872496387500001

In [157]:
# Manual override for rows with origin_id 10468.
# NOTE(review): the source of these coordinates is not recorded in the notebook.
flt = df["origin_id"] == 10468
df.loc[flt, "lat"] = 48.0126038
df.loc[flt, "lon"] = 7.8136792

In [148]:
# Manual override for every row whose location name is exactly "Hofewiese".
# NOTE(review): the source of these coordinates is not recorded in the notebook.
flt = df["LocationName"] == "Hofewiese"
df.loc[flt, "lat"] = 51.1099121
df.loc[flt, "lon"] = 13.8321601

In [None]:
# Manual override for rows in Freiburg whose location details are "Freigelände Messe".
# NOTE(review): the source of these coordinates is not recorded in the notebook.
flt = (df["LocationDetails"] == "Freigelände Messe") & (df["City"] == "Freiburg")
df.loc[flt, "lat"] = 48.0157012
df.loc[flt, "lon"] = 7.8399441

In [None]:
# Summary statistics of all coordinates as a plausibility check.
df.loc[:, ["lat", "lon"]].describe()

In [None]:
# Number of events per city.
df["City"].value_counts()

In [None]:
# Print coordinate summary statistics city by city, to spot values outside a city's range.
for city in df["City"].unique():
    print(city)
    city_coordinates = df.loc[df["City"] == city, ["lat", "lon"]]
    print(city_coordinates.describe())

# Save

In [163]:
# Timestamp for the output file name. The variable keeps the name `time`
# because the following cells read it.
time = format(datetime.now(), '%Y-%m-%d_%H-%M-%S')

In [None]:
# Show the timestamp string.
time

In [165]:
# Write the completed table to a timestamped CSV file.
# The file name is kept exactly as is because a later cell reads a file with this naming.
output_csv_path = f"../data/events/df_events will all coordinates {time}.csv"
df.to_csv(output_csv_path)

In [2]:
# switch to another environment
import pandas as pd


In [3]:
# Reload one specific saved export; the first CSV column is used as the index.
coordinates_csv_path = "../data/events/df_events will all coordinates 2025-01-16_18-03-13.csv"
df = pd.read_csv(coordinates_csv_path, index_col=0)

In [4]:
# Store the same table as a pickle file next to the CSV.
coordinates_pickle_path = "../data/events/df_events will all coordinates 2025-01-16_18-03-13.pkl"
df.to_pickle(coordinates_pickle_path)