In [1]:
import getpass
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

# Name of the MySQL schema this notebook connects to.
DB_NAME = "final_proj_new"

# Prompt for the root password at run time so it is never saved in the notebook.
password = getpass.getpass(prompt="MySQL root password: ")

MySQL root password:  ········


In [6]:
# Datasets to load, each as a (CSV path, destination table name) tuple.
datasets = [
    ("../data/clean/election_2017_clean.csv", "election_2017"),
    ("../data/clean/election_2021_clean.csv", "election_2021"),
    ("../data/clean/election_2025_clean.csv", "election_2025"),
    ("../data/raw/Rent_price/rent_growth_berlin_2016-17.csv", "rent_growth_2017"),
    ("../data/raw/Rent_price/rent_growth_berlin_2020-21.csv", "rent_growth_2021"),
    ("../data/raw/Rent_price/rent_growth_berlin_2024-25.csv", "rent_growth_2025"),
    ("../data/raw/Economy/Gini.csv", "Gini_Index"),
    ("../data/raw/Economy/poverty.csv", "Poverty_risk_rates"),
    ("../data/raw/Economy/unemployment.csv", "Unemployment")
]


def _clean_column_names(columns):
    """Return SQL-friendly, unique, lower-case names for the given column labels.

    Each label is stripped, newlines become spaces, every run of characters
    other than ASCII letters, digits and underscore is collapsed to a single
    underscore, and the result is truncated to 50 characters and lower-cased.
    A label that ends up empty is named ``col_<position>`` (1-based).

    Different labels can collapse to the same cleaned name (for example two
    headers that differ only in punctuation, or only after the 50th
    character), so repeated names get a numeric suffix (``_2``, ``_3``, ...)
    to keep every name in the returned list distinct.

    Parameters
    ----------
    columns : iterable
        Original column labels; each one is converted with ``str()`` first.

    Returns
    -------
    list of str
        One cleaned name per input label, in the same order.
    """
    cleaned = []
    used = set()
    for position, label in enumerate(columns, start=1):
        name = str(label).strip().replace("\n", " ")
        name = re.sub(r"[^0-9a-zA-Z_]+", "_", name)
        name = name[:50].lower() or f"col_{position}"

        # Resolve collisions introduced by the substitution/truncation above.
        candidate = name
        suffix = 1
        while candidate in used:
            suffix += 1
            candidate = f"{name}_{suffix}"

        used.add(candidate)
        cleaned.append(candidate)
    return cleaned


# Create connection engine.
# The password is percent-encoded so characters such as "@", "/" or ":" cannot
# be misread as URL delimiters when SQLAlchemy parses the connection string.
engine = create_engine(
    f"mysql+pymysql://root:{quote(password, safe='')}@localhost/{DB_NAME}"
)

# Loop over all datasets and import them
for csv_path, table_name in datasets:
    print(f"Importing {csv_path} → {table_name}")
    # NOTE(review): latin1 decodes any byte sequence without error, so a UTF-8
    # file would load with garbled non-ASCII text — confirm the files' encoding.
    df = pd.read_csv(csv_path, encoding="latin1")

    # Clean column names
    df.columns = _clean_column_names(df.columns)

    # Clean data values: turn +/- infinity into NaN before writing to the database.
    df = df.replace([np.inf, -np.inf], np.nan)

    # The table is stored under the lower-cased name and replaced if it already exists.
    df.to_sql(table_name.lower(), con=engine, if_exists="replace", index=False)
    print(f"Table '{table_name}' created successfully!\n")

print("All datasets imported successfully")


Importing ../data/clean/election_2017_clean.csv → election_2017
Table 'election_2017' created successfully!

Importing ../data/clean/election_2021_clean.csv → election_2021
Table 'election_2021' created successfully!

Importing ../data/clean/election_2025_clean.csv → election_2025
Table 'election_2025' created successfully!

Importing ../data/raw/Rent_price/rent_growth_berlin_2016-17.csv → rent_growth_2017
Table 'rent_growth_2017' created successfully!

Importing ../data/raw/Rent_price/rent_growth_berlin_2020-21.csv → rent_growth_2021
Table 'rent_growth_2021' created successfully!

Importing ../data/raw/Rent_price/rent_growth_berlin_2024-25.csv → rent_growth_2025
Table 'rent_growth_2025' created successfully!

Importing ../data/raw/Economy/Gini.csv → Gini_Index
Table 'Gini_Index' created successfully!

Importing ../data/raw/Economy/poverty.csv → Poverty_risk_rates
Table 'Poverty_risk_rates' created successfully!

Importing ../data/raw/Economy/unemployment.csv → Unemployment
Table 'Unem

In [7]:
# Pull each merged per-year table from the database into its own DataFrame.
# NOTE(review): no visible cell creates the merged_* tables — presumably they are
# built in MySQL outside this notebook; confirm before relying on Restart & Run All.
merged_2017 = pd.read_sql(sql="SELECT * FROM merged_2017;", con=engine)
merged_2021 = pd.read_sql(sql="SELECT * FROM merged_2021;", con=engine)
merged_2025 = pd.read_sql(sql="SELECT * FROM merged_2025;", con=engine)

In [10]:
# Report the (rows, columns) dimensions of the 2017 merged table.
print(merged_2017.shape)
# Preview up to the first ten rows; as the cell's last expression it is displayed as a table.
merged_2017.head(n=10)


(8, 8)


Unnamed: 0,district,all_market_median_m_month_,change_compared_to_last_yr_in_all_segments_,gini_2017,poverty_2017,afd_prc,die_linke_prc,year
0,Mitte,10.89,3.7,0.32,25.6,7.9,20.45,2017
1,Pankow,9.84,7.8,0.26,6.8,11.55,28.33,2017
2,Spandau,7.29,7.9,0.29,24.1,13.57,7.23,2017
3,Steglitz-Zehlendorf,9.8,6.2,0.32,11.1,8.16,7.51,2017
4,Neukoelln,9.47,17.1,0.27,26.2,10.7,16.44,2017
5,Treptow-Koepenick,8.38,6.6,0.27,12.7,14.98,39.87,2017
6,Lichtenberg,8.97,4.5,0.24,17.3,15.72,34.85,2017
7,Reinickendorf,7.9,5.3,0.31,15.5,13.15,7.7,2017
