# German Cities

### Import Packages

In [1]:
import pandas as pd
import sqlalchemy
import os
from dotenv import load_dotenv, find_dotenv
from functools import wraps
import datetime as dt

### Load variables from .env file

In [2]:
# Load environment variables from the `.env` file located by find_dotenv.
# send_to_DB later reads its DB_* connection settings from os.environ.
load_dotenv(find_dotenv(filename='.env'))

True

### Logging Wrapper

In [3]:
def log_step(func):
    """Decorator that reports the result shape and run time of a pipeline step.

    Each call to the decorated function prints the function's name, the
    ``.shape`` of the value it returned and the elapsed wall-clock time, then
    hands the returned value back unchanged.

    Parameters
    ----------
    func : callable
        Step to wrap. Its return value must expose a ``.shape`` attribute,
        because the log line reads it.

    Returns
    -------
    callable
        Wrapped step carrying the metadata of ``func`` (via ``functools.wraps``).
    """
    @wraps(func)
    def timed_step(*args, **kwargs):
        started_at = dt.datetime.now()
        outcome = func(*args, **kwargs)
        # The timedelta is stringified (H:MM:SS.ffffff) before being formatted.
        elapsed = str(dt.datetime.now() - started_at)
        print(f"{func.__name__}:\n shape={outcome.shape} took {elapsed}s\n")
        return outcome

    return timed_step

### Get Cities from CSV

In [4]:
# City list with id, name, state, country and coordinates (file name suggests
# OpenWeatherMap data -- TODO confirm provenance).
all_qwm_cities = pd.read_csv("../data/csv/owm_city_data.csv")
# World cities reference table; add_columns reads population figures from it.
# It is read once here (the previous duplicate read of the same file was redundant).
world_cities = pd.read_csv("../data/csv/worldcities.csv")
all_qwm_cities.head(3)

  all_qwm_cities = pd.read_csv("../data/csv/owm_city_data.csv")


Unnamed: 0,id,name,state,country,coord.lon,coord.lat
0,833.0,Ḩeşār-e Sefīd,,IR,47.159401,34.330502
1,2960.0,‘Ayn Ḩalāqīm,,SY,36.321911,34.940079
2,3245.0,Taglag,,IR,44.98333,38.450001


## Data Cleaning Pipeline

### Init Pipeline

In [5]:
@log_step
def init_pipeline(df):
    """Return a copy of ``df`` to serve as the starting frame of the pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.copy()``.
    """
    working_copy = df.copy()
    return working_copy

### Rename Columns

In [6]:
@log_step
def rename_columns(df):
    """Rename the raw city columns to their ``city_``-prefixed names.

    Columns not listed in the mapping are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame using the raw column names (``id``, ``name``, ``state``,
        ``country``, ``coord.lon``, ``coord.lat``).

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    column_map = {
        "id": "city_id",
        "name": "city_name",
        "state": "city_state",
        "country": "city_country",
        "coord.lon": "city_longitude",
        "coord.lat": "city_latitude",
    }
    return df.rename(columns=column_map)

### Drop Columns

In [7]:
@log_step
def drop_columns(df):
    """Remove the ``city_state`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``city_state`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``city_state``.

    Raises
    ------
    KeyError
        If ``city_state`` is not present in ``df``.
    """
    unwanted = ["city_state"]
    return df.drop(columns=unwanted)

### Add Columns

In [8]:
@log_step
def add_columns(df, population_source=None):
    """Attach a ``municipality_country`` key and a ``city_pop`` population column.

    A ``"<name>,<country>"`` key is built on both frames and used to left-join
    the population figures onto ``df``. Rows that contain any missing value
    after the join (including cities without a population match) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city_name`` and ``city_country`` columns.
    population_source : pandas.DataFrame, optional
        Frame with ``city_ascii``, ``iso2`` and ``population`` columns. When
        omitted, the notebook-level ``world_cities`` frame is used, which keeps
        ``df.pipe(add_columns)`` working as before.

    Returns
    -------
    pandas.DataFrame
        New frame with ``municipality_country`` and ``city_pop`` added and a
        fresh 0..n-1 index. A key that occurs several times in the population
        frame yields one output row per occurrence.
    """
    if population_source is None:
        # Fall back to the frame loaded earlier in the notebook.
        population_source = world_cities

    city_population = (
        population_source
        .assign(municipality_country=lambda x: x["city_ascii"] + "," + x["iso2"])
        [["municipality_country", "population"]]
        .rename(columns={"population": "city_pop"})
    )
    return (
        df
        .assign(municipality_country=lambda x: x["city_name"] + "," + x["city_country"])
        # Name the join key explicitly instead of relying on the shared-column default.
        .merge(city_population, how="left", on="municipality_country")
        .dropna()
        .reset_index(drop=True)
    )

### Drop Duplicates

In [9]:
@log_step
def drop_duplicates(df):
    """Keep only the first row for each ``municipality_country`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``municipality_country`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without the repeated keys; the surviving rows keep their
        original index labels.
    """
    is_repeat = df.duplicated(subset="municipality_country")
    return df[~is_repeat]

### Get German Cities

In [10]:
@log_step
def get_german_cities(df):
    """Select the rows whose ``city_country`` equals ``"DE"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``city_country`` column.

    Returns
    -------
    pandas.DataFrame
        Matching rows with a fresh 0..n-1 index.
    """
    is_german = df["city_country"] == "DE"
    return df.loc[is_german].reset_index(drop=True)

### Adjust Datatypes

In [11]:

def adjust_datatypes(df):
    """Cast the city columns to their final dtypes and sort by population.

    The casts are applied to a new frame, so the frame passed in is left
    untouched (the previous column-by-column assignment modified the caller's
    DataFrame as a side effect).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city_id``, ``city_name``, ``city_country``,
        ``city_longitude``, ``city_latitude``, ``municipality_country`` and
        ``city_pop`` columns, none of which may hold missing ids/populations.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``city_pop`` in descending order with a fresh
        0..n-1 index.
    """
    return (
        df
        # Go through int64 first so ids render as "2950159" rather than "2950159.0".
        .assign(city_id=lambda x: x["city_id"].astype("int64"))
        .astype({
            "city_id": "string",
            "city_name": "string",
            "city_country": "string",
            "city_longitude": "float32",
            "city_latitude": "float32",
            "municipality_country": "string",
            "city_pop": "int",
        })
        .sort_values("city_pop", ascending=False)
        .reset_index(drop=True)
    )

### Send to DB

In [12]:
def send_to_DB(df, table_name, if_exists="replace"):
    """Write ``df`` to a MySQL table using connection settings from the environment.

    The connection URL is assembled with ``sqlalchemy.engine.URL.create``
    instead of string formatting, so credentials containing characters such
    as ``@``, ``:`` or ``/`` are escaped correctly rather than corrupting the URL.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store.
    table_name : str
        Name of the target table.
    if_exists : str, default "replace"
        Passed through to ``DataFrame.to_sql``.

    Returns
    -------
    The value returned by ``DataFrame.to_sql``.

    Raises
    ------
    KeyError
        If any of ``DB_USER``, ``DB_PASSWORD``, ``DB_HOST``, ``DB_PORT`` or
        ``DB_SCHEMA`` is missing from the environment.
    ValueError
        If ``DB_PORT`` is not an integer.
    """
    url = sqlalchemy.engine.URL.create(
        drivername="mysql+pymysql",
        username=os.environ["DB_USER"],
        password=os.environ["DB_PASSWORD"],
        host=os.environ["DB_HOST"],
        port=int(os.environ["DB_PORT"]),
        database=os.environ["DB_SCHEMA"],
    )
    engine = sqlalchemy.create_engine(url)
    try:
        return df.to_sql(table_name, con=engine, if_exists=if_exists, index=False)
    finally:
        # Release pooled connections once the write has finished or failed.
        engine.dispose()


## RUN Pipeline

In [13]:
# Run every cleaning step in order on the raw city frame; the decorated steps
# print their result shape and duration as they run.
german_cities = (
    all_qwm_cities
        .pipe(init_pipeline)
        .pipe(rename_columns)
        .pipe(drop_columns)
        .pipe(add_columns) # adds population; drops cities with no population data
        .pipe(drop_duplicates)
        .pipe(get_german_cities)
        .pipe(adjust_datatypes)
        # The result is sent to the DB in the cell below.
)
german_cities.info()

init_pipeline:
 shape=(209579, 6) took 0:00:00.009449s

rename_columns:
 shape=(209579, 6) took 0:00:00.011725s

drop_columns:
 shape=(209579, 5) took 0:00:00.004909s

add_columns:
 shape=(54793, 7) took 0:00:00.433742s

drop_duplicates:
 shape=(26649, 7) took 0:00:00.012868s

get_german_cities:
 shape=(2127, 7) took 0:00:00.002986s

adjust_datatypes:
 shape=(2127, 7) took 0:00:00.007297s

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   city_id               2127 non-null   string 
 1   city_name             2127 non-null   string 
 2   city_country          2127 non-null   string 
 3   city_longitude        2127 non-null   float32
 4   city_latitude         2127 non-null   float32
 5   municipality_country  2127 non-null   string 
 6   city_pop              2127 non-null   int64  
dtypes: float32(2), int64(1), string(4)
me

In [14]:
# Store the cleaned German cities in the `cities` table; if_exists="replace" is
# forwarded to DataFrame.to_sql.
send_to_DB(df=german_cities, table_name="cities", if_exists="replace")

2127