This notebook is transform from "data_clean.py"

In [33]:
#All functions were transfered from the notebook "Nicha-data-preproc",
##Don't forget to drop high-corr-features!!!
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

def get_full_data(filepath, filename):
    df = gpd.read_file(f"{filepath}/{filename}")
    return df

def clean_data(df):
    unused_column = "Unnamed: 0"
    if unused_column in df.keys():
        df = df.drop(axis=1, columns=["Unnamed: 0"])
    return df

def change_age_bin(df):
    """Change the bin of Population-Age (E)"""
    elist = ['E_EU1', 'E_E1U6', 'E_E6U15', 'E_E15U18',
                'E_E18U25','E_E25U55','E_E55U65','E_E65U80', 'E_E80U110']
    for e in elist:
        df[e] = df[e] * df["E_E"]
    df['E_U18'] = df['E_EU1'] + df['E_E1U6'] + df['E_E6U15'] + df['E_E15U18']
    df['E_E25U65'] = df['E_E25U55'] + df['E_E55U65']
    df['E_E65U110'] =  df['E_E65U80'] + df['E_E80U110']
    df.drop(columns=['E_E','E_EU1', 'E_E1U6', 'E_E6U15', 'E_E15U18','E_E25U55','E_E55U65','E_E65U80', 'E_E80U110'], inplace=True)
    return df

def change_building_bin(df):
    """Change the bin of Building-Age"""
    df['B_1940'] = df['bis_1900'] + df['x1901_1910'] + df['1911-1920'] + df['1921-1930']+ df['1930-1940']
    df['B_1941_1990'] = df['1941_1950'] + df['1951_1960'] + df['1961-1970'] + df['1971-1980'] + df['1980-1990']
    df['B_1991_2000'] = df['1991-2000'] + df['2001-2010'] + df['2010-2015']
    df.drop(columns=['bis_1900','x1901_1910','1911-1920', '1921-1930', '1930-1940',
    '1941_1950', '1951_1960','1961-1970', '1971-1980','1980-1990',
    '1991-2000', '2001-2010','2010-2015'], inplace=True)
    return df


def features_corr(df):
    """create list of pearson correlation"""
    corr = df.corr()
    corr_df = corr.unstack().reset_index() #Unstack correlation matrix
    corr_df.columns = ["feature_1", "feature_2", "correlation"] #Rename the columns
    corr_df.sort_values(by="correlation", ascending=False, inplace=True)
    corr_df = corr_df[corr_df["feature_1"] != corr_df["feature_2"]] #remove the self-corr
    return corr_df


def get_final_data(filepath, filename):
    """filepath is the direktory  + name of the data,
    and the filename of the data to be cleaned.
    The final data will be save in the same directory with the name --> final_data <--"""
    df = get_full_data(filepath, filename)
    df = clean_data(df)
    df = change_age_bin(df)
    df = change_building_bin(df)
    df.set_index('PLR_ID', inplace = True)
    df.rename(columns={'activities' : "economic",
        'activiti_1' :"education",'activiti_2' : "health_care",'activiti_3': "public_service"}, inplace=True)
    df.drop(columns=['ant_arbeit', 'ant_transf', 'ant_arbe_1', 'ant_tran_1', 'Kinderar_1',
            'aenderung_', 'wohnungsve','E_EM','E_EW', 'MH_EM', 'MH_EW', 'MH_U1', 'MH_1U6',
           'MH_6U15', 'MH_15U18', 'MH_18U25', 'MH_25U55', 'MH_55U65', 'MH_65U80',
           'MH_80U110','anteil_lei','wohnungs_2','Nummer', 'Name', 'EW','BEZ',
            'BZR_ID', 'PGR_ID', 'ew2015', 'index_left','mobility_b', 'mobility_1'], inplace = True)
    #drop the rows that missing y
    df = df[df['Kinderarmu'].notna()]
    #df.to_csv(f"{filepath}/final_data.csv")
    #df.to_file(f"{filepath}/final_data.shp")
    #print(f" See the final data with this shape {df.shape} csv and shp file in {filepath}")
    return df

In [5]:
df_previous = get_full_data("../raw_data/final_data", "full.shp")
df_previous.head(3)

Unnamed: 0,E_E,E_EM,E_EW,E_EU1,E_E1U6,E_E6U15,E_E15U18,E_E18U25,E_E25U55,E_E55U65,...,spaces_wat,social_com,social_cul,social_eat,social_nig,activities,activiti_1,activiti_2,activiti_3,geometry
0,7270.352618,0.492187,0.507813,0.007894,0.057267,0.137753,0.030203,0.058145,0.446264,0.123359,...,0.0,0.0,2.0,10.0,1.0,0.0,5.0,6.0,4.0,"POLYGON ((399848.954 5815619.506, 399854.727 5..."
1,9797.460933,0.495604,0.504396,0.010169,0.058263,0.089598,0.024142,0.057224,0.600788,0.089757,...,0.0,4.0,2.0,26.0,6.0,4.0,4.0,4.0,5.0,"POLYGON ((393175.012 5821135.577, 393167.174 5..."
2,2037.887335,0.470872,0.529128,0.007609,0.053591,0.083072,0.025393,0.067306,0.355233,0.131022,...,0.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,4.0,"POLYGON ((388394.110 5807792.070, 388379.276 5..."


In [6]:
df_previous.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 542 entries, 0 to 541
Data columns (total 84 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   E_E         542 non-null    float64 
 1   E_EM        542 non-null    float64 
 2   E_EW        542 non-null    float64 
 3   E_EU1       542 non-null    float64 
 4   E_E1U6      542 non-null    float64 
 5   E_E6U15     542 non-null    float64 
 6   E_E15U18    542 non-null    float64 
 7   E_E18U25    542 non-null    float64 
 8   E_E25U55    542 non-null    float64 
 9   E_E55U65    542 non-null    float64 
 10  E_E65U80    542 non-null    float64 
 11  E_E80U110   542 non-null    float64 
 12  MH_E        542 non-null    float64 
 13  MH_EM       542 non-null    float64 
 14  MH_EW       542 non-null    float64 
 15  MH_U1       542 non-null    float64 
 16  MH_1U6      542 non-null    float64 
 17  MH_6U15     542 non-null    float64 
 18  MH_15U18    542 non-null    float64 
 19  

In [34]:
#import the cleaned df: shortcut
df = get_final_data("../raw_data/final_data", "full.shp")

In [35]:
df.columns

Index(['E_E18U25', 'MH_E', 'angebotsmi', 'anteil_soz', 'anteil_sta',
       'entwicklun', 'wohndauer', 'wohnungsum', 'wohnungs_1', 'Kinderarmu',
       'MH_rate', 'area', 'pop_dens', 'laerm', 'luft', 'gruen', 'bio',
       'mobility_p', 'spaces_gre', 'spaces_wat', 'social_com', 'social_cul',
       'social_eat', 'social_nig', 'economic', 'education', 'health_care',
       'public_service', 'geometry', 'E_U18', 'E_E25U65', 'E_E65U110',
       'B_1940', 'B_1941_1990', 'B_1991_2000'],
      dtype='object')

In [36]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 536 entries, 11501341 to 11300616
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   E_E18U25        536 non-null    float64 
 1   MH_E            536 non-null    float64 
 2   angebotsmi      536 non-null    float64 
 3   anteil_soz      536 non-null    float64 
 4   anteil_sta      536 non-null    float64 
 5   entwicklun      536 non-null    float64 
 6   wohndauer       536 non-null    float64 
 7   wohnungsum      536 non-null    float64 
 8   wohnungs_1      536 non-null    float64 
 9   Kinderarmu      536 non-null    float64 
 10  MH_rate         536 non-null    float64 
 11  area            536 non-null    float64 
 12  pop_dens        536 non-null    float64 
 13  laerm           536 non-null    int64   
 14  luft            536 non-null    int64   
 15  gruen           536 non-null    int64   
 16  bio             536 non-null    int64   
 

In [89]:
#These code below are from the "data_prep_pipe.py"
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from termcolor import colored
import mlflow
from memoized_property import memoized_property
from mlflow.tracking import MlflowClient
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score

EXPERIMENT_NAME = "Batch_874_Youth_in_the_city_spatial_regression"
yourname = "Batch_874_Safiaaaaa"
MLFLOW_URI = "https://mlflow.lewagon.ai/"

class Trainer(object):
    def __init__(self, X, y):
        """
            X: pandas DataFrame --> drop the "geometry and the "Kinderarmut"
            y: pandas Series --> "Kinderarmut"
        """
        self.pipeline = None
        self.X = X
        self.y = y
        # for MLFlow
        self.experiment_name = EXPERIMENT_NAME

    def set_experiment_name(self, experiment_name):
        '''defines the experiment name for MLFlow'''
        self.experiment_name = experiment_name

    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        preproc_pipe = Pipeline([
            ('knnimputer', KNNImputer(missing_values= np.nan)),
            ('robustscaler', RobustScaler())
        ])
        self.pipeline = Pipeline([
            ('preproc', preproc_pipe),
            ('linear_model', LinearRegression())
        ])

    def run(self):
        self.set_pipeline()
        self.mlflow_log_param("model", "Linear")
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the r2 score"""
        y_pred = self.pipeline.predict(X_test)
        r2_score = r2_score(y_test, y_pred)
        self.mlflow_log_metric("r2-score", r2_score)
        return round(r2_score, 2)

    def save_model_locally(self):
        """Save the model into a .joblib format"""
        joblib.dump(self.pipeline, 'model.joblib')
        print(colored("model.joblib saved locally", "green"))

    # MLFlow methods
    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)


"""if __name__ == "__main__":
    # Get and clean data
    df = pd.read_csv("../raw_data/final_data/final_data.csv")
    y = df["Kinderarmu"]
    X = df.drop(columns=["Kinderarmu", "geometry"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # Train and save model, locally and
    trainer = Trainer(X=X_train, y=y_train)
    trainer.set_experiment_name('xp2')
    trainer.run()
    r2_score = trainer.evaluate(X_test, y_test)
    print(f"r2_score: {r2_score}")
    trainer.save_model_locally()"""


'if __name__ == "__main__":\n    # Get and clean data\n    df = pd.read_csv("../raw_data/final_data/final_data.csv")\n    y = df["Kinderarmu"]\n    X = df.drop(columns=["Kinderarmu", "geometry"])\n    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n    # Train and save model, locally and\n    trainer = Trainer(X=X_train, y=y_train)\n    trainer.set_experiment_name(\'xp2\')\n    trainer.run()\n    r2_score = trainer.evaluate(X_test, y_test)\n    print(f"r2_score: {r2_score}")\n    trainer.save_model_locally()'

In [90]:
y = df["Kinderarmu"]
X = df.drop(columns=["Kinderarmu", "geometry"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Train and save model, locally and
trainer = Trainer(X=X_train, y=y_train)

In [91]:
X_train

Unnamed: 0_level_0,E_E18U25,MH_E,angebotsmi,anteil_soz,anteil_sta,entwicklun,wohndauer,wohnungsum,wohnungs_1,MH_rate,...,economic,education,health_care,public_service,E_U18,E_E25U65,E_E65U110,B_1940,B_1941_1990,B_1991_2000
PLR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2300418,498.904081,3925.767983,12.689999,7.390000,9.339999,-2.310000,68.169995,16.019999,90.739994,0.497225,...,3.0,12.0,6.0,5.0,1117.650729,5497.447822,781.352423,223.0,62.0,15.0
5200420,436.276381,1808.968477,8.819623,10.447953,19.866688,3.561325,67.088401,3.201744,15.609269,0.312088,...,1.0,2.0,1.0,0.0,844.083822,3032.410523,1483.561925,388.0,1303.0,312.0
8100101,234.271554,2208.203522,11.523231,6.728770,9.544801,-2.015047,62.900546,14.062745,98.777271,0.575629,...,,,,,495.721244,2711.318227,394.843223,119.0,25.0,0.0
12601235,1003.175039,6145.044104,7.579392,1.560794,42.011854,3.406115,69.862274,0.687878,11.065842,0.577522,...,0.0,4.0,1.0,4.0,2748.927406,5124.187596,1764.079845,35.0,326.0,47.0
9301025,651.697241,1565.411361,11.560000,8.940000,23.249999,8.970000,67.939998,0.970000,4.040000,0.139878,...,1.0,0.0,4.0,12.0,1830.526913,6140.144246,2568.885328,1294.0,700.0,1463.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11501341,422.736458,1094.074159,9.750001,1.300001,24.830000,10.270005,62.319997,5.910001,43.489997,0.150484,...,0.0,5.0,6.0,4.0,1694.839276,4141.363733,1011.413151,621.0,73.0,505.0
9401534,485.820020,1399.553929,8.471536,2.409572,25.715453,4.215901,69.114083,4.670153,17.931040,0.204542,...,2.0,3.0,4.0,4.0,1343.650426,4050.920061,961.990490,234.0,174.0,100.0
4500939,456.790296,2997.613245,12.900000,3.140001,3.930002,3.030000,65.660001,27.589994,101.870000,0.460351,...,1.0,4.0,10.0,7.0,918.554572,3762.822137,1373.412512,232.0,163.0,19.0
1100205,227.331611,1596.278069,16.168515,1.852791,16.666337,11.294516,56.716231,15.384517,78.390901,0.619027,...,14.0,0.0,8.0,7.0,299.944519,1606.672330,444.741513,8.0,81.0,56.0


In [84]:
X.isna().sum()

E_E18U25           0
MH_E               0
angebotsmi         0
anteil_soz         0
anteil_sta         0
entwicklun         0
wohndauer          0
wohnungsum         0
wohnungs_1         0
MH_rate            0
area               0
pop_dens           0
laerm              0
luft               0
gruen              0
bio                0
mobility_p        13
spaces_gre        13
spaces_wat        13
social_com        13
social_cul        13
social_eat        13
social_nig        13
economic          13
education         13
health_care       13
public_service    13
E_U18              0
E_E25U65           0
E_E65U110          0
B_1940             0
B_1941_1990        0
B_1991_2000        0
dtype: int64

In [60]:
y.isna().sum()

0

In [92]:
trainer.set_pipeline()

In [93]:
trainer.pipeline

In [94]:
trainer.run()

In [95]:
trainer.pipeline.transform(X_train)

AttributeError: This 'Pipeline' has no attribute 'transform'

In [72]:
X.isna().sum()

E_E18U25           0
MH_E               0
angebotsmi         0
anteil_soz         0
anteil_sta         0
entwicklun         0
wohndauer          0
wohnungsum         0
wohnungs_1         0
MH_rate            0
area               0
pop_dens           0
laerm              0
luft               0
gruen              0
bio                0
mobility_p        13
spaces_gre        13
spaces_wat        13
social_com        13
social_cul        13
social_eat        13
social_nig        13
economic          13
education         13
health_care       13
public_service    13
E_U18              0
E_E25U65           0
E_E65U110          0
B_1940             0
B_1941_1990        0
B_1991_2000        0
dtype: int64

In [88]:
r2_score = trainer.evaluate(X_test, y_test)
print(f"r2_score: {r2_score}")

UnboundLocalError: local variable 'r2_score' referenced before assignment

In [77]:
#try without the trainer class
preproc_pipe = Pipeline([
            ('knnmputer', KNNImputer(missing_values= np.nan)),
            ('robustscaler', RobustScaler())
        ])
pipe_test = Pipeline([
            ('preproc', preproc_pipe),
            ('linear_model', LinearRegression())
        ])
pipe_test

In [75]:
pipe.fit(X_train, y_train)
pd.DataFrame(pipe.transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,-0.616430,-0.012273,1.482777,0.713643,-0.653392,0.687218,-1.230821,0.546345,0.858568,1.497644,...,0.000000,-0.666667,-0.428571,-0.75,-0.506330,-0.876708,-0.673902,-0.772575,-0.349146,-0.263345
1,0.509983,1.056218,-0.241302,0.550665,0.004966,-0.804754,-0.783839,0.375052,0.581714,1.271538,...,2.000000,1.000000,0.857143,-0.50,-0.016396,0.230248,-0.292981,0.204013,-0.375712,-0.291815
2,0.960971,0.135391,-0.682039,1.362102,0.009956,0.022738,0.728577,-0.183957,-0.321296,-0.450412,...,0.666667,1.333333,-0.142857,1.00,0.767144,0.447455,1.484280,0.672241,6.812144,2.106762
3,0.632361,0.686028,-0.786852,-0.521769,2.295543,0.459984,0.883953,-0.649345,-0.447210,-0.103470,...,0.666667,0.000000,0.428571,0.00,1.397576,0.923525,1.140255,-0.816054,-0.129032,-0.334520
4,-0.853873,-0.266650,2.055785,-0.259638,0.091674,2.249309,-1.657267,0.935338,0.725022,1.366222,...,4.333333,-0.666667,0.571429,0.50,-1.118297,-0.921121,-0.748157,-0.789298,-0.436433,0.064057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,0.126574,-0.354163,-0.188805,-0.370788,0.587954,1.943337,-0.667482,-0.125362,0.194095,-0.912191,...,0.000000,0.000000,0.571429,1.00,1.332619,0.756620,0.785418,1.909699,0.091082,2.163701
371,-0.611976,-0.050249,0.972729,0.315658,-0.508249,-0.119217,-0.235418,1.322520,1.033321,1.258546,...,0.000000,0.333333,-0.142857,0.50,-0.989815,-0.638031,-0.783120,-0.628763,-0.629981,-0.256228
372,-0.147122,0.649286,1.484495,0.710728,-0.651139,0.691926,-1.232101,0.547515,0.858160,1.618559,...,0.000000,-0.333333,-0.428571,-1.25,-0.404913,-0.117727,-0.897675,-0.675585,-0.705882,-0.270463
373,1.311031,1.296780,-0.713392,0.196569,0.146538,0.504830,-0.183168,-0.174634,-0.119396,1.006332,...,1.000000,1.666667,1.571429,0.25,0.596607,0.430243,0.139638,0.622074,0.368121,-0.241993


In [78]:
pipe_test.fit(X_train, y_train)

In [79]:
X_train

Unnamed: 0_level_0,E_E18U25,MH_E,angebotsmi,anteil_soz,anteil_sta,entwicklun,wohndauer,wohnungsum,wohnungs_1,MH_rate,...,economic,education,health_care,public_service,E_U18,E_E25U65,E_E65U110,B_1940,B_1941_1990,B_1991_2000
PLR_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1200627,296.753398,2122.799833,14.529999,6.690000,4.410000,6.070000,59.129996,11.909999,87.169994,0.647075,...,1.0,0.0,1.0,2.0,733.991406,1720.990416,528.871326,13.0,104.0,10.0
1401048,626.086669,4334.423256,9.599999,5.880000,15.239999,1.080000,61.659995,10.379999,68.969995,0.598819,...,7.0,5.0,10.0,3.0,1081.484638,4570.266254,960.452169,305.0,97.0,6.0
7601442,757.943722,2428.443549,8.339713,9.912842,15.322078,3.847600,70.220545,5.386883,9.607400,0.231311,...,3.0,6.0,3.0,9.0,1637.222607,5129.351820,2974.075336,445.0,1991.0,343.0
11100307,661.866849,3568.184255,8.040000,0.550000,52.920002,5.310000,71.100002,1.230000,1.330000,0.305357,...,3.0,2.0,7.0,5.0,2084.366518,6354.742526,2584.297183,0.0,162.0,0.0
1100205,227.331611,1596.278069,16.168515,1.852791,16.666337,11.294516,56.716231,15.384517,78.390901,0.619027,...,14.0,0.0,8.0,7.0,299.944519,1606.672330,444.741513,8.0,81.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11501340,513.988072,1415.137219,9.750113,1.300375,24.830151,10.271173,62.318596,5.910261,43.488493,0.132756,...,1.0,2.0,8.0,9.0,2038.294562,5925.135210,2182.270257,815.0,220.0,351.0
4300620,298.055599,2044.196248,13.071519,4.712013,6.797611,3.372823,64.764158,18.842854,98.658064,0.596046,...,1.0,3.0,3.0,7.0,391.072146,2335.339291,405.128015,56.0,30.0,11.0
1200624,433.966598,3492.131780,14.534911,6.675510,4.447061,6.085744,59.122748,11.920449,87.143214,0.672882,...,1.0,1.0,1.0,0.0,805.923184,3674.587350,275.338299,42.0,10.0,9.0
12200309,860.291911,4832.352947,8.250058,4.120140,17.568862,5.459988,65.059899,5.470160,22.880043,0.542217,...,4.0,7.0,15.0,6.0,1516.266639,5085.047262,1450.605529,430.0,293.0,13.0


In [80]:
y_pred = pipe_test.predict(X_test)
r2_score = r2_score(y_test, y_pred)
print(round(r2_score, 2))

0.67
