This notebook was converted from "data_clean.py".

In [1]:
# All functions were transferred from the notebook "Nicha-data-preproc".
# TODO: don't forget to drop the highly correlated features!
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

def get_full_data(filepath, filename):
    """Read `filename` from the directory `filepath` with geopandas and return it unchanged."""
    full_path = f"{filepath}/{filename}"
    return gpd.read_file(full_path)

def clean_data(df):
    """Return `df` without the leftover "Unnamed: 0" column.

    When the column is absent the very same object is handed back; otherwise
    a new frame without that column is returned.
    """
    unused_column = "Unnamed: 0"
    if unused_column not in df.keys():
        return df
    return df.drop(columns=[unused_column])

def change_age_bin(df):
    """Change the bins of Population-Age (E).

    Every age column is multiplied by the total column "E_E" (the raw data
    preview shows fractional values there, so this presumably converts shares
    into head counts), then the fine-grained bins are merged:

    * E_U18     = E_EU1 + E_E1U6 + E_E6U15 + E_E15U18
    * E_E25U65  = E_E25U55 + E_E55U65
    * E_E65U110 = E_E65U80 + E_E80U110

    "E_E18U25" is rescaled and kept; "E_E" and the merged source columns are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame (or GeoDataFrame)
        Must contain "E_E" and the nine age columns listed below.

    Returns
    -------
    A new frame of the same type; the input frame is left unmodified, so the
    function can safely be called again on the same raw data.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    age_columns = ['E_EU1', 'E_E1U6', 'E_E6U15', 'E_E15U18',
                   'E_E18U25', 'E_E25U55', 'E_E55U65', 'E_E65U80', 'E_E80U110']
    # Work on a copy: mutating the caller's frame made a second call on the
    # same object fail with KeyError because the source columns were gone.
    out = df.copy()
    for column in age_columns:
        out[column] = out[column] * out["E_E"]
    out['E_U18'] = out['E_EU1'] + out['E_E1U6'] + out['E_E6U15'] + out['E_E15U18']
    out['E_E25U65'] = out['E_E25U55'] + out['E_E55U65']
    out['E_E65U110'] = out['E_E65U80'] + out['E_E80U110']
    # Everything except E_E18U25 has been folded into a wider bin.
    merged_columns = ['E_E'] + [column for column in age_columns if column != 'E_E18U25']
    return out.drop(columns=merged_columns)

def change_building_bin(df):
    """Change the bins of Building-Age.

    Merges the thirteen construction-period columns into three:

    * B_1940      = bis_1900 + x1901_1910 + 1911-1920 + 1921-1930 + 1930-1940
    * B_1941_1990 = 1941_1950 + 1951_1960 + 1961-1970 + 1971-1980 + 1980-1990
    * B_1991_2000 = 1991-2000 + 2001-2010 + 2010-2015

    NOTE(review): despite its name, "B_1991_2000" also includes the 2001-2010
    and 2010-2015 columns. The name is kept because downstream code refers to it.

    Parameters
    ----------
    df : pandas.DataFrame (or GeoDataFrame)
        Must contain all thirteen source columns.

    Returns
    -------
    A new frame with the three merged columns appended and the source columns
    removed; the input frame is left unmodified.

    Raises
    ------
    KeyError
        If one of the source columns is missing.
    """
    bins = {
        'B_1940': ['bis_1900', 'x1901_1910', '1911-1920', '1921-1930', '1930-1940'],
        'B_1941_1990': ['1941_1950', '1951_1960', '1961-1970', '1971-1980', '1980-1990'],
        'B_1991_2000': ['1991-2000', '2001-2010', '2010-2015'],
    }
    # Work on a copy: mutating the caller's frame made a second call on the
    # same object fail with KeyError because the source columns were gone.
    out = df.copy()
    for merged_name, source_columns in bins.items():
        # Plain left-to-right addition, so missing values propagate into the sum.
        total = out[source_columns[0]]
        for column in source_columns[1:]:
            total = total + out[column]
        out[merged_name] = total
    obsolete = [column for source_columns in bins.values() for column in source_columns]
    return out.drop(columns=obsolete)


def features_corr(df):
    """Create a long-format list of pairwise Pearson correlations.

    Parameters
    ----------
    df : pandas.DataFrame (or GeoDataFrame)
        Only numeric and boolean columns are used; other columns (for example
        a geometry or text column) are ignored.

    Returns
    -------
    pandas.DataFrame with the columns "feature_1", "feature_2" and
    "correlation", sorted by correlation in descending order. Each pair
    appears twice (a/b and b/a); a feature paired with itself is removed.
    """
    # Restrict to numeric data explicitly: since pandas 2.0 DataFrame.corr()
    # no longer drops non-numeric columns silently and raises on them instead.
    numeric = df.select_dtypes(include=["number", "bool"])
    corr = numeric.corr()
    pairs = corr.unstack().reset_index()  # one row per (feature, feature) cell
    pairs.columns = ["feature_1", "feature_2", "correlation"]
    pairs = pairs.sort_values(by="correlation", ascending=False)
    return pairs[pairs["feature_1"] != pairs["feature_2"]]  # remove the self-corr


def get_final_data(filepath, filename):
    """Load the raw file and return the cleaned, re-binned frame.

    `filepath` is the directory that holds the data and `filename` is the
    file to be cleaned. The frame is indexed by "PLR_ID", the activity columns
    get readable names, unused columns are dropped and rows without a
    "Kinderarmu" value are removed.

    The result is only returned: the export lines at the end are commented
    out, so nothing is written to disk.
    """
    activity_names = {
        'activities': "economic",
        'activiti_1': "education",
        'activiti_2': "health_care",
        'activiti_3': "public_service",
    }
    unused_columns = [
        'ant_arbeit', 'ant_transf', 'ant_arbe_1', 'ant_tran_1', 'Kinderar_1',
        'aenderung_', 'wohnungsve', 'E_EM', 'E_EW', 'MH_EM', 'MH_EW', 'MH_U1', 'MH_1U6',
        'MH_6U15', 'MH_15U18', 'MH_18U25', 'MH_25U55', 'MH_55U65', 'MH_65U80',
        'MH_80U110', 'anteil_lei', 'wohnungs_2', 'Nummer', 'Name', 'EW', 'BEZ',
        'BZR_ID', 'PGR_ID', 'ew2015', 'index_left', 'mobility_b', 'mobility_1',
    ]

    raw = get_full_data(filepath, filename)
    binned = change_building_bin(change_age_bin(clean_data(raw)))
    trimmed = (
        binned
        .set_index('PLR_ID')
        .rename(columns=activity_names)
        .drop(columns=unused_columns)
    )
    # keep only the rows that have a target value (y)
    final = trimmed[trimmed['Kinderarmu'].notna()]
    #final.to_csv(f"{filepath}/final_data.csv")
    #final.to_file(f"{filepath}/final_data.shp")
    #print(f" See the final data with this shape {final.shape} csv and shp file in {filepath}")
    return final



In [2]:
# Load the raw shapefile as-is (no cleaning applied) to inspect the original columns
df_previous = get_full_data("../raw_data/final_data", "full.shp")
df_previous.head(3)

Unnamed: 0,E_E,E_EM,E_EW,E_EU1,E_E1U6,E_E6U15,E_E15U18,E_E18U25,E_E25U55,E_E55U65,...,spaces_wat,social_com,social_cul,social_eat,social_nig,activities,activiti_1,activiti_2,activiti_3,geometry
0,7270.352618,0.492187,0.507813,0.007894,0.057267,0.137753,0.030203,0.058145,0.446264,0.123359,...,0.0,0.0,2.0,10.0,1.0,0.0,5.0,6.0,4.0,"POLYGON ((399848.954 5815619.506, 399854.727 5..."
1,9797.460933,0.495604,0.504396,0.010169,0.058263,0.089598,0.024142,0.057224,0.600788,0.089757,...,0.0,4.0,2.0,26.0,6.0,4.0,4.0,4.0,5.0,"POLYGON ((393175.012 5821135.577, 393167.174 5..."
2,2037.887335,0.470872,0.529128,0.007609,0.053591,0.083072,0.025393,0.067306,0.355233,0.131022,...,0.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,4.0,"POLYGON ((388394.110 5807792.070, 388379.276 5..."


In [3]:
# Column dtypes and non-null counts of the raw, uncleaned frame
df_previous.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 542 entries, 0 to 541
Data columns (total 84 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   E_E         542 non-null    float64 
 1   E_EM        542 non-null    float64 
 2   E_EW        542 non-null    float64 
 3   E_EU1       542 non-null    float64 
 4   E_E1U6      542 non-null    float64 
 5   E_E6U15     542 non-null    float64 
 6   E_E15U18    542 non-null    float64 
 7   E_E18U25    542 non-null    float64 
 8   E_E25U55    542 non-null    float64 
 9   E_E55U65    542 non-null    float64 
 10  E_E65U80    542 non-null    float64 
 11  E_E80U110   542 non-null    float64 
 12  MH_E        542 non-null    float64 
 13  MH_EM       542 non-null    float64 
 14  MH_EW       542 non-null    float64 
 15  MH_U1       542 non-null    float64 
 16  MH_1U6      542 non-null    float64 
 17  MH_6U15     542 non-null    float64 
 18  MH_15U18    542 non-null    float64 
 19  

In [4]:
# Shortcut: load the raw file and run the whole cleaning chain in one call
df = get_final_data("../raw_data/final_data", "full.shp")

In [5]:
# Columns that remain after cleaning and re-binning
df.columns

Index(['E_E18U25', 'MH_E', 'angebotsmi', 'anteil_soz', 'anteil_sta',
       'entwicklun', 'wohndauer', 'wohnungsum', 'wohnungs_1', 'Kinderarmu',
       'MH_rate', 'area', 'pop_dens', 'laerm', 'luft', 'gruen', 'bio',
       'mobility_p', 'spaces_gre', 'spaces_wat', 'social_com', 'social_cul',
       'social_eat', 'social_nig', 'economic', 'education', 'health_care',
       'public_service', 'geometry', 'E_U18', 'E_E25U65', 'E_E65U110',
       'B_1940', 'B_1941_1990', 'B_1991_2000'],
      dtype='object')

In [6]:
# Column dtypes and non-null counts of the cleaned frame
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 536 entries, 11501341 to 11300616
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   E_E18U25        536 non-null    float64 
 1   MH_E            536 non-null    float64 
 2   angebotsmi      536 non-null    float64 
 3   anteil_soz      536 non-null    float64 
 4   anteil_sta      536 non-null    float64 
 5   entwicklun      536 non-null    float64 
 6   wohndauer       536 non-null    float64 
 7   wohnungsum      536 non-null    float64 
 8   wohnungs_1      536 non-null    float64 
 9   Kinderarmu      536 non-null    float64 
 10  MH_rate         536 non-null    float64 
 11  area            536 non-null    float64 
 12  pop_dens        536 non-null    float64 
 13  laerm           536 non-null    int64   
 14  luft            536 non-null    int64   
 15  gruen           536 non-null    int64   
 16  bio             536 non-null    int64   
 

In [7]:
# The code below is from "data_prep_pipe.py"
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from termcolor import colored
import mlflow
from memoized_property import memoized_property
from mlflow.tracking import MlflowClient
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score

EXPERIMENT_NAME = "Batch_874_Youth_in_the_city_spatial_regression"
yourname = "Batch_874_Safiaaaaa"
MLFLOW_URI = "https://mlflow.lewagon.ai/"

class Trainer(object):
    """Train and evaluate a linear-regression pipeline and log to MLflow.

    The pipeline imputes missing values (KNNImputer), scales the features
    (RobustScaler) and fits a LinearRegression. Parameters and metrics are
    logged to the tracking server at MLFLOW_URI under `experiment_name`.
    """

    def __init__(self, X, y):
        """
            X: pandas DataFrame --> drop the "geometry" and the "Kinderarmut"
            y: pandas Series --> "Kinderarmut"
        """
        self.pipeline = None  # built by set_pipeline(), fitted by run()
        self.X = X
        self.y = y
        # for MLFlow
        self.experiment_name = EXPERIMENT_NAME

    def set_experiment_name(self, experiment_name):
        '''defines the experiment name for MLFlow'''
        self.experiment_name = experiment_name

    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        preproc_pipe = Pipeline([
            ('knnimputer', KNNImputer(missing_values=np.nan)),
            ('robustscaler', RobustScaler())
        ])
        self.pipeline = Pipeline([
            ('preproc', preproc_pipe),
            ('linear_model', LinearRegression())
        ])

    def run(self):
        """Build the pipeline, log the model type and fit on self.X / self.y."""
        self.set_pipeline()
        self.mlflow_log_param("model", "Linear")
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on X_test / y_test and returns the r2 score

        The unrounded score is logged to MLflow as "r2-score"; the returned
        value is rounded to two decimals. run() must have been called first.
        """
        y_pred = self.pipeline.predict(X_test)
        # Use a distinct local name: assigning to `r2_score` inside this method
        # would make the name local and shadow the imported sklearn function,
        # so the call itself would raise UnboundLocalError.
        score = r2_score(y_test, y_pred)
        self.mlflow_log_metric("r2-score", score)
        return round(score, 2)

    def save_model_locally(self):
        """Save the model into a .joblib format"""
        joblib.dump(self.pipeline, 'model.joblib')
        print(colored("model.joblib saved locally", "green"))

    # MLFlow methods
    @memoized_property
    def mlflow_client(self):
        """MLflow client bound to MLFLOW_URI (memoized property)."""
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        """Id of the experiment named `experiment_name`, creating it if possible."""
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except Exception:
            # Creation failed (presumably because the experiment already
            # exists), so look it up by name. Catching Exception rather than
            # BaseException lets KeyboardInterrupt / SystemExit propagate.
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        """MLflow run created in the current experiment (memoized property)."""
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        """Log a single parameter to the current MLflow run."""
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        """Log a single metric to the current MLflow run."""
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)


"""if __name__ == "__main__":
    # Get and clean data
    df = pd.read_csv("../raw_data/final_data/final_data.csv")
    y = df["Kinderarmu"]
    X = df.drop(columns=["Kinderarmu", "geometry"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # Train and save model, locally and
    trainer = Trainer(X=X_train, y=y_train)
    trainer.set_experiment_name('xp2')
    trainer.run()
    r2_score = trainer.evaluate(X_test, y_test)
    print(f"r2_score: {r2_score}")
    trainer.save_model_locally()"""


'if __name__ == "__main__":\n    # Get and clean data\n    df = pd.read_csv("../raw_data/final_data/final_data.csv")\n    y = df["Kinderarmu"]\n    X = df.drop(columns=["Kinderarmu", "geometry"])\n    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n    # Train and save model, locally and\n    trainer = Trainer(X=X_train, y=y_train)\n    trainer.set_experiment_name(\'xp2\')\n    trainer.run()\n    r2_score = trainer.evaluate(X_test, y_test)\n    print(f"r2_score: {r2_score}")\n    trainer.save_model_locally()'

In [8]:
y = df["Kinderarmu"]
X = df.drop(columns=["Kinderarmu", "geometry"])
# Fixed seed so the train/test split (and everything derived from it) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train the model on the training split; run() also logs the model type to MLflow
trainer = Trainer(X=X_train, y=y_train)
trainer.set_experiment_name('xp2')
trainer.run()

### Perform permutation importance
from sklearn.inspection import permutation_importance
- fit model 

model = LogisticRegression().fit(X,y)
- Perform permutation

permutation_score = permutation_importance(model, X,y, n_repeats=100)
- Unstack results, put in df

importance_df = pd.DataFrame(np.vstack((X.columns,permutation_score.importances_mean)).T)
importance_df.columns=['feature','feature importance']

- Order by importance

importance_df.sort_values(by="feature importance", ascending = False, inplace=True)
importance_df

**Remove the features with low importance, then re-run the validation score to test the reduced feature set.**

In [14]:
# Standalone preprocessing pipeline: KNN imputation followed by robust scaling
preproc_steps = [
    ('knnimputer', KNNImputer(missing_values=np.nan)),
    ('robustscaler', RobustScaler()),
]
preproc_pipe = Pipeline(steps=preproc_steps)
preproc_pipe

In [15]:
# Fit the imputer and scaler on the training split only.
# NOTE(review): y_train is passed along but presumably unused by these transformers — confirm.
preproc_pipe.fit(X_train, y_train)

In [25]:
# Apply the fitted preprocessing to the training features. The displayed result is a
# plain array, so the column names of X_train are no longer attached to it.
X_train_p = preproc_pipe.transform(X_train)
X_train_p

array([[-0.40517295, -0.28613287, -0.80702744, ..., -0.45054945,
        -0.11612903, -0.06225681],
       [-1.10074653, -0.84703955,  0.44560275, ..., -0.1978022 ,
        -0.21935484,  1.71984436],
       [ 1.28602909,  0.53967535, -0.28096314, ...,  1.63736264,
         0.15053763,  0.04669261],
       ...,
       [ 0.37695423,  0.8284259 ,  0.97542789, ..., -0.29304029,
         0.46451613, -0.28793774],
       [-0.50209808, -0.5480839 , -0.51579916, ..., -0.43956044,
        -0.46451613,  4.77042802],
       [-0.27291884, -0.09318455, -0.51579982, ..., -0.63736264,
         1.5655914 ,  0.73929961]])

In [26]:
# Fit a plain linear regression on the preprocessed training features
model = LinearRegression().fit(X_train_p,y_train)

In [27]:
from sklearn.inspection import permutation_importance
# Shuffle each feature 100 times and record the resulting change in score.
# Seeded so the importance ranking is reproducible between runs.
# Note: this uses the same training data the model was fitted on.
permutation_score = permutation_importance(model, X_train_p, y_train, n_repeats=100, random_state=42)

In [28]:
# Put the mean permutation importance of every feature into a DataFrame.
# Built from a dict so the importance column keeps its float dtype; stacking
# names and scores into a single array turns every value into a generic object.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'feature importance': permutation_score.importances_mean,
})

In [29]:
# Rank the features from most to least important
importance_df = importance_df.sort_values(by="feature importance", ascending=False)
importance_df

Unnamed: 0,feature,feature importance
9,MH_rate,0.759103
21,social_eat,0.191432
27,E_U18,0.082637
2,angebotsmi,0.080373
28,E_E25U65,0.079589
7,wohnungsum,0.050294
4,anteil_sta,0.037418
6,wohndauer,0.028366
22,social_nig,0.024979
23,economic,0.023384
