## Use KNN model

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression
import seaborn as sns
import joblib
from termcolor import colored
import mlflow
from memoized_property import memoized_property
from mlflow.tracking import MlflowClient
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score

In [77]:
df = pd.read_csv("../YouthInTheCity/data/reduced_df.csv")
df.set_index("PLR_ID")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   PLR_ID      536 non-null    int64  
 1   culture     536 non-null    float64
 2   outdoor_fa  536 non-null    float64
 3   outdoor_le  536 non-null    float64
 4   mig_rate    536 non-null    float64
 5   ave_rent    536 non-null    float64
 6   social_hou  536 non-null    float64
 7   public_hou  536 non-null    float64
 8   child_pov   536 non-null    float64
 9   vegpm20     536 non-null    float64
 10  secon_sch   536 non-null    float64
 11  kita        536 non-null    float64
 12  stations    536 non-null    float64
dtypes: float64(12), int64(1)
memory usage: 54.6 KB


## Gridsearch with KNN regressor

In [91]:
pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('model', KNeighborsRegressor())
])

In [92]:
#use get_paraam to find the specific name of the step for grid/pipe
pipe.get_params()

{'memory': None,
 'steps': [('scaler', RobustScaler()), ('model', KNeighborsRegressor())],
 'verbose': False,
 'scaler': RobustScaler(),
 'model': KNeighborsRegressor(),
 'scaler__copy': True,
 'scaler__quantile_range': (25.0, 75.0),
 'scaler__unit_variance': False,
 'scaler__with_centering': True,
 'scaler__with_scaling': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [104]:
# Grid search KNNImputer parameter n_neighbors
grid_search = GridSearchCV(
    pipe,
    param_grid={
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree'],
        'model__weights': ['uniform', 'distance'],
        'model__n_neighbors': [3,4,5,6,7,8,9,10],
        'model__n_jobs': [-1]})

In [109]:
grid_search.fit(X, y)

# Cross validate optimal pipeline
cv_score_knn = cross_val_score(grid_search.best_estimator_,
                X, y,cv=5,
                scoring= 'neg_mean_absolute_error').mean()

grid_search.best_params_

{'model__algorithm': 'auto',
 'model__n_jobs': -1,
 'model__n_neighbors': 7,
 'model__weights': 'distance'}

In [158]:
y = df["child_pov"]
X = df.drop(columns=["child_pov"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
knn_7 = KNeighborsRegressor(n_jobs=-1, n_neighbors= 7, weights='distance')
knn_fit = knn_7.fit(X_train, y_train)

In [162]:
knn_fit.set_params

<bound method BaseEstimator.set_params of KNeighborsRegressor(n_jobs=-1, n_neighbors=7, weights='distance')>

In [155]:
y_pred = knn_7.predict(X_test)
r_score = r2_score(y_test, y_pred)
mean_abs_error = mean_absolute_error(y_test, y_pred)
mean_squ_error = mean_squared_error(y_test, y_pred)
print(r_score, mean_abs_error, mean_squ_error)

0.5255900652765044 8.792228905140947 144.91997298969778


In [146]:
cv_score_knn_r = cross_val_score(grid_search.best_estimator_,
                X, y,cv=5,
                scoring= 'r2').mean()
cv_score_knn_rneg_mean_squared_error

0.550502568606335

In [149]:
cv_score_knn_mse = cross_val_score(grid_search.best_estimator_,
                X, y,cv=5,
                scoring= 'neg_mean_squared_error').mean()
cv_score_knn_mse

-121.02395878368591

In [110]:
cv_score_knn

-8.271982132387375

## Gridsearch with SGDRegessor

In [111]:
pipe_sgd = Pipeline([
    ('scaler', RobustScaler()),
    ('model', SGDRegressor())
])

In [112]:
pipe_sgd.get_params()

{'memory': None,
 'steps': [('scaler', RobustScaler()), ('model', SGDRegressor())],
 'verbose': False,
 'scaler': RobustScaler(),
 'model': SGDRegressor(),
 'scaler__copy': True,
 'scaler__quantile_range': (25.0, 75.0),
 'scaler__unit_variance': False,
 'scaler__with_centering': True,
 'scaler__with_scaling': True,
 'model__alpha': 0.0001,
 'model__average': False,
 'model__early_stopping': False,
 'model__epsilon': 0.1,
 'model__eta0': 0.01,
 'model__fit_intercept': True,
 'model__l1_ratio': 0.15,
 'model__learning_rate': 'invscaling',
 'model__loss': 'squared_error',
 'model__max_iter': 1000,
 'model__n_iter_no_change': 5,
 'model__penalty': 'l2',
 'model__power_t': 0.25,
 'model__random_state': None,
 'model__shuffle': True,
 'model__tol': 0.001,
 'model__validation_fraction': 0.1,
 'model__verbose': 0,
 'model__warm_start': False}

In [114]:
# Grid search KNNImputer parameter n_neighbors
grid_search_sgd = GridSearchCV(
    pipe_sgd,
    param_grid={
        'model__penalty': ['l2', 'l1','elasticnet'],
        'model__alpha': [0.001, 0.0001, 0.00001],
        'model__learning_rate': ['invscaling','constant','optimal','adaptive'],
        })
grid_search_sgd.fit(X, y)

# Cross validate optimal pipeline
cv_score_sgd = cross_val_score(grid_search_sgd.best_estimator_,
                X, y,cv=5,
                scoring= 'neg_mean_absolute_error').mean()
cv_score_sgd

-8.022795257341425

In [145]:
# Cross validate optimal pipeline
cv_score_sgd_r = cross_val_score(grid_search_sgd.best_estimator_,
                X, y,cv=5,
                scoring= 'r2').mean()
cv_score_sgd_r

0.61297894950106

In [150]:
cv_score_sgd_mse = cross_val_score(grid_search_sgd.best_estimator_,
                X, y,cv=5,
                scoring= 'neg_mean_squared_error').mean()
cv_score_sgd_mse

-103.48374414786306

In [117]:
grid_search_sgd.best_params_

{'model__alpha': 0.0001,
 'model__learning_rate': 'invscaling',
 'model__penalty': 'l2'}

## Gridsearch with Linear model

In [120]:
pipe_linear = Pipeline([
    ('scaler', RobustScaler()),
    ('model', LinearRegression())
])

In [122]:
pipe_linear.get_params()

{'memory': None,
 'steps': [('scaler', RobustScaler()), ('model', LinearRegression())],
 'verbose': False,
 'scaler': RobustScaler(),
 'model': LinearRegression(),
 'scaler__copy': True,
 'scaler__quantile_range': (25.0, 75.0),
 'scaler__unit_variance': False,
 'scaler__with_centering': True,
 'scaler__with_scaling': True,
 'model__copy_X': True,
 'model__fit_intercept': True,
 'model__n_jobs': None,
 'model__normalize': 'deprecated',
 'model__positive': False}

In [124]:
# Grid search KNNImputer parameter n_neighbors
grid_search_linear = GridSearchCV(
    pipe_linear,
    param_grid={
        'model__fit_intercept': [True, False],
        'model__n_jobs':[ -1]})
grid_search_linear.fit(X, y)

# Cross validate optimal pipeline
cv_score_linear = cross_val_score(grid_search_linear.best_estimator_,
                X, y,cv=5,
                scoring= 'neg_mean_absolute_error').mean()
cv_score_linear

-8.017459153093736

In [147]:
cv_score_linear_r = cross_val_score(grid_search_linear.best_estimator_,
                X, y,cv=5,
                scoring= 'r2').mean()
cv_score_linear_r

0.6125325646608202

In [151]:
cv_score_linear_mse = cross_val_score(grid_search_linear.best_estimator_,
                X, y,cv=5,
                scoring= 'neg_mean_squared_error').mean()
cv_score_linear_mse

-103.52230810783074

In [125]:
grid_search_linear.best_params_

{'model__fit_intercept': True, 'model__n_jobs': -1}

In [142]:
neg_mean = {"Linear model":cv_score_linear, 
            "KNNRegressor": cv_score_knn,
           "SGDRegressor":cv_score_sgd}
neg_mean_error = pd.DataFrame(data = neg_mean, index=[0]).T
neg_mean_error.rename(columns={0:"neg_mean_absolute_error"}, inplace=True)

In [148]:
neg_mean_error["r2_score"] = [cv_score_linear_r, cv_score_knn_r, cv_score_sgd_r]
neg_mean_error

Unnamed: 0,neg_mean_absolute_error,r2_score
Linear model,-8.017459,0.612533
KNNRegressor,-8.271982,0.550503
SGDRegressor,-8.022795,0.612979


In [152]:
neg_mean_error["mse"] = [cv_score_linear_mse, cv_score_knn_mse, cv_score_sgd_mse]
neg_mean_error

Unnamed: 0,neg_mean_absolute_error,r2_score,mse
Linear model,-8.017459,0.612533,-103.522308
KNNRegressor,-8.271982,0.550503,-121.023959
SGDRegressor,-8.022795,0.612979,-103.483744


In [86]:
models = [LinearRegression(), KNeighborsRegressor(), SGDRegressor()]
metrics_df = pd.DataFrame(columns = ["mean_abs_error", "mean_squ_error", "r_score"])
for model in models:
    preproc_pipe = Pipeline([
            ('knnimputer', KNNImputer(missing_values= np.nan)),
            ('robustscaler', RobustScaler())
        ])
    pipeline = Pipeline([
            ('preproc', preproc_pipe),
            ('model', model)
        ])
    y = df["child_pov"]
    X = df.drop(columns=["child_pov"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    r_score = r2_score(y_test, y_pred)
    mean_abs_error = mean_absolute_error(y_test, y_pred)
    mean_squ_error = mean_squared_error(y_test, y_pred)
    metrics_df = pd.DataFrame(data = [mean_abs_error, mean_squ_error, r_score], axis=1)
metrics_df

TypeError: __init__() got an unexpected keyword argument 'axis'

In [85]:
metrics_dft = pd.DataFrame(columns = ["mean_abs_error", "mean_squ_error", "r_score"])
metrics_dft

Unnamed: 0,mean_abs_error,mean_squ_error,r_score


In [13]:
EXPERIMENT_NAME = "non-spatial_regression"
yourname = "Batch_874_Batch_874_Youth_in_the_city"
MLFLOW_URI = "https://mlflow.lewagon.ai/"
class Trainer(object):
    def __init__(self, X, y):
        """
            X: pandas DataFrame --> drop the "geometry and the "Kinderarmut"
            y: pandas Series --> "Kinderarmut"
        """
        self.pipeline = None
        self.X = X
        self.y = y
        # for MLFlow
        self.experiment_name = EXPERIMENT_NAME

    def set_experiment_name(self, experiment_name):
        '''defines the experiment name for MLFlow'''
        self.experiment_name = experiment_name

    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        preproc_pipe = Pipeline([
            ('knnimputer', KNNImputer(missing_values= np.nan)),
            ('robustscaler', RobustScaler())
        ])
        self.pipeline = Pipeline([
            ('preproc', preproc_pipe),
            ('linear_model', LinearRegression())
        ])

    def run(self):
        self.set_pipeline()
        self.mlflow_log_param("model", "Linear")
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the r2 score"""
        y_pred = self.pipeline.predict(X_test)
        r_score = r2_score(y_test, y_pred)
        mean_abs_error = mean_absolute_error(y_test, y_pred)
        mean_squ_error = mean_squared_error(y_test, y_pred)
        self.mlflow_log_metric("mean_abs_error", mean_abs_error)
        metrics_df = pd.DataFrame(data = [mean_abs_error, mean_squ_error, r_score])
        return metrics_df

    def save_model_locally(self):
        """Save the model into a .joblib format"""
        joblib.dump(self.pipeline, 'model.joblib')
        print(colored("model.joblib saved locally", "green"))

    # MLFlow methods
    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

In [54]:
y = df["child_pov"]
X = df.drop(columns=["child_pov"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Train and save model, locally and
trainer = Trainer(X=X_train, y=y_train)
trainer.set_experiment_name('linear')
trainer.run()
linear = trainer.evaluate(X_test, y_test)
print(linear)
trainer.save_model_locally()

           0
0   7.128310
1  82.027018
2   0.684207
[32mmodel.joblib saved locally[0m


In [15]:
EXPERIMENT_NAME = "non-spatial_regression"
yourname = "Batch_874_Batch_874_Youth_in_the_city"
MLFLOW_URI = "https://mlflow.lewagon.ai/"
class Trainer_KNN(object):
    def __init__(self, X, y):
        """
            X: pandas DataFrame --> drop the "geometry and the "Kinderarmut"
            y: pandas Series --> "Kinderarmut"
        """
        self.pipeline = None
        self.X = X
        self.y = y
        # for MLFlow
        self.experiment_name = EXPERIMENT_NAME

    def set_experiment_name(self, experiment_name):
        '''defines the experiment name for MLFlow'''
        self.experiment_name = experiment_name

    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        preproc_pipe = Pipeline([
            ('knnimputer', KNNImputer(missing_values= np.nan)),
            ('robustscaler', RobustScaler())
        ])
        self.pipeline = Pipeline([
            ('preproc', preproc_pipe),
            ('KNN_model', KNeighborsRegressor())
        ])

    def run(self):
        self.set_pipeline()
        self.mlflow_log_param("model", "Linear")
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the r2 score"""
        y_pred = self.pipeline.predict(X_test)
        r_score = r2_score(y_test, y_pred)
        mean_abs_error = mean_absolute_error(y_test, y_pred)
        mean_squ_error = mean_squared_error(y_test, y_pred)
        self.mlflow_log_metric("mean_abs_error", mean_abs_error)
        metrics_df = pd.DataFrame(data = [mean_abs_error, mean_squ_error, r_score])
        return metrics_df

    def save_model_locally(self):
        """Save the model into a .joblib format"""
        joblib.dump(self.pipeline, 'model.joblib')
        print(colored("model.joblib saved locally", "green"))

    # MLFlow methods
    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

In [16]:
#KNN
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Train and save model, locally and
trainer_knn = Trainer(X=X_train, y=y_train)
trainer_knn.set_experiment_name('KNN')
trainer_knn.run()
knn_reg = trainer_knn.evaluate(X_test, y_test)
print(knn_reg)
trainer_knn.save_model_locally()

           0
0   7.853060
1  95.744934
2   0.646386
[32mmodel.joblib saved locally[0m


In [17]:
EXPERIMENT_NAME = "non-spatial_regression"
yourname = "Batch_874_Batch_874_Youth_in_the_city"
MLFLOW_URI = "https://mlflow.lewagon.ai/"
class Trainer_SGD(object):
    def __init__(self, X, y):
        """
            X: pandas DataFrame --> drop the "geometry and the "Kinderarmut"
            y: pandas Series --> "Kinderarmut"
        """
        self.pipeline = None
        self.X = X
        self.y = y
        # for MLFlow
        self.experiment_name = EXPERIMENT_NAME

    def set_experiment_name(self, experiment_name):
        '''defines the experiment name for MLFlow'''
        self.experiment_name = experiment_name

    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        preproc_pipe = Pipeline([
            ('knnimputer', KNNImputer(missing_values= np.nan)),
            ('robustscaler', RobustScaler())
        ])
        self.pipeline = Pipeline([
            ('preproc', preproc_pipe),
            ('KNN_model', SGDRegressor())
        ])

    def run(self):
        self.set_pipeline()
        self.mlflow_log_param("model", "SGD")
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the r2 score"""
        y_pred = self.pipeline.predict(X_test)
        r_score = r2_score(y_test, y_pred)
        mean_abs_error = mean_absolute_error(y_test, y_pred)
        mean_squ_error = mean_squared_error(y_test, y_pred)
        self.mlflow_log_metric("mean_abs_error", mean_abs_error)
        metrics_df = pd.DataFrame(data = [mean_abs_error, mean_squ_error, r_score])
        return metrics_df

    def save_model_locally(self):
        """Save the model into a .joblib format"""
        joblib.dump(self.pipeline, 'model_sgd.joblib')
        print(colored("model_sgd.joblib saved locally", "green"))

    # MLFlow methods
    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

In [18]:
#SGD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Train and save model, locally and
trainer_sgd = Trainer(X=X_train, y=y_train)
trainer_sgd.set_experiment_name('SGD')
trainer_sgd.run()
sgd_reg = trainer_knn.evaluate(X_test, y_test)
print(sgd_reg)
trainer_sgd.save_model_locally()

           0
0   7.947480
1  91.630146
2   0.681859
[32mmodel.joblib saved locally[0m


In [73]:
trainer_knn.summary

AttributeError: 'Trainer' object has no attribute 'summary'

In [38]:
metrics_df = linear.merge(knn_reg)
metrics_df

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [51]:
knn_df = pd.DataFrame(knn_reg)
sgd_df = pd.DataFrame(sgd_reg)
linear_df = pd.DataFrame(linear)

In [55]:
linear

Unnamed: 0,0
0,7.12831
1,82.027018
2,0.684207


In [67]:
metrics_df = pd.concat([linear, knn_df,sgd_df],axis = 1)
metrics_df#.rename(columns = {0:"linear", 0: "KNN1", 2: "SGD" }, inplace=True)

Unnamed: 0,0,0.1,0.2
0,7.12831,7.85306,7.94748
1,82.027018,95.744934,91.630146
2,0.684207,0.646386,0.681859


In [70]:
#metrics_df.rename(columns = ["linear", "KNN", "SGD"], inplace=True)
metrics_df.rename(index={0: "mean_abs_error", 1: "mean_squ_error", 2: "r2_score" }, inplace=True)
metrics_df

Unnamed: 0,0,0.1,0.2
mean_abs_error,7.12831,7.85306,7.94748
mean_squ_error,82.027018,95.744934,91.630146
r2_score,0.684207,0.646386,0.681859


In [72]:
metrics_df.rename(columns = {0:"linear", 0:"KNN", 0:"SGD"}, inplace=True)
metrics_df

Unnamed: 0,2,2.1,2.2
mean_abs_error,7.12831,7.85306,7.94748
mean_squ_error,82.027018,95.744934,91.630146
r2_score,0.684207,0.646386,0.681859


In [None]:
#metrics_df = metrics_df.T
#metrics_df.rename(columns = {0:"mean_abs_error", 1: "mean_squ_error", 2: "r2_score" }, inplace=True)
#metrics_df.set_index("linear", inplace=True)
metrics_df.rename(columns = {"mean_abs_error":linear}, inplace=True)

In [30]:
metrics_df.append(knn_reg.T)

  metrics_df.append(knn_reg.T)


Unnamed: 0,mean_abs_error,mean_squ_error,r2_score,0,1,2
0,7.430035,86.164902,0.681061,,,
0,,,,7.85306,95.744934,0.646386
