# 05 Data Preprocessing & Modelling - Using Pipelines

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
kiva_final = pd.read_csv('data/kiva_hh.csv')

In [5]:
## Delete irrelevant columns:
kiva_model = kiva_final.drop(['activity',  'date', 'borrower_genders', 'country_code',  'hv001', 'hv270',
                              'lat', 'lon', 'posted_time', 'disbursed_time', 'Level', 'asdf_id',
                              'loan_amount_usd'
                              #'partner_id', 'dist_coast', 'avg_pre',
                              #'avg_temp', 'pop_dens', 'conflict_deaths'
                                                         ], axis=1)
kiva_model.head(3)

Unnamed: 0,sector,region,partner_id,term_in_months,lender_count,repayment_interval,year,funded_amount_usd,gender,hv271,avg_pre,avg_temp,dist_coast,pop_dens,conflict_deaths,nightlite
0,Food,Bagamoyo,379.0,8.0,6.0,irregular,2015.0,0.138094,female,128721.0,72.890916,26.408804,49414.959752,25.266386,0.0,0.192784
1,Retail,Bagamoyo,379.0,14.0,11.0,monthly,2016.0,0.218182,female,128721.0,72.890916,26.408804,49414.959752,25.266386,0.0,0.192784
2,Clothing,Bagamoyo,379.0,8.0,10.0,irregular,2014.0,0.287316,female,128721.0,72.890916,26.408804,49414.959752,25.266386,0.0,0.192784


In [6]:
### Converting partner_id from numeric to object:
kiva_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90044 entries, 0 to 90043
Data columns (total 16 columns):
sector                90044 non-null object
region                90044 non-null object
partner_id            90044 non-null float64
term_in_months        90044 non-null float64
lender_count          90044 non-null float64
repayment_interval    90044 non-null object
year                  90044 non-null float64
funded_amount_usd     90044 non-null float64
gender                90044 non-null object
hv271                 90044 non-null float64
avg_pre               90044 non-null float64
avg_temp              90044 non-null float64
dist_coast            90044 non-null float64
pop_dens              90044 non-null float64
conflict_deaths       90044 non-null float64
nightlite             90044 non-null float64
dtypes: float64(12), object(4)
memory usage: 11.0+ MB


In [6]:
### partner id should be a categorical varialbe:
kiva_model['partner_id'] = kiva_model['partner_id'].astype(object)

In [15]:
kiva_model.columns

Index(['sector', 'region', 'partner_id', 'term_in_months', 'lender_count',
       'repayment_interval', 'year', 'funded_amount_usd', 'gender', 'hv271',
       'avg_pre', 'avg_temp', 'dist_coast', 'pop_dens', 'conflict_deaths',
       'nightlite'],
      dtype='object')

## 5.1 First - Regression based on individual region (each region is one observation; n=380)

In [32]:
# get the mean for all numerica variables:
kiva_regional_num = kiva_model.groupby("region").mean()
# get the mode for the categorical variables:
## select all the categorical variables:
kiva_regional_cat = kiva_model.select_dtypes(include=['object'])
## get the mode:
kiva_regional_cat = kiva_regional_cat.groupby("region").agg(lambda x:x.value_counts().index[0])
## merge them together
kiva_regional = pd.merge(kiva_regional_num,
                        kiva_regional_cat,
                        on='region',
                        how='left',
                        validate='1:1')
kiva_regional.head()

Unnamed: 0_level_0,term_in_months,lender_count,year,funded_amount_usd,hv271,avg_pre,avg_temp,dist_coast,pop_dens,conflict_deaths,nightlite,sector,partner_id,repayment_interval,gender
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AHERO,12.5,9.5,2017.0,2.901061,32972.0,148.403614,22.593334,650122.832539,275.565545,0.0,0.495094,Transportation,133.0,monthly,male
APAC,12.0,8.5,2016.0,0.120611,-54954.0,115.563147,24.615531,993336.427981,78.974432,0.0,0.311204,Food,222.0,monthly,female
Abaita Ababiri,14.0,24.642857,2014.714286,0.327104,109429.0,152.414997,23.240448,927067.057323,528.374088,0.0,1.393152,Retail,65.0,monthly,female
"Aldina, Jomvu",10.0,7.333333,2017.0,2.417551,63588.0,85.008673,27.194044,5544.49933,3325.450333,0.0,16.420256,Food,133.0,monthly,female
"Aldina,Jomvu",11.0,6.0,2017.0,1.45053,63588.0,85.008673,27.194044,5544.49933,3325.450333,0.0,16.420256,Clothing,133.0,monthly,female


### Building Preprocessing Pipeline

In [35]:
## Split the dataset in train and test:

X=kiva_regional.drop('hv271', axis=1)
y=kiva_regional['hv271']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [36]:
type(y_train)

pandas.core.series.Series

In [37]:
### 1st step - scale the target variable (y) manually:
### Take min max scaler in order to ensure that the MPI is between 0 and 1:
from sklearn.preprocessing import MinMaxScaler
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()
scaler = MinMaxScaler()
scaler.fit(y_train.reshape(-1,1))
y_train = scaler.transform(y_train.reshape(-1,1))
y_test = scaler.transform(y_test.reshape(-1,1))


In [38]:
### Create transformers based on the type of data (numeric vs categorical)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='median')),
    ('sclaer', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [45]:
### Splitting into numeric and categorical features:
numeric_features = kiva_regional.select_dtypes(include=['int64', 'float64']).drop(['hv271'],axis=1).columns
categorical_features = kiva_regional.select_dtypes(include=['object']).columns

### Initiate the preprocessing pipeline:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [46]:
categorical_features

Index(['sector', 'repayment_interval', 'gender'], dtype='object')

### Building a model selection pipeline:

#### Model selection

In [73]:
#Regression Models:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb


#Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error

#Building the pipeline:
## Adding all relevant regression models:
regressors = [
    LinearRegression(),
    DecisionTreeRegressor(max_depth=5, random_state=41),
    RandomForestRegressor(max_depth=5, random_state=41),
    linear_model.Lasso(alpha=1.0, random_state=41),
    Ridge(alpha=1.0),
    svm.SVR(), # Support Vector Regression
    AdaBoostRegressor(random_state=41),
    xgb.XGBRegressor(random_state=41) 
]

#Making predictions:
for regressor in regressors:
    pipe=Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', regressor)])
    pipe.fit(X_train, np.ravel(y_train))
    print('Model: %s' %regressor)
    y_pred=pipe.predict(X_test)
    print('----------------------')
    print('Results on Test-Data:')
    print('Mean squared error: %.5f' % mean_squared_error(y_test, y_pred), '\n'
      'R2: %.3f' % r2_score(y_test, y_pred))
    print('----------------------')
    print('Results on Train-Data (overfitting):')
    y_over=pipe.predict(X_train)
    print('Mean squared error: %.5f' % mean_squared_error(y_train, y_over), '\n'
      'R2: %.3f' % r2_score(y_train, y_over))
    print('')
    print('---------------------------------------------------')
    

Model: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
----------------------
Results on Test-Data:
Mean squared error: 0.04023 
R2: 0.128
----------------------
Results on Train-Data (overfitting):
Mean squared error: 0.04327 
R2: 0.224

---------------------------------------------------
Model: DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=41, splitter='best')
----------------------
Results on Test-Data:
Mean squared error: 0.03992 
R2: 0.135
----------------------
Results on Train-Data (overfitting):
Mean squared error: 0.02777 
R2: 0.502

---------------------------------------------------
Model: RandomForestRegressor(bootstrap

### Deeper Look: Regression Tree models:

#### Decision Tree Regression:

In [61]:
reg_dt = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', DecisionTreeRegressor(random_state=41))])
reg_dt.fit(X_train, y_train)
y_pred = reg_dt.predict(X_test)
print('Results on Test-Data:')
print('Mean squared error: %.5f' % mean_squared_error(y_test, y_pred), '\n'
      'R2: %.3f' % r2_score(y_test, y_pred))
print('Results on Train-Data (overfitting):')
print('Mean squared error: %.5f' % mean_squared_error(y_train, y_over), '\n'
      'R2: %.3f' % r2_score(y_train, y_over))

Results on Test-Data:
Mean squared error: 0.07639 
R2: -0.655
Results on Train-Data (overfitting):
Mean squared error: 0.01105 
R2: 0.802


#### Random Forest:

In [64]:
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor(random_state=41))])
rf.fit(X_train, np.ravel(y_train))
y_pred = rf.predict(X_test)
print('Results on Test-Data:')
print('Mean squared error: %.5f' % mean_squared_error(y_test, y_pred), '\n'
      'R2: %.3f' % r2_score(y_test, y_pred))
print('Results on Train-Data (overfitting):')
print('Mean squared error: %.5f' % mean_squared_error(y_train, y_over), '\n'
      'R2: %.3f' % r2_score(y_train, y_over))

Results on Test-Data:
Mean squared error: 0.03053 
R2: 0.339
Results on Train-Data (overfitting):
Mean squared error: 0.01105 
R2: 0.802


#### Ada Regression:

In [63]:
ada = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', AdaBoostRegressor(random_state=41)),
                    ])
ada.fit(X_train, np.ravel(y_train))
y_pred = ada.predict(X_test)
print('Results on Test-Data:')
print('Mean squared error: %.5f' % mean_squared_error(y_test, y_pred), '\n'
      'R2: %.3f' % r2_score(y_test, y_pred))
print('Results on Train-Data (overfitting):')
print('Mean squared error: %.5f' % mean_squared_error(y_train, y_over), '\n'
      'R2: %.3f' % r2_score(y_train, y_over))

Results on Test-Data:
Mean squared error: 0.03269 
R2: 0.292
Results on Train-Data (overfitting):
Mean squared error: 0.01105 
R2: 0.802


In [74]:
xgb = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb.XGBRegressor(random_state=41))])
                       
xgb.fit(X_train, np.ravel(y_train))
y_pred = xgb.predict(X_test)
print('Results on Test-Data:')
print('Mean squared error: %.5f' % mean_squared_error(y_test, y_pred), '\n'
      'R2: %.3f' % r2_score(y_test, y_pred))
print('Results on Train-Data (overfitting):')
print('Mean squared error: %.5f' % mean_squared_error(y_train, y_over), '\n'
      'R2: %.3f' % r2_score(y_train, y_over))                                        

Results on Test-Data:
Mean squared error: 0.03101 
R2: 0.328
Results on Train-Data (overfitting):
Mean squared error: 0.01105 
R2: 0.802


#### Hyperparameter tuning:

In [57]:
from sklearn.model_selection import GridSearchCV
### Decision Tree Regression
param_grid = { 
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
    'regressor__max_depth' : [1,2,3,4,5]}

CV = GridSearchCV(reg_dt, param_grid, n_jobs= 1)
                  
CV.fit(X_train, np.ravel(y_train))  
print(CV.best_params_)    
print(CV.best_score_)

{'regressor__max_depth': 1, 'regressor__max_features': 'auto'}
0.21320743232864786


In [66]:
### Random Forest
param_grid = { 
    'regressor__n_estimators': [75, 100, 150, 200, 250, 300, 350],
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
    'regressor__max_depth' : [8,9,10,11,12,13,14]}

CV = GridSearchCV(rf, param_grid, n_jobs= 1)
                  
CV.fit(X_train, np.ravel(y_train))  
print(CV.best_params_)    
print(CV.best_score_)

{'regressor__max_depth': 12, 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 250}
0.32793187943983126


In [60]:
### ADA Regression:
param_grid = {
 'n_estimators': [50, 100],
 'learning_rate' : [0.01,0.05,0.1,0.3,1],
 'loss' : ['linear', 'square', 'exponential']
 }

CV = GridSearchCV(ada, param_grid, n_jobs= 1)
                  
CV.fit(X_train, np.ravel(y_train))  
print(CV.best_params_)    
print(CV.best_score_)

ValueError: Invalid parameter learning_rate for estimator Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('sclaer',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  Index(['term_in_months', 'lender_count', 'year', 'funded_amount_usd',
       'avg_pre', 'avg_t...
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=True))],
                                                           verbose=False),
                                                  Index(['sector', 'repayment_interval', 'gender'], dtype='object'))],
                                   verbose=False)),
                ('regressor',
                 AdaBoostRegressor(base_estimator=None, learning_rate=1.0,
                                   loss='linear', n_estimators=50,
                                   random_state=41))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [77]:
### XGB Regression:
param_grid = {
 'n_estimators': [50, 100],
 'learning_rate' : [0.01,0.05,0.1,0.3,1],
 'loss' : ['linear', 'square', 'exponential']
 }

CV = GridSearchCV(xgb, param_grid, n_jobs= 1)
                  
CV.fit(X_train, np.ravel(y_train))  
print(CV.best_params_)    
print(CV.best_score_)

ValueError: Invalid parameter learning_rate for estimator Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('sclaer',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  Index(['term_in_months', 'lender_count', 'year', 'funded_amount_usd',
       'avg_pre', 'avg_t...
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
                              importance_type='gain', learning_rate=0.1,
                              max_delta_step=0, max_depth=3, min_child_weight=1,
                              missing=None, n_estimators=100, n_jobs=1,
                              nthread=None, objective='reg:linear',
                              random_state=41, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, seed=None, silent=None,
                              subsample=1, verbosity=1))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

**---------------------------------------------------------------------------------**

### Fitting Random Forest Regression:

In [73]:
## Fitting a Decision Tree Regressor:
from sklearn.ensemble import RandomForestRegressor
rf_reg = Pipeline(steps=[('preprocesser', preprocessor),
                     ('regression', RandomForestRegressor(max_depth=10, random_state=41))])

In [74]:
rf_reg.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params)


Pipeline(memory=None,
         steps=[('preprocesser',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [75]:
y_pred = rf_reg.predict(X_test)
print('Mean squared error: %.5f' % mean_squared_error(y_test, y_pred), '\n'
      'Median absolute error: %5f' %median_absolute_error(y_test,y_pred), '\n'
      'R2: %.3f' %r2_score(y_test, y_pred)#, '\n'
      #'Parameters: %' %dt_reg.get_params()
     )

Mean squared error: 0.03091 
Median absolute error: 0.134104 
R2: 0.331


In [150]:
##### Getting column names after preprocessing step (in pipeline):
onehot_columns = list(rf_reg.named_steps['preprocesser'].named_transformers_['cat'].named_steps['onehot'].get_feature_names(input_features=categorical_features))
numeric_features_list = list(numeric_features)
numeric_features_list.extend(onehot_columns)

In [151]:
##### Getting feature importance out of the Random Forest Regression:
importances = rf_reg['regression'].feature_importances_
importances = pd.DataFrame(importances,  index=numeric_features_list, columns=['importances'])
importances.sort_values(by='importances', ascending=True).plot(kind='barh', figsize=(20,len(importances)/2));

ValueError: Shape of passed values is (367, 1), indices imply (366, 1)