Prediction model using pipeline structure.

The data i'm using was cleaned using this notebook : https://github.com/RaphaelRoriz/Machine_learning/blob/master/housesPricesDataset/House%20Prices%20regressions.ipynb


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
houses_train = pd.read_pickle('houses_train_cleaned.pkl')
houses_train.drop('Id',axis =1,inplace =True )

In [3]:
columns_with_null_values = houses_train.columns[houses_train.isna().any()].tolist()

#getting the name and total number of null values per column
columns_with_null_values_sum = {}
for column_name in columns_with_null_values:
    columns_with_null_values_sum[column_name] = houses_train[column_name].isnull().sum()
    
columns_with_null_values_sum

{}

In [4]:
y = houses_train['SalePrice']
X = houses_train.drop(['SalePrice'],axis = 1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [6]:
pipe = Pipeline(steps=[
                       ("pca", PCA()),
                       ("regression",RandomForestRegressor(n_estimators=100))])

In [7]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regression', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_...tors=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [8]:
metrics = ["explained_variance",
            "neg_mean_squared_error",
            "r2"]

In [None]:
kfold = KFold(n_splits=3,shuffle=True,random_state=116)

In [9]:
cross_validation_scores = cross_validate(pipe, X, y, 
               return_train_score=False,
               scoring=metrics,cv=kfold)

In [10]:
#calculate rmse metric
cross_validation_scores['test_neg_mean_squared_error']  = np.sqrt(cross_validation_scores['test_neg_mean_squared_error'] * -1)

In [11]:
cross_validation_scores

{'fit_time': array([12.09866261, 11.51861215, 11.61113596, 11.62515855, 11.28894377]),
 'score_time': array([0.02954698, 0.02882552, 0.03389359, 0.02907586, 0.02895427]),
 'test_explained_variance': array([0.76424088, 0.79532368, 0.78854796, 0.81624767, 0.7261823 ]),
 'test_neg_mean_squared_error': array([36131.83894433, 36736.52491516, 40044.53770772, 30590.04284735,
        42973.95498107]),
 'test_r2': array([0.7640345 , 0.79460659, 0.78838642, 0.81528276, 0.72458478])}

In [13]:
parameters= {
    'regression__n_estimators': [100,500,900],
    'regression__max_depth':[3,5,10,15],
    'regression__criterion': ['mse'],
    'regression__bootstrap':['True','False'],
    'regression__min_samples_split':[2,3,4],
    'regression__min_samples_leaf':[2,3,4]
    }

In [14]:
grid = GridSearchCV(pipe, param_grid=parameters,scoring= make_scorer(mean_squared_error,greater_is_better=False), cv=5, n_jobs=2,verbose = 2)

In [15]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  3.9min


KeyboardInterrupt: 