In this notebook I'll use scikit-learn's `Pipeline` to develop the ML project.

The data I'm using was cleaned in this notebook: https://github.com/RaphaelRoriz/Machine_learning/blob/master/housesPricesDataset/House%20Prices%20regressions.ipynb


In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor as xgbr
from sklearn.svm import SVR
from sklearn.linear_model import Lasso

%matplotlib inline

# Functions

In [2]:
def scores_train(model, model_predictions_train):
    """Print the MSE and the ``model.score`` value for the training split.

    Reads the notebook-level globals ``X_train`` and ``y_train``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``score(X, y)``.
    model_predictions_train
        Predictions to compare against ``y_train``.
    """
    train_mse = metrics.mean_squared_error(y_train, model_predictions_train)
    print('MSE:', train_mse)
    train_score = model.score(X_train, y_train)
    print('R2 score:', train_score)

In [3]:
def scores_test(model, model_predictions_test):
    """Print the MSE and the ``model.score`` value for the held-out split.

    Reads the notebook-level globals ``X_test`` and ``y_test``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``score(X, y)``.
    model_predictions_test
        Predictions to compare against ``y_test``.
    """
    test_mse = metrics.mean_squared_error(y_test, model_predictions_test)
    print('MSE:', test_mse)
    test_score = model.score(X_test, y_test)
    print('R2 score:', test_score)

In [4]:
def plot_train_test_predictions(model_name, model_predictions_train, model_predictions_test):
    """Scatter predicted against real values for the train and test splits.

    Reads the notebook-level globals ``y_train`` and ``y_test``.

    Parameters
    ----------
    model_name
        Text used as the plot title.
    model_predictions_train
        Predictions plotted against ``y_train`` (blue points).
    model_predictions_test
        Predictions plotted against ``y_test`` (black points).
    """
    # (predictions, real values, point colour, legend label) for each split.
    splits = (
        (model_predictions_train, y_train, "blue", "Training data"),
        (model_predictions_test, y_test, "black", "Test data"),
    )
    for predicted, real, colour, caption in splits:
        plt.scatter(predicted, real, c=colour, label=caption)

    plt.title(model_name)
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")

    # Identity line: points lying on it are predicted exactly.
    diagonal = [10.5, 13.5]
    plt.plot(diagonal, diagonal, c="red")
    plt.show()

In [5]:
def _plot_real_and_predicted(real_values, predictions, real_label, predictions_label, title):
    """Draw one figure with real and predicted values plotted by sample position."""
    plt.figure()
    # Strip any pandas index so both series share the same 0..n-1 x axis. A
    # Series coming out of train_test_split keeps its original (shuffled) row
    # labels, and plt.plot would use those labels as x positions, so the real
    # values would not line up with the predictions.
    real_values = np.asarray(real_values)
    predictions = np.asarray(predictions)
    plt.plot(real_values, c='green', label=real_label, marker='o', linestyle='None')
    plt.plot(predictions, c='red', label=predictions_label, marker='o', linestyle='None')
    plt.title(title)
    plt.ylabel('SalePrice')
    plt.legend(loc="upper right")
    # No identity diagonal here: it only makes sense on a predicted-vs-real
    # scatter, not on a plot whose x axis is the sample position.
    # plt.show() matches plot_train_test_predictions and avoids the warning
    # Figure.show() emits on non-GUI backends.
    plt.show()


def plot_y_predictions(model_name,model_predictions_train,model_predictions_test):
    """Plot real and predicted SalePrice per sample, first for test, then train.

    Reads the notebook-level globals ``y_test`` and ``y_train``. Two figures
    are produced; in each, real values are green and predictions are red, and
    the i-th real value shares its x position with the i-th prediction.

    Parameters
    ----------
    model_name
        Text used as the prefix of both plot titles.
    model_predictions_train
        Predictions aligned row-by-row with ``y_train``.
    model_predictions_test
        Predictions aligned row-by-row with ``y_test``.
    """
    _plot_real_and_predicted(
        y_test, model_predictions_test,
        'Y test', 'Test predictions', model_name + '(test)',
    )
    _plot_real_and_predicted(
        y_train, model_predictions_train,
        'Y train', 'Train predictions', model_name + '(Train)',
    )

# Importing data

In [6]:
# Load the cleaned train and test frames from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
houses_train = pd.read_pickle('houses_train_cleaned.pkl')
houses_test = pd.read_pickle('houses_test_cleaned.pkl')

# Drop the 'Id' column from the training frame.
houses_train = houses_train.drop(columns='Id')

# Feature selection

I'll select the features based on their correlation with the target (`SalePrice`).

In [7]:
# Pairwise correlation matrix of the training columns (DataFrame.corr defaults to Pearson).
houses_train_corr = houses_train.corr()

In [8]:
# Correlation of every column with the target, SalePrice.
salePrice_corr = houses_train_corr['SalePrice']

In [9]:
# Collect (column name, correlation) pairs for the columns that are strongly
# correlated with SalePrice in either direction: corr >= 0.5 or corr <= -0.5.
# 0.5 is the threshold that gave the best results.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
relevant_salePrice_corr = [
    (columnName, columnData)
    for columnName, columnData in houses_train_corr['SalePrice'].items()
    if abs(columnData) >= 0.5
]

In [10]:
# Display the (column, correlation) pairs that passed the threshold.
relevant_salePrice_corr

[('OverallQual', 0.7909816005838047),
 ('YearBuilt', 0.5228973328794967),
 ('YearRemodAdd', 0.5071009671113867),
 ('TotalBsmtSF', 0.6135805515591944),
 ('1stFlrSF', 0.6058521846919166),
 ('GrLivArea', 0.7086244776126511),
 ('FullBath', 0.5606637627484452),
 ('TotRmsAbvGrd', 0.5337231555820238),
 ('GarageCars', 0.640409197258349),
 ('GarageArea', 0.6234314389183598),
 ('SalePrice', 1.0),
 ('TotalConstructedSF', 0.809741322904717),
 ('TotalBath', 0.6317310679319897),
 ('ExterQual_TA', -0.589043523409763),
 ('KitchenQual_TA', -0.5192978536548846)]

In [11]:
# GarageArea will not be used because it may be too strongly correlated with
# GarageCars; display their pairwise correlation to check.
houses_train_corr['GarageArea']['GarageCars']

0.8824754142814603

In [12]:
# Keep only the column names from the (name, correlation) pairs.
relevant_columns_names_train = [name for name, _ in relevant_salePrice_corr]

In [13]:
# Drop GarageArea from the selected features (in place; list.remove returns None).
relevant_columns_names_train.remove('GarageArea')

In [14]:
# Keep only the selected columns; SalePrice is still in the list at this point.
houses_train  = houses_train[relevant_columns_names_train]

# Test dataset for Kaggle submissions

In [15]:
# list.remove() mutates the list in place and returns None, so its result must
# not be assigned (the test-column variable used to end up as None).
# The target is still removed from the shared list, as before, but behind a
# guard so that re-running this cell does not raise ValueError.
if 'SalePrice' in relevant_columns_names_train:
    relevant_columns_names_train.remove('SalePrice')

# Explicit copy of the feature names used to subset the test frame.
relevant_columns_names_test = list(relevant_columns_names_train)
houses_test = houses_test[relevant_columns_names_test]

# Train and test sets split

In [16]:
# Separate the feature matrix from the target column.
X = houses_train.drop(columns=['SalePrice'])
y = houses_train['SalePrice']

In [17]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## KFold

In [41]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed before splitting.
kfold = KFold(n_splits=5,shuffle=True,random_state=116)

# Evaluating models

## 1)Linear regression

In [19]:
# Baseline pipeline: a single LinearRegression step on the features as given.
linearRegressionPipeline = Pipeline([
    ('model', LinearRegression()),
])

## 2)Linear regression with StandardScaler and PolynomialFeatures

In [32]:
# Standardize the features, expand them to degree-2 polynomial terms, then fit
# a linear regression on the expanded set.
linearRegressionPipeline2 = Pipeline([
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
])

## 3)Ridge Regression

In [53]:
# Same preprocessing as the polynomial linear model, with a Ridge regressor
# (alpha=10, intercept fitted) as the final step.
ridgeRegressionPipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=10, fit_intercept=True)),
])

## 4)Lasso Regression

In [64]:
# Standardize the features, then fit a Lasso regressor with its default settings.
LassoRegressionPipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('model', Lasso()),
])

## 5)Support Vector Regression

In [63]:
# Standardize the features, then fit a support vector regressor with its default settings.
SVRegressionPipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('model', SVR()),
])

## 6)Random Forest Regression

In [66]:
# Random forest on the features as given; no scaling step is included.
RFRegressionPipeline = Pipeline([
    ('model', RandomForestRegressor()),
])

## 7)XGBoost Regression

In [65]:
# Pipeline steps must be estimator instances, not classes. `xgbr` is the
# XGBRegressor class itself, so it has to be called to create an estimator,
# as is done for the model step of every other pipeline in this notebook.
XGBRegressionPipeline = Pipeline(steps=[
    ('model', xgbr()),
])