In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [57]:
independent_feature=pd.read_csv('X_train.csv')
testing_dataset=pd.read_csv('X_test.csv')
dependent_feature=pd.read_csv('y_train.csv')

In [60]:
independent_feature.shape,dependent_feature.shape,testing_dataset.shape

((1460, 21), (1460, 1), (1459, 21))

# Model Selection

In [46]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(n_estimators=10)

In [47]:
from sklearn.model_selection import train_test_split

In [61]:
X_train, X_test, y_train, y_test=train_test_split(independent_feature,dependent_feature,test_size=0.3,random_state=20)

In [62]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((1022, 21), (1022, 1), (438, 21), (438, 1))

In [66]:
#Pipelines Creation

In [102]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [103]:
randomForest_pipe=Pipeline([('Scaler1',StandardScaler()),
                           ('PCA1',PCA(n_components=2)),
                           ('rf_Classifier',RandomForestRegressor())])

In [104]:
decision_tree_pipe=Pipeline([('Scaler2',StandardScaler()),
                           ('PCA2',PCA(n_components=2)),
                           ('dt_Classifier',DecisionTreeRegressor())])

In [105]:
linear_pipe=Pipeline([('Scaler3',StandardScaler()),
                      ('PCA3',PCA(n_components=2)),
                       ('l_classifier',LinearRegression())])

In [106]:
knn_pipe=Pipeline([('Scaler4',StandardScaler()),
                   ('PCA4',PCA(n_components=2)),
                   ('knn_classifier',KNeighborsRegressor())
    
])

In [107]:
Pipelines=[randomForest_pipe,decision_tree_pipe,linear_pipe,knn_pipe]

In [108]:
for pipe in Pipelines:
    pipe.fit(X_train,y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [109]:
pipe_dict={0:'RandomForestRegressor',1:'DecissionTreeRegressor',2:'LinearRegression',3:'KNNClassifier'}

In [110]:
for i,model in enumerate(Pipelines):
    print(f'{pipe_dict[i]} has {model.score(X_test,y_test)} % Accuracy')

RandomForestRegressor has 0.7664762158640515 % Accuracy
DecissionTreeRegressor has 0.6252693014590973 % Accuracy
LinearRegression has 0.7825848994595652 % Accuracy
KNNClassifier has 0.7823771798914596 % Accuracy


In [117]:
best_accuracy=0
best_model=''
for i,model in enumerate(Pipelines):
    score=model.score(X_test,y_test)
    if score>best_accuracy:
        best_accuracy=score
        best_model=pipe_dict[i]
print(f'Perfact Model for this dataset is {best_model} with highest accuracy among all i.e {best_accuracy}')

Perfact Model for this dataset is LinearRegression with highest accuracy among all i.e 0.7825848994595652


# Hyper Parameter Tunning with the help of GridSearchCV in Pipelines

In [129]:
pipe=Pipeline([('Regressor',RandomForestRegressor())])

In [130]:
from sklearn.model_selection import GridSearchCV

In [144]:
grid_params=[
            {'Regressor':[RandomForestRegressor()],
             'Regressor__n_estimators':np.linspace(10,1000,10),
             'Regressor__max_depth':[2,4,6,8,10,None],
             'Regressor__min_samples_split':[1,2,3,4,5],
             'Regressor__min_samples_leaf':[1,2,3,4,5],
             'Regressor__max_features':["auto", "sqrt", "log2"],
             'Regressor__criterion':['mse','mae'],
             'Regressor__max_samples':[2,4,6,8,10]
            },
            {'Regressor':[DecisionTreeRegressor()],
             'Regressor__splitter':["best", "random"],
             'Regressor__max_depth':[2,4,6,8,10,None],
             'Regressor__min_samples_split':[1,2,3,4,5],
             'Regressor__min_samples_leaf':[1,2,3,4,5],
             'Regressor__max_features':["auto", "sqrt", "log2"],
             'Regressor__criterion':["mse", "friedman_mse", "mae", "poisson"],
            },
    
            {'Regressor':[LinearRegression()], 
             'Regressor__fit_intercept':["True", "False"],
             'Regressor__normalize':["True", "False"],
             'Regressor__copy_X':["True", "False"],
             'Regressor__positive':["True", "False"]
            },
            
            {'Regressor':[KNeighborsRegressor()], 
             'Regressor__n_neighbors':np.linspace(10,400,10),
             'Regressor__weights':['uniform', 'distance'],
             'Regressor__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
             'Regressor__n_jobs':[-1,None]
            
            }
]

In [145]:
gridsearch=GridSearchCV(pipe,grid_params,cv=5,verbose=0,n_jobs=-1)
best_model=gridsearch.fit(X_train,y_train)



In [146]:
best_model.best_estimator_

Pipeline(steps=[('Regressor',
                 LinearRegression(copy_X='True', fit_intercept='True',
                                  normalize='True', positive='True'))])

In [147]:
best_model.score(X_test,y_test)

0.8432350375526405

In [142]:
#After applying HyperParameter Tunning, we are now getting 84.32% Accuracy with LinearRegressor, So For this dataset, LinearRegressor Perfrom very well.

In [None]:
# ----------------------The End-------------------------