## Load the train data and split it



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel
from sklearn import set_config
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
set_config(transform_output="pandas")
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Convert the Google Drive share link into a direct-download URL: the file id
# is the second-to-last path segment of the share link.
url = "https://drive.google.com/file/d/1c3XaTRGyN9Cy2Ffnb26DlfZowi3zxE2T/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id=' + url.rsplit('/', 2)[-2]
train_data = pd.read_csv(path)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts)
# to see which columns contain missing values before building the imputers.
train_data.info()

In [None]:
# The row identifier carries no predictive signal, so leave it out of the
# feature frame; train_data itself is not modified.
X = train_data.drop(columns="Id")

In [None]:
# Remove the target column from X in place and keep it as y, so X holds
# only features from here on.
y = X.pop("SalePrice")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Load the test data and pop the id column

In [None]:
# Same share-link -> direct-download conversion as for the training file:
# the file id is the second-to-last path segment of the share link.
url = "https://drive.google.com/file/d/1-fP60UWTyCb45r7vBrPOkU_LrE0TTn-Q/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id=' + url.rsplit('/', 2)[-2]
test_data = pd.read_csv(path)

In [None]:
# Take the Id column out of test_data in place: the model is trained without
# "Id", and the ids are needed later to label the submission rows.
id_column = test_data.pop('Id')

In [None]:
# Display the extracted ids as a quick sanity check.
id_column

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

## Create the pipeline

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Column names by dtype family, taken from the training split.
# NOTE(review): neither index is referenced again in this cell; the column
# transformer below selects columns by dtype on its own.
X_num = X_train.select_dtypes(include='number').columns
X_cat = X_train.select_dtypes(exclude='number').columns

# Numeric columns: fill missing values with the column mean.
numeric_pipe = make_pipeline(SimpleImputer(strategy='mean'))

# Object columns: fill missing values with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored instead of
# raising, and the output is dense so it can be returned as a DataFrame.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

# Route each dtype family to its own sub-pipeline.
preprocessor = make_column_transformer(
    (numeric_pipe, make_column_selector(dtype_include='number')),
    (categoric_pipe, make_column_selector(dtype_include='object')),
)

# make_pipeline names the steps after their lowercased class names; the
# "gradientboostingregressor__..." grid keys rely on that naming.
my_pipeline = make_pipeline(
    preprocessor,
    StandardScaler(),
    GradientBoostingRegressor(random_state=42),
)

In [None]:
# Display the assembled pipeline to check the step order.
my_pipeline

## Optimise the Gradient Boosting model parameters (GridSearchCV)

In [None]:
# Hyper-parameter candidates for the boosting step. Keys use the
# "<step name>__<parameter>" convention to address the pipeline's final step.
# 4 depths x 4 split sizes x 3 leaf sizes = 48 combinations.
grad_param_grid = {
    "gradientboostingregressor__max_depth": range(4, 12, 2),
    "gradientboostingregressor__min_samples_split": range(4, 12, 2),
    "gradientboostingregressor__min_samples_leaf": range(1, 4),
}

# Exhaustive search over the grid with 5-fold cross-validation, using every
# available core. No scoring is given, so the estimator's default score is used.
grad_search = GridSearchCV(
    estimator=my_pipeline,
    param_grid=grad_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)


## Train the model

In [None]:
# List the scorer names scikit-learn knows about, i.e. the strings that could
# be passed as `scoring=` to GridSearchCV (the search above passes none).
import sklearn.metrics
sklearn.metrics.get_scorer_names()

In [None]:
# Run the grid search on the training split only, so the held-out split stays
# unseen until evaluation.
grad_search.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
# Keep the pipeline with the best cross-validated parameter combination and
# display it to inspect the chosen settings.
my_model = grad_search.best_estimator_
my_model

## Evaluate the model

In [None]:
# Predict on both splits so train and hold-out metrics can be compared
# (a large gap between them points to overfitting).
train_pred = my_model.predict(X_train)
test_pred = my_model.predict(X_test)

In [None]:
# Root-mean-squared metrics are computed as an explicit square root of the
# squared metric. The `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so passing it raises a TypeError on
# current releases; `** 0.5` gives the same value on every version.
train_rmse = mean_squared_error(y_train, train_pred) ** 0.5
test_rmse = mean_squared_error(y_test, test_pred) ** 0.5

# Coefficient of determination on each split.
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

# Root mean squared logarithmic error (printed below under the "RSML" label).
train_rmsl = mean_squared_log_error(y_train, train_pred) ** 0.5
test_rmsl = mean_squared_log_error(y_test, test_pred) ** 0.5

# Mean absolute percentage error, as a fraction (printed under the "MAP" label).
train_map = mean_absolute_percentage_error(y_train, train_pred)
test_map = mean_absolute_percentage_error(y_test, test_pred)


print(f'Model Train MAP: {train_map:.2f}')
print(f'Model Test MAP : {test_map:.2f}\n')
print(f'Model Train r2: {train_r2:.2f}')
print(f'Model Test r2 : {test_r2:.2f}\n')
print(f'Model Train RMSE: {train_rmse:.2f}')
print(f'Model Test RMSE : {test_rmse:.2f}\n')
print(f'Model Train RSML: {train_rmsl:.2f}')
print(f'Model Test RSML : {test_rmsl:.2f}\n')

Model Train MAP: 0.05
Model Test MAP : 0.08

Model Train r2: 0.98
Model Test r2 : 0.91

Model Train RMSE: 10362.60
Model Test RMSE : 23925.94

Model Train RSML: 0.07
Model Test RSML : 0.11



## Retrain the model on the full training dataset

In [None]:
# Refit the selected pipeline on every labelled row (X and y before the
# train/hold-out split) so the final model uses all available data.
# Note: this refits my_model in place, so the metrics computed earlier
# describe the previous fit, not this one.
my_model.fit(X,y)

## Make predictions with the test dataset and prepare the submission file

In [None]:
# Predict sale prices for the unlabelled test set (its Id column was removed
# earlier, so its features match what the model was trained on).
predictions = my_model.predict(test_data)

In [None]:
# Pair each saved test Id with its predicted price; these two columns are the
# submission table.
results = pd.DataFrame({'Id': id_column, 'SalePrice': predictions})

In [None]:
# Display the submission table for a visual check before exporting it.
results

Unnamed: 0,Id,SalePrice
0,1461,120935.485171
1,1462,157830.064253
2,1463,179486.134440
3,1464,189963.412908
4,1465,188433.018209
...,...,...
1454,2915,82501.579088
1455,2916,81473.027845
1456,2917,160666.486322
1457,2918,122019.053560


## Submission File

In [None]:
#from google.colab import files
#results.to_csv('submission_priyanka_grad.csv',index=False)
#files.download('submission_priyanka_grad.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>