<a href="https://colab.research.google.com/github/TheHouseOfVermeulens/wernervermeulen.github.io/blob/master/housing_prices_competition_kaggle_learners_top_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This notebook was built in collaboration with team members who received a top 10% ranking on Kaggle.

# Regression

Basic Libraries

In [0]:
#Importing the libraries
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA, TruncatedSVD

from math import sqrt
import warnings
# Silence every warning for the rest of the session (no category or module filter is given),
# so library warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

Loading Data in Kaggle

In [0]:
#Reading data files
import os

# Expose each CSV of the course dataset at the flat ../input/ path used below.
# Every file gets its own guard, so an already-present train.csv cannot cause the
# remaining links to be skipped, and sample_submission.csv (read below) is linked too.
for csv_name in ("train.csv", "test.csv", "sample_submission.csv"):
    link_path = "../input/" + csv_name
    source_path = "../input/home-data-for-ml-course/" + csv_name
    # lexists also detects a dangling link, which os.symlink would refuse to overwrite;
    # the source check avoids creating a link that points at nothing.
    if not os.path.lexists(link_path) and os.path.exists(source_path):
        os.symlink(source_path, link_path)

# Labelled training data.
iowa_file_path = '../input/train.csv'
home_data = pd.read_csv(iowa_file_path)
# Unlabelled rows to score for the submission.
test_data_path = '../input/test.csv'
test_data = pd.read_csv(test_data_path)
# Sample submission file (its Id column is used when writing results).
sample_path = '../input/sample_submission.csv'
sample = pd.read_csv(sample_path)
RANDOM_STATE=0  #shared seed for the train-test split and for the model

Feature Engineering

In [0]:
#Remove the features flagged as having a high share of missing values
sparse_columns = ['MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'Alley']
train_clean = home_data.drop(sparse_columns, axis=1)

In [0]:
#Build the feature matrix and the label frame
# Features: everything left in train_clean except the target column.
# NOTE(review): only SalePrice is removed here, so any identifier column still present
# in train_clean stays in X as a feature -- confirm that is intended.
X = train_clean.drop('SalePrice', axis=1)
# The list selector keeps the target as a single-column DataFrame rather than a Series.
y = home_data.loc[:, ['SalePrice']]

In [0]:
#Hold out 15% of the rows to evaluate out-of-sample performance
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=RANDOM_STATE,
)
# Report the shape of each partition on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1241, 75) (219, 75) (1241, 1) (219, 1)


In [0]:
#Separation of numerical and categorical features
# Column labels are taken from the training partition only.
numeric_cols = X_train.select_dtypes(include='number').columns
other_cols = X_train.select_dtypes(exclude='number').columns
num_feat = numeric_cols.to_list()
cat_feat = other_cols.to_list()

Pipeline

In [0]:
# Numerical columns: fill missing values with the column median, then standardise.
num_pipe=Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical columns: fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
cat_pipe=Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
# Route each column group through its own pipeline; columns in neither list are dropped.
ct=ColumnTransformer(remainder='drop',
                    transformers=[
                        ('numerical', num_pipe, num_feat),
                        ('categorical', cat_pipe, cat_feat)
                    ])
# Full model: preprocessing followed by gradient boosting.
# The regressor is seeded with RANDOM_STATE so repeated runs produce the same fit.
model=Pipeline([
    ('transformer', ct),
    ('predictor', GradientBoostingRegressor(random_state=RANDOM_STATE))
])

In [0]:
# y_train is a single-column frame; flatten it to the 1-D target the regressor expects
# instead of relying on an implicit conversion whose warning is silenced in this notebook.
model.fit(X_train, np.ravel(y_train));

Prediction

In [0]:
# Score both partitions with the fitted pipeline: training rows first, then the hold-out rows.
y_pred_train, y_pred_test = (model.predict(part) for part in (X_train, X_test))

Evaluation of Result

In [0]:
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
print('In sample MAE error: ', round(mean_absolute_error(y_train, y_pred_train)))
print('Out sample MAE error: ', round(mean_absolute_error(y_test, y_pred_test)))

In sample MAE error:  10313.0
Out sample MAE error:  15660.0


In [0]:
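# NOTE(review): disabled refit on the full labelled data (X, y). While this stays
# commented out, the submission below comes from the model fitted on X_train only.
#model.fit(X,y);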

Result Submission

In [0]:
def submission(test, model, path='/kaggle/working/result.csv'):
    """Predict ``SalePrice`` for ``test`` and write an ``Id,SalePrice`` CSV.

    Parameters
    ----------
    test : pandas.DataFrame
        Rows to score. When it has an ``Id`` column, those ids are written
        next to the predictions so both always come from the same rows.
    model : object
        Fitted estimator exposing ``predict(test)``.
    path : str, optional
        Destination of the CSV file. Defaults to the Kaggle working directory,
        which is where the original hard-coded path pointed.

    Returns
    -------
    None
        The result is only written to ``path`` (without the index column).
    """
    # Flatten so that estimators returning an (n, 1) array still yield one column.
    y_pred = np.ravel(model.predict(test))
    # Prefer the ids of the frame actually scored; fall back to the notebook-level
    # sample submission only when the frame carries no Id column.
    ids = test['Id'] if 'Id' in test.columns else sample.Id
    result = pd.DataFrame({'Id': np.asarray(ids), 'SalePrice': y_pred})
    result.to_csv(path, index=False)

In [0]:
# Write the submission file for the competition test rows using the fitted pipeline.
submission(test=test_data, model=model)

In [0]:
# Read the written submission back and preview its first five rows.
check = pd.read_csv('/kaggle/working/result.csv')
check.head(n=5)

Unnamed: 0,Id,SalePrice
0,1461,117347.030253
1,1462,160998.107331
2,1463,184621.841886
3,1464,185997.375609
4,1465,200321.890782
