In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import math

Import the libraries and load the datasets.

In [4]:
# Read the competition's training and test CSVs (train first, test second).
train_data, test_data = map(
    pd.read_csv,
    (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    ),
)

In [5]:
# Preview the first rows of the training frame.
train_data.head()

In [6]:
# Preview the first rows of the test frame.
test_data.head()

In [7]:
# Summary statistics for the training frame.
train_data.describe()

In [8]:
# Column dtypes and non-null counts for the training frame.
train_data.info()

In [10]:
# Every column except the target is a candidate feature.
all_features_except_target = [
    column for column in train_data.columns if column != 'SalePrice'
]
features_X = train_data[all_features_except_target]
y = train_data['SalePrice']

# Work on a copy so later feature edits never touch the raw training frame.
train_data_copy = features_X.copy()
train_data_copy.head()

Before starting any preprocessing, I would like to analyse the data further. Let's explore it through visualization.

In [11]:
# Plot the distribution of the target variable.
sns.displot(train_data['SalePrice'])

In [12]:
# Summary statistics of the target variable.
train_data['SalePrice'].describe()

From the above plot and the describe() output, we can conclude that the distribution is right-skewed (it has a long tail toward high prices) and that most sale prices lie between 100k and 300k on the x-axis.

As there are many features, a correlation map is a convenient way to learn how each variable correlates with the target and with the other variables.


In [13]:
# Pairwise correlations between the numeric columns only. The frame also holds
# object-typed columns, and DataFrame.corr() raises on those in pandas >= 2.0;
# select_dtypes works on every pandas version.
corr_mat = train_data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw onto the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr_mat, vmax=0.8, square=True, ax=ax)

The plot above shows the relationship between each pair of variables: some are negatively correlated, some positively, and some not at all. As we are mainly interested in the positively correlated features, let's filter the map to display only those.

In [14]:
# Correlate numeric columns only; DataFrame.corr() raises on object columns in pandas >= 2.0.
positive_correlations = train_data.select_dtypes(include='number').corr()

# Keep the columns whose correlation with the target exceeds 0.2.
# 'SalePrice' itself (correlation 1.0) is part of the result.
saleprice_correlation = positive_correlations['SalePrice']
positive_feature_columns = saleprice_correlation[saleprice_correlation > 0.2].index.values
positive_feature_columns

In [15]:
# Annotated heatmap restricted to the columns selected in positive_feature_columns.
f, ax = plt.subplots(figsize=(16, 16))
positive_corr_mat = train_data.loc[:, positive_feature_columns].corr()
sns.heatmap(positive_corr_mat, vmax=0.8, square=True, annot=True, ax=ax)

From the above heatmap, the following observations are made:

* 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars' and 'GarageArea' have a strong correlation with 'SalePrice'.
* 'GarageCars' and 'GarageArea' are mutually dependent.
* The same mutual dependence applies to 'TotalBsmtSF' and '1stFlrSF'. We will take only 'TotalBsmtSF' in our feature engineering.
* The same holds for 'TotRmsAbvGrd' and 'GrLivArea'; we will take only 'GrLivArea'.

Let's check for missing data.

In [16]:
# Build the null mask once, then derive the count and share of missing values per column.
null_mask = train_data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

# Side-by-side table; show the 20 columns with the most missing values.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Let's now handle the missing values, separating the numerical columns from the categorical ones.

In [17]:
# 'GarageYrBlt' has missing values. Filling them with the house's 'YearBuilt'
# is a more sensible stand-in than 0.
garage_year_missing = train_data['GarageYrBlt'].isnull()
train_data_copy.loc[garage_year_missing, 'GarageYrBlt'] = (
    train_data_copy.loc[garage_year_missing, 'YearBuilt']
)


In [18]:
# Numerical columns: those stored as int64 or float64.
feature_numerical_columns = [
    name
    for name, dtype in train_data_copy.dtypes.items()
    if dtype in ['int64', 'float64']
]

In [19]:
# Categorical columns: object/bool columns with fewer than 50 distinct values,
# which keeps the one-hot encoding small.
feature_categorical_cols = [
    name
    for name, dtype in train_data_copy.dtypes.items()
    if dtype in ['object', 'bool'] and train_data_copy[name].nunique() < 50
]

Let's replace the null values in the numerical columns with the mean or median using an imputer. Before that, let's add some derived columns.

In [20]:
ENGINEERED_FEATURES = ['years_since_update', 'garage_value']


def add_engineered_features(frame):
    """Return a copy of ``frame`` with the engineered columns appended.

    years_since_update: 'YearRemodAdd' minus 'YearBuilt'.
    garage_value:       'YearBuilt' times 'GarageCars'.

    The result depends only on the raw columns, so re-running the cell is idempotent.
    """
    return frame.assign(
        years_since_update=frame['YearRemodAdd'] - frame['YearBuilt'],
        garage_value=frame['YearBuilt'] * frame['GarageCars'],
    )


train_data_copy = add_engineered_features(train_data_copy)

# feature_numerical_columns was computed before these columns existed, and the
# ColumnTransformer drops every column it is not told about (remainder='drop').
# Register the new columns, or the models never see them.
feature_numerical_columns = feature_numerical_columns + [
    name for name in ENGINEERED_FEATURES if name not in feature_numerical_columns
]

# The preprocessor now selects these columns by name, so the test frame needs
# them too; otherwise predicting on it would fail with missing columns.
test_data = add_engineered_features(test_data)

In [21]:
# Numerical columns: fill missing entries with the column mean.
feature_numerical_transformer = SimpleImputer(strategy='mean')

For the categorical columns, let's replace NaN values with the most frequent value using an imputer, then one-hot encode them.

In [22]:
# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
feature_categorical_transformer = Pipeline(steps=categorical_steps)

In [34]:
# Route each column group through its own transformer. Columns listed in
# neither group are dropped (ColumnTransformer's default remainder='drop').
feature_preprocessor = ColumnTransformer([
    ('num', feature_numerical_transformer, feature_numerical_columns),
    ('cat', feature_categorical_transformer, feature_categorical_cols),
])

Let's model our data.

In [39]:
# Linear Regression: ordinary least squares on the preprocessed features.
feature_classifier = Pipeline(steps=[('feature_preprocessor', feature_preprocessor),
                      ('feature_model', LinearRegression(fit_intercept=True)),
                     ])

# Hold out a validation split. Despite its name, feature_y_test holds the
# validation targets that pair with feature_X_valid.
feature_X_train, feature_X_valid, feature_y_train, feature_y_test = train_test_split(train_data_copy, y, random_state=0)

feature_classifier.fit(feature_X_train, feature_y_train)
feature_predictions = feature_classifier.predict(feature_X_valid)

print('Root Mean Square Error is ', np.sqrt(mean_squared_error(feature_y_test, feature_predictions)))

# mean_squared_log_error returns the *squared* log error, so take the square
# root to report RMSLE, as the other model cells do.
# It raises ValueError if any prediction is negative.
print('RMSLE using scikit-learn function', np.sqrt(mean_squared_log_error(feature_y_test, feature_predictions)))


In [40]:
# Gradient-boosted trees. Hyper-parameters are collected in one mapping so
# they can be read and tuned at a glance.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.6,
    gamma=0.0,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.02,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=0.0,
    monotone_constraints='()',
    n_estimators=1250,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.8,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)
feature_model = XGBRegressor(**xgb_params)

feature_classifier = Pipeline([
    ('feature_preprocessor', feature_preprocessor),
    ('feature_model', feature_model),
])

# Same deterministic split as the other model cells (random_state=0).
(feature_X_train, feature_X_valid,
 feature_y_train, feature_y_test) = train_test_split(train_data_copy, y, random_state=0)

# The 'feature_model__' prefix routes verbose=False to the XGBRegressor step's fit().
feature_classifier.fit(feature_X_train, feature_y_train, feature_model__verbose=False)
feature_predictions = feature_classifier.predict(feature_X_valid)

# RMSLE: square root of scikit-learn's mean squared log error.
rmsle = np.sqrt(mean_squared_log_error(feature_y_test, feature_predictions))
print('RMSLE using scikit-learn function', rmsle)

rmse = np.sqrt(mean_squared_error(feature_y_test, feature_predictions))
print('Root Mean Square Error is ', rmse)


In [41]:
# Ridge regression (alpha=0.5) behind the shared preprocessing step.
ridge = Ridge(alpha=0.5)
feature_classifier = Pipeline([
    ('feature_preprocessor', feature_preprocessor),
    ('feature_model', ridge),
])

# Same deterministic split as the other model cells (random_state=0).
(feature_X_train, feature_X_valid,
 feature_y_train, feature_y_test) = train_test_split(train_data_copy, y, random_state=0)

feature_classifier.fit(feature_X_train, feature_y_train)
feature_predictions = feature_classifier.predict(feature_X_valid)

rmse = np.sqrt(mean_squared_error(feature_y_test, feature_predictions))
print('Root Mean Square Error is ', rmse)

# RMSLE: square root of scikit-learn's mean squared log error.
rmsle = np.sqrt(mean_squared_log_error(feature_y_test, feature_predictions))
print('RMSLE using scikit-learn function', rmsle)

In [42]:
# Lasso regression (alpha=0.5) behind the shared preprocessing step.
lasso = Lasso(alpha=0.5)
feature_classifier = Pipeline([
    ('feature_preprocessor', feature_preprocessor),
    ('feature_model', lasso),
])

# Same deterministic split as the other model cells (random_state=0).
(feature_X_train, feature_X_valid,
 feature_y_train, feature_y_test) = train_test_split(train_data_copy, y, random_state=0)

feature_classifier.fit(feature_X_train, feature_y_train)
feature_predictions = feature_classifier.predict(feature_X_valid)

rmse = np.sqrt(mean_squared_error(feature_y_test, feature_predictions))
print('Root Mean Square Error is ', rmse)

# RMSLE: square root of scikit-learn's mean squared log error.
rmsle = np.sqrt(mean_squared_log_error(feature_y_test, feature_predictions))
print('RMSLE using scikit-learn function', rmsle)

In [43]:

# Random forest (max_depth=50, fixed seed) behind the shared preprocessing step.
feature_classifier = Pipeline([
    ('feature_preprocessor', feature_preprocessor),
    ('feature_model', RandomForestRegressor(max_depth=50, random_state=2)),
])

# Same deterministic split as the other model cells (random_state=0).
(feature_X_train, feature_X_valid,
 feature_y_train, feature_y_test) = train_test_split(train_data_copy, y, random_state=0)

feature_classifier.fit(feature_X_train, feature_y_train)
feature_predictions = feature_classifier.predict(feature_X_valid)

rmse = np.sqrt(mean_squared_error(feature_y_test, feature_predictions))
print('Root Mean Square Error is ', rmse)

# RMSLE: square root of scikit-learn's mean squared log error.
rmsle = np.sqrt(mean_squared_log_error(feature_y_test, feature_predictions))
print('RMSLE using scikit-learn function', rmsle)

In [44]:
# Build the submission file from a freshly loaded test set.
X_test_original = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Prepare the test frame the same way as the training frame: fill missing
# garage years with the build year, then add the engineered columns.
test_garage_year_missing = X_test_original['GarageYrBlt'].isnull()
X_test_original.loc[test_garage_year_missing, 'GarageYrBlt'] = (
    X_test_original.loc[test_garage_year_missing, 'YearBuilt']
)
X_test_original['years_since_update'] = X_test_original['YearRemodAdd'] - X_test_original['YearBuilt']
X_test_original['garage_value'] = X_test_original['YearBuilt'] * X_test_original['GarageCars']

# feature_classifier is whichever pipeline the most recently executed model
# cell assigned. Refit it on the full training data before predicting.
feature_classifier.fit(train_data_copy, y)

# Predict on the prepared frame. The original predicted on the raw test_data,
# which left the preparation above unused.
model_predictions = feature_classifier.predict(X_test_original)
output = pd.DataFrame({'Id': X_test_original.Id,
                       'SalePrice': model_predictions})
output.to_csv('submission.csv', index=False)