In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn import ensemble

In [None]:
# Read the training and test splits from the notebook's input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Print the column summary that DataFrame.info() produces for the training frame.
train.info()

In [None]:
# Print the (rows, columns) shape of the training and test frames side by side.
print(train.shape, test.shape)

In [None]:
# Keep a handle on the Id columns before they are dropped from the frames;
# testID is used at the end to label the submission rows.
trainID, testID = train['Id'], test['Id']

In [None]:
# Remove the Id column from both frames (it was saved separately beforehand).
train = train.drop(columns="Id")
test = test.drop(columns="Id")

In [None]:
# SalePrice is the target variable, so it is removed from the training rows
# before train and test features are stacked into one frame for joint
# preprocessing.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it the
# row labels of train and test overlap, which makes any label-based lookup or
# alignment ambiguous. Training rows come first, so positional slicing still
# recovers them.
train_features = train.drop(columns='SalePrice')
data = pd.concat([train_features, test], axis=0, ignore_index=True)

In [None]:
# Number of missing entries per column, largest first.
missing_counts = data.isnull().sum()
missing_counts.sort_values(ascending=False)

In [None]:
# Correlation matrix of the numeric features.
# Non-numeric columns are excluded explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops them silently and raises on string data instead.
corrmat = data.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on pyplot's current axes.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
ax.set_title('Feature correlation matrix');

In [None]:
# Fill missing LotFrontage values with the median LotFrontage of the
# neighborhood each row belongs to.
lot_frontage_by_hood = data.groupby("Neighborhood")["LotFrontage"]
data["LotFrontage"] = lot_frontage_by_hood.transform(
    lambda frontage: frontage.fillna(frontage.median())
)


In [None]:
# GarageType, GarageFinish, GarageQual and GarageCond: missing entries become
# the string 'None'.
garage_cat_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
data = data.fillna({name: 'None' for name in garage_cat_cols})


In [None]:
# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath:
# a missing value most likely means there is no basement, so use zero.
basement_numeric_cols = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
]
data = data.fillna(dict.fromkeys(basement_numeric_cols, 0))

In [None]:
# GarageYrBlt, GarageArea and GarageCars: missing entries become 0
# (no garage means no cars in it).
garage_numeric_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars']
data[garage_numeric_cols] = data[garage_numeric_cols].fillna(0)

In [None]:
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2: for these
# categorical basement features NaN means there is no basement, so use 'None'.
basement_cat_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
data = data.assign(**{name: data[name].fillna('None') for name in basement_cat_cols})

In [None]:
# MasVnrType / MasVnrArea: a missing value most likely means the house has no
# masonry veneer, so the type becomes "None" and the area becomes 0.
masonry_defaults = {"MasVnrType": "None", "MasVnrArea": 0}
data = data.fillna(masonry_defaults)

In [None]:
# MSZoning (the general zoning classification): fill missing entries with the
# most frequent value, which the original author notes is 'RL' by a wide margin.
most_common_zone = data['MSZoning'].mode()[0]
data['MSZoning'] = data['MSZoning'].fillna(most_common_zone)

In [None]:
# Electrical: fill missing entries with the most frequent value.
# The mode lookup must be the argument of fillna(); calling .mode()[0] on the
# filled column instead would overwrite every row with one scalar.
data['Electrical'] = data['Electrical'].fillna(data['Electrical'].mode()[0])

# Utilities is not needed as a feature.
data = data.drop(['Utilities'], axis=1)

# All fills below assign the result back to the column. fillna(inplace=True) on
# a column selection is chained assignment: it may act on a temporary copy and
# leaves `data` unchanged under pandas copy-on-write.

# Categorical columns where a missing entry is replaced with the string 'None'.
for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'):
    data[col] = data[col].fillna('None')

# Columns where a missing entry is replaced with the most frequent value.
for col in ('KitchenQual', 'FullBath', 'Functional'):
    data[col] = data[col].fillna(data[col].mode()[0])

# Missing basement full-bath counts are treated as zero.
data['BsmtFullBath'] = data['BsmtFullBath'].fillna(0)

In [None]:
# Fill the remaining gaps in these categorical columns with each column's most
# frequent value. The result is assigned back to the column: fillna(inplace=True)
# on a column selection is chained assignment and does not update `data` under
# pandas copy-on-write.
for col in ['SaleType', 'KitchenQual', 'Exterior2nd', 'Exterior1st', 'Electrical']:
    data[col] = data[col].fillna(data[col].mode()[0])

In [None]:
# True if any missing value remains anywhere in the combined frame.
data.isnull().values.any()

In [None]:
# Names of the columns stored with object dtype.
object_columns = data.select_dtypes(include='object').columns
cat_features = object_columns.to_numpy()
cat_features

In [None]:
# Names of the columns stored as float64 or int64.
numeric_dtypes = ['float64', 'int64']
num_features = data.select_dtypes(include=numeric_dtypes).columns.to_numpy()
num_features

In [None]:
# Turn numerically coded columns into strings so they are treated as categories:
#   MSSubClass  - the building class
#   OverallCond - overall condition rating
#   YrSold / MoSold - year and month of sale
for coded_column in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    data[coded_column] = data[coded_column].astype(str)

In [None]:
# Columns that will be label-encoded. Some of these categorical variables may
# carry information in the ordering of their values.
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual',
    'GarageCond', 'ExterQual', 'ExterCond', 'HeatingQC',
    'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2',
    'Functional', 'Fence', 'BsmtExposure', 'GarageFinish',
    'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
    'YrSold', 'MoSold', 'MSZoning', 'LandContour',
    'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition',
    'Electrical', 'Heating',
)

In [None]:
# Replace every listed categorical column with integer codes, using a fresh
# LabelEncoder per column fitted on that column's own values.
for column in cols:
    column_values = data[column].tolist()
    data[column] = LabelEncoder().fit_transform(column_values)

In [None]:
# (rows, columns) of the combined, preprocessed feature frame.
data.shape

In [None]:
# Split the combined frame back into its training and test parts.
# The training rows were concatenated first, so the first len(train) rows are
# the training features. Deriving the boundary from `train` avoids a hard-coded
# row count that would silently mis-split if the input files change.
n_train = len(train)
trainData = data.iloc[:n_train, :]
testData = data.iloc[n_train:, :]

In [None]:
# Feature matrix and target vector for model fitting.
X, y = trainData, train['SalePrice']

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=4,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a linear regression baseline on the training portion of the split.
LR = LinearRegression()
LR.fit(X_train, y_train)

In [None]:
# Score the linear model on the held-out portion of the split.
LR.score(X_test, y_test)

In [None]:
# Linear-model predictions for the held-out rows, used by the metrics below.
pred = LR.predict(X_test)

In [None]:
# Validation metrics for the linear model's held-out predictions.
holdout_r2 = r2_score(y_test, pred)
holdout_mae = mean_absolute_error(y_test, pred)
holdout_rmse = np.sqrt(mean_squared_error(y_test, pred))

print('r2_score: ', holdout_r2)
print('Mean Absolute Error(MAE): ', holdout_mae)
print('Root Mean Squared Error(RMSE): ', holdout_rmse)

In [None]:
# Gradient-boosted regression trees.
# The loss is left at its default, which is squared error. The old 'ls' alias
# was removed in scikit-learn 1.2 and now raises, while the default works on
# every release.
# random_state is fixed (same seed as the train/validation split) so repeated
# runs of the notebook give the same model.
clf = ensemble.GradientBoostingRegressor(
    n_estimators=400,
    max_depth=5,
    min_samples_split=2,
    learning_rate=0.09580,
    random_state=4,
)

In [None]:
# Fit the gradient boosting model on the training portion of the split.
clf.fit(X_train, y_train)

In [None]:
# Score the gradient boosting model on the held-out portion of the split.
clf.score(X_test, y_test)

In [None]:
# Refit the same estimator on all labelled rows before predicting the test set.
clf.fit(X, y)

In [None]:
# Predictions for the unlabelled test rows.
Pred = clf.predict(testData)

In [None]:
# Pair each saved test Id with its predicted SalePrice.
submission_columns = {'Id': testID, 'SalePrice': Pred}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('houseprice.csv', index=False)