In [None]:
#importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt
from scipy.stats import skew
import warnings
warnings.filterwarnings('ignore')

In [None]:
# loading data
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test  = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
train.head()

In [None]:
train.shape, test.shape

In [None]:
train.columns

In [None]:
# Per-column missing-value summary for the training set:
# 'Total' is the null count, 'Percent' the fraction of rows that are null,
# each ordered from most to least missing.
train_nulls = train.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(30)

In [None]:
# Dealing with missing data
train = train.drop((missing_data[missing_data['Total']>1]).index,1)
train['Electrical'] = train['Electrical'].fillna(train['Electrical'].mode()[0])
train.isnull().sum().max()

In [None]:
# After deleting the columns 
train.columns

In [None]:
# Per-column missing-value summary for the test set:
# 'Total' is the null count, 'Percent' the fraction of rows that are null.
test_nulls = test.isnull()
total = test_nulls.sum().sort_values(ascending=False)
percent = (test_nulls.sum() / test_nulls.count()).sort_values(ascending=False)
missing_data1 = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data1.head(40)

In [None]:
# Drop every test column with more than four missing values. `axis` is passed
# by keyword because pandas >= 2.0 no longer accepts it positionally.
test = test.drop(missing_data1[missing_data1['Total'] > 4].index, axis=1)

In [None]:
# missing data from test dataset
total = test.isnull().sum().sort_values(ascending=False)
percent = (test.isnull().sum()/test.isnull().count()).sort_values(ascending=False)
missing_data1 = pd.concat([total,percent],axis=1,keys=['Total','Percent'])
missing_data1.head(40)

In [None]:
null_features = (missing_data1[missing_data1['Total']>0]).index
null_features

In [None]:
for feature in null_features:
    test[feature] = test[feature].fillna(test[feature].mode()[0])

In [None]:
test.isnull().sum().max()

In [None]:
#Descriptive statistics summary
train['SalePrice'].describe()

In [None]:
#histogram
sns.distplot(train['SalePrice']);

In [None]:
# correlation matrix
# Select the numeric columns explicitly: `train` still contains object
# (categorical) columns, and pandas >= 2.0 raises on them in DataFrame.corr()
# instead of dropping them silently.
corrmat = train.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# most correlated features
# Numeric columns only: pandas >= 2.0 raises in DataFrame.corr() when the
# frame contains object columns, which `train` still does at this point.
corrmat = train.select_dtypes(include=[np.number]).corr()
# Features whose absolute correlation with SalePrice exceeds 0.5
# (SalePrice itself is included, since its self-correlation is 1)
top_corr_features = corrmat.index[corrmat['SalePrice'].abs() > 0.5]
plt.figure(figsize=(10,10))
sns.heatmap(train[top_corr_features].corr(), annot=True);
top_corr_features

In [None]:
# Pairwise scatter plots of the target against a hand-picked set of features
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` is the current name of the per-facet size argument; the old
# `size` spelling is a deprecated alias.
sns.pairplot(train[cols], height=2.5)
plt.show()

In [None]:
# differentiate between numerical and categorical varibles
categorical_features = train.select_dtypes(include = ["object"]).columns
numerical_features = train.select_dtypes(exclude = ["object"]).columns

In [None]:
# taking numerical dataset and categorical datasets separately 
train_num = train[numerical_features]
train_cat = train[categorical_features]

In [None]:
# checkin skewmess of all features
skewness = train_num.apply(lambda x: skew(x))
skewness.sort_values(ascending=False)

In [None]:
skewness = skewness[abs(skewness)>0.5]
skewness.index

In [None]:
train_num

In [None]:
# applying log tranform
train_num[skewness.index] = np.log1p(train_num[skewness.index])

In [None]:
train_num.shape

In [None]:
# using get_dummies here it is used for data manipulation. It converts categorical data into dummy or indicator variables
train_cat = pd.get_dummies(train_cat)

In [None]:
train_cat.shape

In [None]:
# concatenating train_num (numerical variable) and train_cat (categorical variable)
train1 = pd.concat([train_cat,train_num],axis=1)

In [None]:
# differentiate between numerical and categorical variables of the test set
categorical_features = test.select_dtypes(include = ["object"]).columns
numerical_features = test.select_dtypes(exclude = ["object"]).columns

# Slice the *test* frame (the previous version sliced `train`, so the "test"
# features were training rows). Explicit copies because test_num has columns
# assigned into it later.
test_num = test[numerical_features].copy()
test_cat = test[categorical_features].copy()

In [None]:
# finding skewness of all features
skewness = test_num.apply(lambda x: skew(x))
skewness.sort_values(ascending=False)

In [None]:
# we are selecting features where skewness is greater than 0.5 to fix their skewness
skewness = skewness[abs(skewness)>0.5]
skewness.index

In [None]:
# applying log tranform
test_num[skewness.index] = np.log1p(test_num[skewness.index])

In [None]:
test_cat = pd.get_dummies(test_cat)

In [None]:
test1 = pd.concat([test_cat,test_num],axis=1)

In [None]:
# set minimum and maximum threshold values to detect ouliers using standard deviation
min_threshold = train1.SalePrice.mean() - 3*train1.SalePrice.std()
max_threshold = train1.SalePrice.mean() + 3*train1.SalePrice.std()

In [None]:
min_threshold,max_threshold

In [None]:
# removing the outliers from dataset
# Keep only the rows whose SalePrice lies strictly inside the band.
# The previous filter compared against the wrong bounds (> max and < min) and
# misplaced a parenthesis, so `&` was applied to a bool and a float Series.
train1 = train1[(train1.SalePrice > min_threshold) & (train1.SalePrice < max_threshold)]

In [None]:
# importing all the required library for modeling
import statsmodels.api as sm

In [None]:
X = train1.drop(['SalePrice'],axis=1)
y = train1['SalePrice']

In [None]:
X = sm.add_constant(X)
test = sm.add_constant(test1)

In [None]:
X.shape,y.shape

In [None]:
model = sm.OLS(y, X).fit()

In [None]:
predictions = model.predict(test)

In [None]:
print("ROOT MEAN SQUARED ERROR : ",math.sqrt(sum((y-predictions)**2)/len(y)))

In [None]:
model.summary()

In [None]:
#Create a  DataFrame with the passengers ids and our prediction
submission = pd.DataFrame({'Id':test_id,'SalePrice':y_pred_b})

In [None]:
#Convert DataFrame to a csv file that can be uploaded
#This is saved in the same directory as your notebook
filename = './HousePredictions.csv'

submission.to_csv(filename,index=False)

#print('Saved file: ' + filename)