In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

In [None]:
# Directory holding the Kaggle "House Prices" competition files.
DATA_DIR = "../input/house-prices-advanced-regression-techniques"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
# `sample` was a second disk read of the very same CSV. The name is kept for
# any cell that relies on it, but it is now an independent copy of the frame
# that is already in memory.
sample = sample_submission.copy()

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Correlation of every numeric feature with the target, weakest first.
# Text columns are excluded explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops them silently and raises on non-numeric data instead.
train.select_dtypes(include='number').corr()['SalePrice'].sort_values()

In [None]:
sns.scatterplot(data=train, x='OverallQual', y='SalePrice')
plt.axhline(y=200000,color='r')

In [None]:
train[(train['OverallQual']>8) &(train['SalePrice']<200000)][['SalePrice', 'OverallQual']]

In [None]:
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')
plt.axvline(x=4000, color='r')

In [None]:
train[(train['GrLivArea']>4000) & (train['SalePrice']<400000)][['SalePrice', 'GrLivArea']]

In [None]:
# Houses with a very large living area (> 4000) that still sold for less than
# 400000 are treated as outliers and removed from the training data.
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 400000)
index_drop = train.index[outlier_mask]
train = train.drop(index=index_drop)

In [None]:
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')
plt.axvline(x=4000, color='r')

In [None]:
sns.scatterplot(x='OverallQual', y='SalePrice', data=train)
plt.axhline(y=200000,color='r')

In [None]:
sns.boxplot(x='OverallQual', y='SalePrice', data=train)

In [None]:
#open the documentation about the data set. (this file is in a txt format)

with open('../input/house-prices-advanced-regression-techniques/data_description.txt', 'r') as f:
    print(f.read())

In [None]:

# 'Id' is only a row identifier, so it is removed before modelling.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
train = train.drop(columns='Id', errors='ignore')

In [None]:
train.isnull()

In [None]:
#How many values are missing in each feature?
train.isnull().sum()

In [None]:
#The percent of missing data in any feature:
100*(train.isnull().sum()/len(train))

In [None]:
#Make a function to calculate the percentage of missing data in each column (feature) and sort the result
def missing_percent(train):
    """Return the percentage of missing values per column, in ascending order.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are inspected for missing values.

    Returns
    -------
    pandas.Series
        Indexed by column name; holds the percentage (0-100) of missing
        entries. Columns without any missing value are left out.
    """
    share_missing = train.isna().sum() / len(train)
    as_percent = 100 * share_missing
    return as_percent[as_percent > 0].sort_values()

In [None]:
nan_percent= missing_percent(train)

In [None]:
nan_percent

In [None]:
#plot the feature with missing indicating the percent of missing data
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

In [None]:
#Every feature with missing data must be checked!
#We choose a threshold of 1%: if less than 1% of a feature's values are missing,
#we will consider just dropping those rows.

plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

#Set 1% threshold:
plt.ylim(0,1)

In [None]:
nan_percent[nan_percent<1]

In [None]:
nan_percent[nan_percent<1].index

In [None]:
100/len(train)
#This is the missing-data percentage of a feature that has exactly one missing row

In [None]:
train[train['Electrical'].isnull()]

In [None]:
train[train['GarageArea'].isnull()]

In [None]:
# Drop only the rows whose gaps are not explained by the data description.
# 'MasVnrArea' is intentionally NOT part of the subset: this notebook later
# treats a missing veneer area as "no masonry veneer" and fills it with 0,
# so dropping those rows here would throw away valid houses and turn that
# fill into a no-op.
train = train.dropna(axis=0, subset=['Electrical', 'GarageArea'])

In [None]:
nan_percent= missing_percent(train)

plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
plt.ylim(0,1)

### Filling the missing values:

In [None]:
train[train['TotalBsmtSF'].isnull()]

In [None]:
train[train['BsmtHalfBath'].isnull()]

In [None]:
train[train['BsmtFullBath'].isnull()]

In [None]:
# According to the notebook author's reading of the data documentation, a
# missing basement value means the house has no basement.
# Decision: numeric basement columns get 0, descriptive (string) ones get 'None'.
bsmt_num_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
bsmt_str_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# Numeric group first, then the string group, each with its own filler.
for columns, filler in ((bsmt_num_cols, 0), (bsmt_str_cols, 'None')):
    train[columns] = train[columns].fillna(filler)

In [None]:
nan_percent= missing_percent(train)

plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
plt.ylim(0,1)

Mas Vnr Features:

Based on the dataset documentation, missing values for 'Mas Vnr Type' and 'Mas Vnr Area' mean the house doesn't have any masonry veneer, so we fill the missing values as below:


In [None]:
train["MasVnrType"]= train["MasVnrType"].fillna("None")
train["MasVnrArea"]= train["MasVnrArea"].fillna(0)

In [None]:
nan_percent= missing_percent(train)

plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)


Garage Columns:

Based on the dataset documentation, NaN in Garage Columns seems to indicate no garage.

Decision: Fill with 'None' or 0


In [None]:
train[['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']]

In [None]:
# Fill the missing garage values (treated in this notebook as "no garage"):
# the descriptive string columns receive the label 'None'.
Gar_str_cols= ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
train[Gar_str_cols]=train[Gar_str_cols].fillna('None')

# The build year of a non-existent garage is encoded as 0.
# NOTE(review): 0 is far outside the range of real years and this column is
# later fed to a linear model as a plain number — confirm the sentinel is
# intended rather than e.g. falling back to the house's build year.
train['GarageYrBlt']=train['GarageYrBlt'].fillna(0)

In [None]:
nan_percent= missing_percent(train)

plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)


### C-Working based on Columns Missing Data

In [None]:
nan_percent.index

In [None]:
train[['LotFrontage', 'FireplaceQu', 'Fence', 'Alley', 'MiscFeature',
       'PoolQC']]

In [None]:
train= train.drop(['Fence', 'Alley', 'MiscFeature','PoolQC'], axis=1)

In [None]:
nan_percent= missing_percent(train)

plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

In [None]:
#Filling in Fireplace Quality based on dataset documentation:
train['FireplaceQu']= train['FireplaceQu'].fillna('None')

In [None]:
nan_percent= missing_percent(train)

plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

### D- Imputation of Missing Data

In [None]:
train['Neighborhood'].unique()

In [None]:
plt.figure(figsize=(8,12))
sns.boxplot(data=train, x='LotFrontage', y='Neighborhood')

In [None]:
#Impute missing data based on other columns:

train.groupby('Neighborhood')['LotFrontage']

In [None]:
train.groupby('Neighborhood')['LotFrontage'].mean()

In [None]:
train.groupby('Neighborhood')['LotFrontage'].transform(lambda val: val.fillna(val.mean()))

In [None]:
# Impute missing LotFrontage with the mean LotFrontage of the same Neighborhood.
neighborhood_mean = train.groupby('Neighborhood')['LotFrontage'].transform('mean')
train['LotFrontage'] = train['LotFrontage'].fillna(neighborhood_mean)

In [None]:
train['LotFrontage']= train['LotFrontage'].fillna(0)

In [None]:
nan_percent= missing_percent(train)

In [None]:
nan_percent

## 3-Dealing with Categorical Data

#### A- Numerical Columns to Categorical
We need to be careful when encoding categorical values as numbers: the numerical relationship has to make sense for the model. For example, MSSubClass is stored as a number, but each value is essentially just a code per class, not a quantity.

In [None]:
train['MSSubClass']

In [None]:
train.info()

In [None]:
train['MSSubClass'].unique()

In [None]:
# MSSubClass holds class codes, not quantities: store it as text so that it
# is handled as a categorical feature from here on.
train['MSSubClass'] = train['MSSubClass'].astype(str)

In [None]:
train.info()
#or: df['MS SubClass'].dtype

### B- Creating Dummy Variables

In [None]:
train.select_dtypes(include='object')

In [None]:
df_num= train.select_dtypes(exclude='object')
df_obj= train.select_dtypes(include='object')

In [None]:
df_num.info()

In [None]:
df_obj.info()

In [None]:
# Converting:
df_obj= pd.get_dummies(df_obj, drop_first=True)

In [None]:
df_obj.shape

In [None]:
Final_df= pd.concat([df_num, df_obj], axis=1)

In [None]:
Final_df.head()

In [None]:
Final_df.isnull()

In [None]:
# Target is SalePrice; every remaining column is used as a predictor.
y = Final_df['SalePrice']
X = Final_df.drop(columns='SalePrice')

#### Now, the Dataset is Ready for any Machine Learning Model & Analysis

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
model=LinearRegression()

In [None]:
model.fit(X_train,y_train)

In [None]:
# One fitted coefficient per feature, labelled with the feature name.
pd.DataFrame({'Con': model.coef_}, index=X.columns)

In [None]:
y_predict=model.predict(X_test)

In [None]:
pd.DataFrame({'Y_test':y_test,'Y_predict':y_predict})

In [None]:
MAE=metrics.mean_absolute_error(y_test,y_predict)
MSE=metrics.mean_squared_error(y_test,y_predict)
RMSE=np.sqrt(MSE)

In [None]:
print('Root Mean Square Error test = ',RMSE)

In [None]:
RES_test=y_test-y_predict
pd.DataFrame({'Y_test':y_test,'Y_predict':y_predict,'RES':RES_test})

In [None]:
sns.scatterplot(x=y_test,y=y_predict)

In [None]:
sns.scatterplot(x=y_test,y=RES_test)
plt.axhline(y=0,color='r',ls='--')

In [None]:
sns.displot(RES_test,bins=30, kde=True)