In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read train.csv (resolved relative to the working directory) into the
# `data` DataFrame that the rest of the notebook works on.
data = pd.read_csv('train.csv')

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Show the frame's dimensions as (number of rows, number of columns).
data.shape

In [None]:
# Count missing values per column, keep only the columns that actually have
# some, and order them from fewest to most missing.
null_counts = data.isna().sum()
missing_data = null_counts[null_counts > 0].sort_values()

# Bar chart of the missing-value counts on a dedicated 15x8 figure.
plt.figure(figsize=(15, 8))
missing_data.plot.bar()

In [None]:
# Default figure size for the axes-level plots drawn in later cells.
sns.set(rc={'figure.figsize': (15, 8)})

# Histogram (20 bins) of SalePrice with a KDE overlay.
# displot is a figure-level function: it creates its own figure and sizes it
# from `height` and `aspect`, so the figure.figsize rc value set above has no
# effect on it. height=8 with aspect=15/8 yields the intended 15x8 figure.
sns.displot(data['SalePrice'], kde=True, bins=20, height=8, aspect=15 / 8)

In [None]:
# Summary statistics of the SalePrice column.
data['SalePrice'].describe()

In [None]:
# Sub-frame holding only the numeric columns; the last line lists their names.
numeric_features = data.select_dtypes(include='number')
numeric_features.columns

In [None]:
# Sub-frame holding only the object-dtype columns; the last line lists their names.
# (The variable name is kept as-is because other cells may refer to it.)
categorial_features = data.select_dtypes(include='object')
categorial_features.columns

In [None]:
# Pairwise correlation matrix of all numeric columns.
correlation = numeric_features.corr()

# Correlation of every numeric column with SalePrice, largest value first.
saleprice_corr = correlation['SalePrice']
saleprice_corr.sort_values(ascending=False)

In [None]:
# Heatmap of the full numeric correlation matrix; the colour scale is capped
# at 0.8 and cells are drawn square. The title is set on the heatmap's own axes.
ax = sns.heatmap(correlation, square=True, vmax=0.8)
ax.set_title('Correlation of numeric features with sale price', y=1, size=16)
ax

In [None]:
# Number of columns to keep. SalePrice correlates perfectly with itself, so it
# is always among the k largest and the selection holds k - 1 other features.
k = 11

# Rows of the correlation matrix with the k largest SalePrice correlations;
# their index labels are the selected column names.
top_by_saleprice = correlation.nlargest(k, 'SalePrice')
cols = top_by_saleprice.index
cols

In [None]:
# Correlation matrix restricted to the selected columns.
# This slices the already-computed pandas matrix instead of calling
# np.corrcoef on the raw values: np.corrcoef returns NaN for every pair that
# involves a column containing a missing value, while DataFrame.corr skips
# missing values pairwise. It also avoids recomputing the same coefficients.
cm = correlation.loc[cols, cols].values

# Annotated heatmap of the selected columns, labelled with the column names.
sns.heatmap(
    cm,
    vmax=0.8,
    linewidths=0.01,
    square=True,
    annot=True,
    cmap='viridis',
    linecolor='white',
    xticklabels=cols.values,
    annot_kws={'size': 12},
    yticklabels=cols.values,
)

In [None]:
# Features plotted against SalePrice, one per panel, filled row by row.
# The previous version listed GarageCars for both the first and the sixth
# panel, so one panel was a duplicate. The sixth panel now shows GarageArea.
# NOTE(review): GarageArea is an assumed replacement - confirm the intended feature.
scatter_features = [
    'GarageCars', 'OverallQual',
    'GrLivArea', 'FullBath',
    'YearBuilt', 'GarageArea',
    'WoodDeckSF', 'YearRemodAdd',
]

# 4x2 grid of scatter plots: feature on the x-axis, SalePrice on the y-axis.
fig, axes = plt.subplots(nrows=4, ncols=2)
for ax, feature in zip(axes.flat, scatter_features):
    sns.scatterplot(data=data, x=feature, y='SalePrice', ax=ax)

In [None]:
# Features regressed against SalePrice, one per panel, filled row by row.
# The previous version listed GarageCars for both the first and the sixth
# panel, so one panel was a duplicate. The sixth panel now shows GarageArea.
# NOTE(review): GarageArea is an assumed replacement - confirm the intended feature.
regression_features = [
    'GarageCars', 'OverallQual',
    'GrLivArea', 'FullBath',
    'YearBuilt', 'GarageArea',
    'WoodDeckSF', 'YearRemodAdd',
]

# 4x2 grid of scatter plots with a fitted regression line in each panel.
fig, axes = plt.subplots(nrows=4, ncols=2)
for ax, feature in zip(axes.flat, regression_features):
    sns.regplot(data=data, x=feature, y='SalePrice', scatter=True, ax=ax)

In [None]:
# Features whose SalePrice distribution is shown per value, one per panel.
# The previous version listed GarageCars for both the first and the fourth
# panel, so one panel was a duplicate. The fourth panel now shows TotRmsAbvGrd.
# NOTE(review): TotRmsAbvGrd is an assumed replacement - confirm the intended feature.
boxplot_features = [
    'GarageCars', 'OverallQual',
    'FullBath', 'TotRmsAbvGrd',
]

# 2x2 grid of box plots: one box of SalePrice for each value of the feature.
fig, axes = plt.subplots(nrows=2, ncols=2)
for ax, feature in zip(axes.flat, boxplot_features):
    sns.boxplot(data=data, x=feature, y='SalePrice', ax=ax)

In [None]:
# Interquartile-range fences for SalePrice: values more than 1.5 * IQR below
# the first quartile or above the third quartile are treated as outliers.
Q1 = data['SalePrice'].quantile(0.25)
Q3 = data['SalePrice'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose SalePrice lies within the fences (inclusive).
# Rows with a missing SalePrice fail both comparisons and are excluded too.
# The previous version also called .dropna() on the result, which removes
# every row that has a missing value in ANY column. That is unrelated to the
# outlier filter and, since several columns contain missing values, can
# discard most or all rows. Missing values in other columns are left for a
# dedicated cleaning step.
is_within_fences = (data['SalePrice'] >= lower_bound) & (data['SalePrice'] <= upper_bound)
data_no_outliers = data[is_within_fences].reset_index(drop=True)

In [None]:
# Column labels that are dropped from `data` in the following cell.
# NOTE(review): presumably the features judged uninformative for SalePrice -
# confirm the selection criteria.
columns_to_remove = [
    'WoodDeckSF',
    '2ndFlrSF',
    'OpenPorchSF',
    'HalfBath',
    'LotArea',
    'BsmtFullBath',
    'BsmtUnfSF',
    'BedroomAbvGr',
    'ScreenPorch',
    'PoolArea',
    'MoSold',
    '3SsnPorch',
    'BsmtFinSF2',
    'BsmtHalfBath',
    'MiscVal',
    'Id',
    'LowQualFinSF',
    'YrSold',
    'OverallCond',
    'MSSubClass',
    'EnclosedPorch',
    'KitchenAbvGr',
]

In [None]:
# Remove the listed columns from `data` in place.
# errors='ignore' skips labels that are no longer present, so re-running this
# cell after the columns have already been dropped does not raise KeyError.
# The trade-off is that a misspelled label in the list is silently skipped.
# NOTE(review): data_no_outliers was built from `data` before this drop and
# therefore still contains these columns - confirm that is intended.
data.drop(columns=columns_to_remove, inplace=True, errors='ignore')