In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from sklearn import preprocessing
import io

In [None]:
# Load train.csv from the house-prices dataset input folder (path is relative to the notebook's working directory).
df= pd.read_csv('../input/house-prices-dataset/train.csv')

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

### Dropping the Id column as it is not needed

In [None]:
# Drop the row identifier, which is not used in the analysis.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because 'Id' has already been removed.
df = df.drop(columns='Id', errors='ignore')

## Using info() and describe() to understand the dataset better

In [None]:
# Print each column's name, non-null count and dtype.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

#### count of null values in each column

In [None]:
# Keep only the columns that contain at least one null (boolean column
# mask), then count the nulls in each of them.
df.loc[:, df.isnull().any()].isnull().sum()

#### Handling the null values using central-tendency measures: mode for categorical features and median for numerical features. (We choose the median over the mean because the mean is influenced by outliers and might therefore drastically change a feature's original pattern.)



#### Let's fill the nulls for categorical features with the mode:

In [None]:
# Columns whose missing values will be imputed with their most frequent value.
# NOTE(review): 'GarageYrBlt' is a numeric year column (it is later dropped
# from the numeric frame), yet it is imputed here with the mode alongside the
# categorical features - confirm this is intended.
cols = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
# mode() returns a DataFrame; its first row holds the most frequent value
# of every selected column.
mode = df.filter(items=cols).mode()

In [None]:
# mode.iloc[0] is a Series indexed by column name, so fillna matches it
# column-by-column: each column receives its own most frequent value.
df[cols]= df[cols].fillna(value= mode.iloc[0])

#### Now let's do the same for numerical features with the median

In [None]:
# Numerical columns to impute with the median.
num_cols= ['LotFrontage','MasVnrArea']
# median() on the filtered frame yields a Series with one median per column,
# indexed by column name.
median= df.filter(num_cols).median()

In [None]:
# `median` is a Series holding one median per column. Passing the whole
# Series lets fillna use each column's OWN median. The previous
# `median.iloc[0]` was a single scalar (the LotFrontage median), so missing
# MasVnrArea values were being filled with the LotFrontage median.
df[num_cols] = df[num_cols].fillna(value=median)

##### Now let's check the number of null values again to verify whether our logic worked

In [None]:
# Re-run the null count: select the columns that still contain nulls
# (boolean column mask) and count the nulls in each.
df.loc[:, df.isnull().any()].isnull().sum()

## Categorizing the data set into categorical features and numerical features

In [None]:
# Split the frame by dtype: every numeric column goes to numeric_data,
# everything else (object/string columns) goes to categorical_data.
# 'number' is the string alias pandas accepts for np.number.
numeric_data = df.select_dtypes(include='number')
categorical_data = df.select_dtypes(exclude='number')

# Numerical feature Analysis


In [None]:
# List the columns that were classified as numeric.
numeric_data.columns

## **Outlier Analysis**

### Visualization of outliers using Boxplot :

In [None]:
def box(variable):
    """Draw a box plot of one column of ``numeric_data`` to expose outliers.

    Parameters
    ----------
    variable : str
        Name of a column in the notebook-level ``numeric_data`` frame.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    plt.figure(figsize=(9, 3))
    plt.boxplot(numeric_data[variable])
    plt.xlabel(variable)
    # The value axis of a box plot shows the data values, not counts; the
    # previous "Frequency" / "with hist" labels were copied from the
    # histogram helper and mislabelled the figure.
    plt.ylabel("Value")
    plt.title("{} distribution with boxplot".format(variable))
    plt.show()

In [None]:
# Numeric columns to inspect, grouped loosely by theme for readability.
numericVar = [
    'MSSubClass', 'LotFrontage', 'LotArea',
    'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'LowQualFinSF', 'GrLivArea',
    'BsmtHalfBath', 'FullBath',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
    'MoSold', 'YrSold',
    'SalePrice',
]

# One box plot per listed column.
for n in numericVar:
    box(n)

## Percentage of Outliers in each column

In [None]:
def outlier(col):
    """Print the percentage of rows of ``numeric_data[col]`` outside the IQR fences.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``.

    Parameters
    ----------
    col : str
        Name of a column in the notebook-level ``numeric_data`` frame.

    Returns
    -------
    None
        The result is printed as ``"<col>= <percentage>"``.
    """
    values = numeric_data[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    IQR = q3 - q1
    lower = q1 - (IQR * 1.5)
    upper = q3 + (IQR * 1.5)
    # Count with a boolean mask. The previous np.where(...) + .loc lookup fed
    # positional indices to a label-based indexer, which is only correct
    # while the index happens to be 0..n-1.
    is_outlier = (values > upper) | (values < lower)
    n = int(is_outlier.sum())
    # Use the actual row count instead of the hard-coded 1460 so the
    # percentage stays right if rows are added or filtered out.
    total = len(values)
    perc = (n / total) * 100 if total else 0.0
    print(f'{col}= {perc}')
        

In [None]:
# Print the outlier percentage for every column listed in numericVar.
for n in numericVar:  
    outlier(n)

In [None]:
def his(variable):
    """Plot a 50-bin histogram of one column of ``numeric_data``.

    Parameters
    ----------
    variable : str
        Name of a column in the notebook-level ``numeric_data`` frame.
    """
    values = numeric_data[variable]
    heading = f"{variable} distribution with hist"

    plt.figure(figsize=(9, 3))
    plt.hist(values, bins=50)
    plt.title(heading)
    plt.ylabel("Frequency")
    plt.xlabel(variable)
    plt.show()

### Analysis of  numerical columns using histogram

In [None]:
# One histogram per column listed in numericVar.
for n in numericVar:
    his(n)

### Observing Correlation

In [None]:
# Pairwise correlation between all numeric columns (target included);
# the bare name on the last line displays the matrix.
cor_num= numeric_data.corr()
cor_num

### For better visualization, let's use a heatmap

In [None]:
# Annotated heatmap of the full numeric correlation matrix on a large
# canvas so that every cell value stays readable.
sns.set_context("notebook", rc={"lines.linewidth": 2.5}, font_scale=1.0)
plt.subplots(figsize=(30, 30))
p = sns.heatmap(
    cor_num,
    cmap='seismic',
    annot=True,
    fmt='.2f',
    lw=1.5,
)
# Enlarge and rotate the tick labels; the returned label lists are kept in
# variables so the cell does not echo them.
rotxlabel = p.set_xticklabels(
    p.get_xticklabels(), rotation=90, fontdict={'fontsize': 20}
)
rotylabel = p.set_yticklabels(
    p.get_yticklabels(), rotation=30, fontdict={'fontsize': 20}
)

## Dimensionality Reduction of Numerical Data

### Removing features with high inter correlation

Let's make a copy of the dataframe so that we can remove the target feature and then work on dimensionality reduction.

In [None]:
# Work on a copy so the following steps do not modify numeric_data.
copy_num= numeric_data.copy()

In [None]:
# Remove the target so that only feature-to-feature relationships remain.
copy_num= copy_num.drop(columns= 'SalePrice')

In [None]:
# Pairwise correlation matrix of the columns in copy_num.
copy_cor= copy_num.corr()

In [None]:
# Flatten the absolute correlation matrix into a Series indexed by
# (feature, feature) pairs, then order it from weakest to strongest.
High_cor = copy_cor.abs().unstack()
# sort_values already defaults to kind="quicksort".
sorted_high_cor = High_cor.sort_values()

In [None]:
# Keep the pairs whose absolute correlation lies strictly between 0.6 and 1;
# the upper bound presumably exists to drop each feature's correlation with
# itself. The correlation matrix is symmetric, so every pair shows up twice
# (once per ordering).
sorted_high_cor[(sorted_high_cor>0.6) & (sorted_high_cor<1)]

### Observation:
- Features with >0.6 correlation with each other that have a low or similar correlation with the target feature can be dropped from the dataset, as removing them would not have any significant effect on the target feature.

In [None]:
# Features that are highly correlated with another retained feature and add
# little extra signal about the target.
col_drop = [
    'HalfBath', '2ndFlrSF', 'BsmtFullBath', 'BedroomAbvGr',
    'GarageYrBlt', '1stFlrSF', 'GarageCars',
]
# errors='ignore' makes the cell safe to re-run: numeric_data is rebound to
# the reduced frame, so a second execution would otherwise raise KeyError
# because the columns are already gone.
numeric_data = numeric_data.drop(columns=col_drop, errors='ignore')

### Heat map representation of the correlation with the target feature after dimensionality reduction

In [None]:
# Annotated correlation heatmap of the numeric columns that remain after
# the correlated features were dropped.
sns.set_context("notebook", rc={"lines.linewidth": 2.5}, font_scale=1.0)
plt.subplots(figsize=(30, 30))
p = sns.heatmap(
    numeric_data.corr(),
    cmap='seismic',
    annot=True,
    fmt='.2f',
    lw=1.5,
)
# Enlarge and rotate the tick labels; assigning the results keeps the cell
# from echoing them.
rotxlabel = p.set_xticklabels(
    p.get_xticklabels(), rotation=90, fontdict={'fontsize': 20}
)
rotylabel = p.set_yticklabels(
    p.get_yticklabels(), rotation=30, fontdict={'fontsize': 20}
)

### As per the above heat map, it can be concluded that not all features are strongly correlated with our target feature, SalePrice, and dropping the low-correlation features will not have any major impact on our analysis. Now we select just the features with a high correlation with our target feature, i.e. SalePrice

In [None]:
# Select the 10 columns with the largest (most positive) correlation with
# SalePrice. SalePrice correlates perfectly with itself, so the selection is
# the target itself plus the 9 best-correlated features.
high_Cor_num = numeric_data[ numeric_data.corr().nlargest(10, 'SalePrice')['SalePrice'].index]

In [None]:
# Annotated correlation heatmap restricted to the columns selected in
# high_Cor_num.
sns.set_context("notebook", rc={"lines.linewidth": 2.5}, font_scale=1.0)
plt.subplots(figsize=(30, 30))
p = sns.heatmap(
    high_Cor_num.corr(),
    cmap='seismic',
    annot=True,
    fmt='.2f',
    lw=1.5,
)
# Enlarge and rotate the tick labels; assigning the results keeps the cell
# from echoing them.
rotxlabel = p.set_xticklabels(
    p.get_xticklabels(), rotation=90, fontdict={'fontsize': 20}
)
rotylabel = p.set_yticklabels(
    p.get_yticklabels(), rotation=30, fontdict={'fontsize': 20}
)

In [None]:
# Summary statistics of the selected columns (used for the conclusions below).
high_Cor_num.describe()

# Now we can further analyze how the outstanding features influence the target feature:

## 1) Analysis of the behaviour of our target feature by using a histogram

In [None]:
# Distribution of the target variable.
sns.set_context("notebook", rc={"lines.linewidth": 2.5}, font_scale=1.0)
plt.subplots(figsize=(20, 10))
sns.histplot(high_Cor_num['SalePrice'])

### Conclusions drawn on the basis of the 6-point summary and histogram:
- A right-skewed graph is obtained
- The maximum and minimum prices of the houses are 755000 and 34900 respectively
- The average price is 180921
- Most of the houses are priced in the range of approximately 130k to 214k
- Very few houses have a high-end sale price


## 2) Analysis of the Target feature with respect to the highly correlated feature OverallQual with reference to YearBuilt

In [None]:
# Distinct construction years in ascending order, to decide how to bucket them.
np.sort(high_Cor_num.YearBuilt.unique())

In [None]:
# Coerce every column to a numeric dtype.
# NOTE(review): these columns were taken from a numeric-only selection, so
# this looks like a no-op - confirm before removing.
high_Cor_num = high_Cor_num .apply(pd.to_numeric)

### Categorize the years for better visualization: years below 1900 are labelled 1850 (the minimum year is 1872), followed by 50-year increments, i.e. 1900, 1950 and 2000

In [None]:
# Copy the frame so the year bucketing below leaves high_Cor_num untouched.
cat_year= high_Cor_num.copy()

In [None]:
# Replace each construction year with the label of its bucket:
#   < 1900 -> 1850, 1900-1949 -> 1900, 1950-1999 -> 1950, otherwise 2000.
# The conditions are tested in ascending order, so each branch only needs
# its upper bound.
cat_year['YearBuilt'] = [
    1850 if year < 1900
    else 1900 if year < 1950
    else 1950 if year < 2000
    else 2000
    for year in cat_year['YearBuilt']
]

In [None]:
# SalePrice against OverallQual, one line per construction-year bucket.
sns.set_context("notebook", rc={"lines.linewidth": 2.5}, font_scale=1.0)
plt.subplots(figsize=(20, 10))
sns.lineplot(
    x='OverallQual',
    y='SalePrice',
    hue='YearBuilt',
    data=cat_year,
    palette='Set1',
)

### Conclusions drawn:
- 1850s: quality ranges from 4 to 10, with the price spiking at quality 10; however, a dip in price can be noticed at quality 8 and a spike at quality 6, which I think deserves further analysis.
- 1900s: the price increases as the quality of the houses increases.
- 1950s: a steady increase in price with quality up to quality 9, and then a sudden spike makes houses of quality 10 in this year range the most expensive set of housing in the dataset, with a price of approximately 700k.
- 2000s: the price rises gradually along with the quality, and then for houses of quality 9 and 10 the price tends to stagnate.
- Thus we can conclude that, except for houses built in the 1950s, any house of quality 7+ can be bought within a 200k-400k price range.

In [None]:
# Living area against sale price, coloured by overall quality.
sns.set_context("notebook", rc={"lines.linewidth": 2.5}, font_scale=1.0)
plt.subplots(figsize=(15, 10))
sns.scatterplot(
    x='GrLivArea',
    y='SalePrice',
    hue='OverallQual',
    data=cat_year,
)

### Observations:
 - Most of the houses have a living area within the range of 1000 to 2000 sq ft.
 - The size of the living area, along with the overall quality of the house, seems to be directly proportional to SalePrice.
 

In [None]:
# Living area against sale price, coloured by construction-year bucket.
plt.subplots(figsize=(15, 10))
sns.scatterplot(
    x='GrLivArea',
    y='SalePrice',
    hue='YearBuilt',
    data=cat_year,
)

### Observations:
- The houses that were built between 1950 and 2000 seem to have a directly proportional relationship between the living area and the SalePrice.
- However, for most of the houses built in the other year ranges, the living area is limited to under 3000 sq ft.

### Price range depression in the 1850s: Analysis

In [None]:
# Houses built after 1850 and before 1900 whose overall quality is 6, 7 or 8:
# the segment in which the price dip was observed. The un-bucketed
# high_Cor_num frame is used so the real construction years are available.
num_new = high_Cor_num.loc[
    (high_Cor_num['YearBuilt'] > 1850)
    & (high_Cor_num['YearBuilt'] < 1900)
    & (high_Cor_num['OverallQual'] >= 6)
    & (high_Cor_num['OverallQual'] <= 8)
]

In [None]:
def line(var):
    """Bar plot of SalePrice against ``var`` for the rows of ``num_new``.

    Bars are split by OverallQual. Despite the function's name, the chart
    drawn is a seaborn bar plot.

    Parameters
    ----------
    var : str
        Column of the notebook-level ``num_new`` frame used for the x axis.
    """
    plt.subplots(figsize=(15, 10))
    sns.barplot(
        x=var,
        y='SalePrice',
        hue='OverallQual',
        data=num_new,
        palette='Set1',
    )

In [None]:
# Features to compare against SalePrice within the selected segment.
# Note: this rebinds the numericVar list that was defined earlier.
numericVar = [
    'GrLivArea',
    'GarageArea',
    'TotalBsmtSF',
    'FullBath',
    'TotRmsAbvGrd',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
]

# One bar plot per listed feature.
for n in numericVar:
    line(n)

### Observations:

Houses built in 1872 and remodelled between 1987 and 1990 with 2 bathrooms seem to have a lower SalePrice compared to other houses.

## CATEGORICAL FEATURE ANALYSIS

In [None]:
# Print each categorical column's name, non-null count and dtype.
categorical_data.info()

In [None]:
# List the columns that were classified as non-numeric.
categorical_data.columns

## Visualization of the categorical feature individually and vs the Target Feature

### Individual Feature

In [None]:
def bar_plot(variable):
    """Plot and print the category frequencies of one categorical column.

    Parameters
    ----------
    variable : str
        Name of a column in the notebook-level ``categorical_data`` frame.

    Returns
    -------
    None
        Shows a bar chart (one bar per category) and then prints the counts
        as ``"<variable>:"`` followed by the value-count table.
    """
    # Occurrences of each category, most frequent first.
    counts = categorical_data[variable].value_counts()
    labels = counts.index

    plt.figure(figsize=(9, 3))
    plt.bar(labels, counts)
    plt.xticks(labels, labels.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    print(f"{variable}:\n{counts}")

In [None]:
# Frequency bar chart (plus printed counts) for every categorical column.
category1 = categorical_data.columns
for c in category1:
    bar_plot(c)

### Categorical Features vs Target Feature

In [None]:
def tar(var):
    """Bar plot of SalePrice against one categorical feature.

    Parameters
    ----------
    var : str
        Column of the notebook-level ``categorical_data`` frame for the
        x axis; the y axis is ``numeric_data['SalePrice']``.
    """
    plt.subplots(figsize=(15, 5))
    feature = categorical_data[var]
    target = numeric_data['SalePrice']
    sns.barplot(x=feature, y=target)
    

In [None]:
# SalePrice bar plot for every categorical column.
category = categorical_data.columns
for c in category:
    tar(c)