# Feasibility study
---
## Housing data - EDA
EDA is not an objective process, so be patient. With experience, you will develop your own approach to it.

## 0- Getting started

### Import the required packages for the notebook

In [None]:
import pandas as pd
import numpy as np

# These are plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

# Without this line magic function, you will not be able to see the plots in the notebook
%matplotlib inline

# This `utils` module is for you to store your frequently used functions.
# from funcs.utils import *

# Raise the column display limit to 99 so wide frames are shown without
# their middle columns being elided.
pd.set_option("display.max_columns", 99)

## 1- Read in training data

In [None]:
# Read the training CSV into a DataFrame, then echo the frame as the cell output.
train_csv_path = '../input/house-prices-data/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train

In [None]:
# (number of rows, number of columns) of the frame loaded above
df_train.shape

#### Out of these 81 columns, ✨SalePrice✨  is the value that we need to predict.
---

For the sake of simplicity for now, we will work with only a few columns👇

In [None]:
# Columns retained for the rest of the notebook. The order of this list is
# the column order of the trimmed frame.
keep_cols = [
    "Id", "SalePrice",
    "OverallQual", "GrLivArea", "TotalBsmtSF", "YrSold",
    "GarageArea", "GarageCars", "FullBath", "1stFlrSF", "TotRmsAbvGrd",
    "LandContour", "CentralAir", "FireplaceQu", "PoolQC",
    "BsmtFullBath", "BsmtCond", "BsmtExposure", "LotFrontage",
    "Neighborhood", "SaleCondition",
]

# Label-based selection: every row, only the listed columns.
df_train = df_train.loc[:, keep_cols]

In [None]:
# Summary of the trimmed frame: column dtypes and non-null counts
df_train.info()

In [None]:
# Name of the column to predict; later cells refer to it through this constant.
TARGET = "SalePrice"

## 2- Checking our target feature

In [None]:
# Summary statistics of the target column (count, mean, std, min, quartiles, max)
df_train[TARGET].describe()

There are no missing values and no abnormal values such as negative or implausibly small prices.

In [None]:
# Histogram with a KDE overlay shows the distribution of the target feature.
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with stat="density" and kde=True is the replacement that keeps the
# same density-scaled bars plus smoothed curve.
sns.histplot(df_train[TARGET], kde=True, stat="density")

In [None]:
# The histogram suggests outliers, so look closer with a boxplot of the target.
sns.boxplot(data=df_train, x=TARGET)

## 3- Its relationship with the numerical features

In [None]:
# Names of the integer- and float-typed columns, as a plain list
list(df_train.select_dtypes(include=['int', 'float']).columns)

In [None]:
# Re-check dtypes and non-null counts before splitting features by type
df_train.info()

In [None]:
# Collect the int/float column names, then take the target out of the list
# so that only predictor columns remain.
numeric_frame = df_train.select_dtypes(include=["int", "float"])
NUMERICAL_FEATURES = numeric_frame.columns.to_list()
NUMERICAL_FEATURES.remove(TARGET)
NUMERICAL_FEATURES

In [None]:
# Let us check out how they are distributed - good idea to know this about all your features.
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with stat="density" and kde=True draws the equivalent
# density-scaled histogram with a KDE curve.
sns.histplot(df_train['GrLivArea'], kde=True, stat="density")

In [None]:
# Summary statistics for every column currently listed in NUMERICAL_FEATURES
df_train[NUMERICAL_FEATURES].describe()

---

In [None]:
#@title
# Frequency of each OverallQual level
print(df_train['OverallQual'].value_counts())

# OverallQual and YrSold are stored as integers but are handled as
# categorical features from here on.
# Yr - Does the ratio between 2 years make any sense?
CAT_FEATURES = ['OverallQual', 'YrSold']

# Filter instead of calling list.remove(): remove() raises ValueError when
# the name is already gone, which made this cell fail on a second run.
# Id is deliberately left in NUMERICAL_FEATURES here (its removal was
# commented out in the original cell).
NUMERICAL_FEATURES = [col for col in NUMERICAL_FEATURES if col not in CAT_FEATURES]

---

In [None]:
# Scatter of the target against above grade (ground) living area, in square feet
var = "GrLivArea"
sns.relplot(data=df_train, x=var, y=TARGET)

In [None]:
# Scatter of the target against total basement area, in square feet
var = "TotalBsmtSF"
sns.relplot(data=df_train, x=var, y=TARGET)

In [None]:
# Overlay both area features against the target on one set of axes.
# Each series is labelled and the axes are named; without a legend the two
# point clouds cannot be told apart in the rendered figure.
fig, ax = plt.subplots()
ax.scatter(x=df_train["TotalBsmtSF"], y=df_train[TARGET], label="TotalBsmtSF")  # total basement area
ax.scatter(x=df_train["GrLivArea"], y=df_train[TARGET], color='orange', label="GrLivArea")  # above-grade living area
ax.set_xlabel("Area (square feet)")
ax.set_ylabel(TARGET)
ax.legend()
plt.show()

## 4- Its relationship with categorical features

In [None]:
BOOL_FEATURES = df_train.select_dtypes(include=["bool"]).columns.to_list()

# Put the object-typed columns in front of the features already flagged as
# categorical. Names that are already present are skipped, so executing this
# cell more than once no longer appends duplicate entries to CAT_FEATURES.
object_features = df_train.select_dtypes(include=["object"]).columns.to_list()
CAT_FEATURES = object_features + [col for col in CAT_FEATURES if col not in object_features]

print(BOOL_FEATURES)
print(f"\n And here are the categorical features:\n {CAT_FEATURES}")

In [None]:
# Checking the cardinality: number of distinct values in each categorical feature
df_train[CAT_FEATURES].nunique()

In [None]:
# What kind of values do they hold? The bars show how many rows fall into each
# LandContour category (row counts per value, not the number of distinct values).
sns.countplot(x='LandContour', data=df_train)

---
### I. Distribution plots

In [None]:
# Target distribution per LandContour category, drawn in a fixed category order
var = "LandContour"
contour_order = ["Bnk", "Low", "Lvl", "HLS"]
sns.boxplot(data=df_train, x=var, y=TARGET, order=contour_order)

       Lvl	Near Flat/Level	
       Bnk	Banked - Quick and significant rise from street grade to building
       HLS	Hillside - Significant slope from side to side
       Low	Depression

In [None]:
# Violin plot of the target per LandContour category.
# inner="box" draws a boxplot inside each violin so both can be read together.
var = "LandContour"
contour_order = ["Bnk", "Low", "Lvl", "HLS"]
sns.violinplot(data=df_train, x=var, y=TARGET, order=contour_order, inner="box")

In [None]:
# Add one more dimension to the violin plot: CentralAir as hue, so two
# categorical features are shown together. split=True draws half a violin per
# hue level; inner="quartile" marks the quartiles inside each half.
var = "LandContour"
contour_order = ["Bnk", "Low", "Lvl", "HLS"]
sns.violinplot(
    data=df_train,
    x=var,
    y=TARGET,
    hue="CentralAir",
    order=contour_order,
    split=True,
    inner="quartile",
)

In [None]:
# Target distribution per sale year on a wide canvas, with vertical tick labels
var = "YrSold"
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(data=df_train, x=var, y=TARGET, ax=ax)
plt.xticks(rotation=90)

---
### II. See individual observations

In [None]:
# Individual observations: living area vs. target, coloured by LandContour
var = "LandContour"
sns.relplot(data=df_train, x="GrLivArea", y=TARGET, hue=var)

In [None]:
# One point per observation, grouped by SaleCondition.
# Alternative to try: sns.stripplot(x=var, y=TARGET, data=df_train)
var = "SaleCondition"
sns.swarmplot(data=df_train, x=var, y=TARGET)

---
### III. Mean/median plots

In [None]:
# Mean of the target per sale year, on a wide canvas with vertical tick labels
var = "YrSold"
fig, ax = plt.subplots(figsize=(20, 6))
sns.pointplot(data=df_train, x=var, y=TARGET, estimator=np.mean, ax=ax)
plt.xticks(rotation=90)


It is worth investigating why prices dropped in 2008. This kind of analysis can lead to additional valuable insights for your project.

### Summary of plots
1. **Distribution plots:** Box and violin. <br/>
    Use these additional args for violin: <br/>
    *inner='box' or 'quartile',<br/>
    split :bool -> When using hue nesting with a variable that takes two levels, setting split to True will draw half of a violin for each level. This can make it easier to directly compare the distributions.*<br/>
2. **See individual observations:** Strip, swarm. (Unlike strip plots, swarm plots attempt to avoid obscuring points by calculating non-overlapping positions instead of adding random jitter.)<br/>
3. **Compare mean or median:** Bar or point plot. Use the `estimator` argument to switch from mean to median. <br/>
TIP: Use these two in combination<br/>

---

In [None]:
# Mean of the target per neighbourhood (mean plot).
# Exercise: add your barplot here, e.g.
#   sns.set_theme(style="whitegrid")
#   sns.barplot(x=var, y=TARGET, data=df_train)
var = "Neighborhood"
fig, ax = plt.subplots(figsize=(20, 6))
sns.pointplot(data=df_train, x=var, y=TARGET, estimator=np.mean, ax=ax)
plt.xticks(rotation=90)

In [None]:
# Target distribution per LandContour category, drawn in a fixed category order
var = "LandContour"
contour_order = ["Bnk", "Low", "Lvl", "HLS"]
sns.boxplot(data=df_train, x=var, y=TARGET, order=contour_order)

In [None]:
# Bar plot of the target for each SaleCondition category
var = "SaleCondition"
sns.barplot(data=df_train, x=var, y=TARGET)

In [None]:
# Target distribution for each BsmtCond category
var = "BsmtCond"
sns.boxplot(data=df_train, x=var, y=TARGET)

In [None]:
# Bar plot of the target for each BsmtCond category.
# Optional styling to try first: sns.set_theme(style="whitegrid")
var = "BsmtCond"
sns.barplot(data=df_train, x=var, y=TARGET)

In [None]:
# Bar plot of the target for each CentralAir category.
# Optional styling to try first: sns.set_theme(style="whitegrid")
var = "CentralAir"
sns.barplot(data=df_train, x=var, y=TARGET)

In [None]:
# Bar plot of the target for each FireplaceQu category.
# Optional styling to try first: sns.set_theme(style="whitegrid")
var = "FireplaceQu"
sns.barplot(data=df_train, x=var, y=TARGET)

---

## 5- Let us get objective and delve into more complex relationships

Pearson correlation is a statistic that measures linear correlation between two variables X and Y. It has a value between +1 and −1. A value of +1 is total positive linear correlation, 0 is no linear correlation, and −1 is total negative linear correlation.
[Read more](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient)

In [None]:
# Correlation matrix (pairwise Pearson) of the numeric columns, shown as an
# annotated heatmap.
# Id is dropped before computing correlations. The object-typed columns
# (LandContour, Neighborhood, ...) are filtered out explicitly:
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, where
# numeric_only defaults to False, while older versions dropped them silently.
numeric_df = df_train.drop(columns=['Id']).select_dtypes(include='number')
corrmat = numeric_df.corr()
plt.figure(figsize=(20,10))
sns.heatmap(corrmat, vmax=0.8, square=True, annot=True)

In [None]:
# This plot takes time to run so keep only the required features.
# Id is still listed in NUMERICAL_FEATURES, so it is excluded here: plotting
# it adds a full row and column of panels to an already slow figure, and the
# correlation heatmap drops it as well.
sns.set()
cols = [col for col in NUMERICAL_FEATURES if col != "Id"] + [TARGET]
sns.pairplot(df_train[cols], height=2.5)
plt.show()