# Objectives of EDA

In [None]:
import warnings
# NOTE(review): called without a category or module filter, this silences every
# warning for the rest of the session, including any deprecation warnings emitted
# by the plotting/data libraries used below. Consider narrowing the filter.
warnings.filterwarnings('ignore')
%autosave 5

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the housing dataset from the CSV file in the working directory
# and print a column-by-column overview (dtype and non-null count).
housing = pd.read_csv(filepath_or_buffer='house.csv')
housing.info()

In [None]:
# Show the names of all columns that hold numeric data
print(housing.select_dtypes(include='number').columns.tolist())

In [None]:
# Show the names of all columns stored with the generic 'object' dtype
print(housing.select_dtypes(include='object').columns.tolist())

In [None]:
# Redefine DataFrame to include selected variables

# Variables analysed as numerical in the rest of the notebook
numerical = [
    'SalePrice',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'BedroomAbvGr',
]

# Variables analysed as categorical in the rest of the notebook
categorical = [
    'MSZoning',
    'LotShape',
    'Neighborhood',
    'CentralAir',
    'SaleCondition',
    'MoSold',
    'YrSold',
]

# Take an explicit copy: the column selection is derived from the full dataset,
# and assigning new columns to it later would otherwise raise pandas'
# SettingWithCopyWarning.
housing = housing[numerical + categorical].copy()
housing.shape

# Understanding variables with `seaborn`
## Target variable

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) for the target variable
housing.loc[:, 'SalePrice'].describe()

In [None]:
# Histogram of SalePrice drawn with matplotlib only (20 bins, 8x5 inch figure)
plt.figure(figsize=(8, 5))
sale_price_axes = plt.gca()
sale_price_axes.hist(housing['SalePrice'], bins=20)
sale_price_axes.set_xlabel('Sale Price')
sale_price_axes.set_ylabel('Count')
sale_price_axes.grid(True)
plt.show()

In [None]:
# Plot distribution of SalePrice using seaborn
import seaborn as sns

sns.set(style='whitegrid',               # changes style (white background with grid)
        palette="deep",                  # changes color palette
        font_scale=1.1,                  # increases font size
        rc={"figure.figsize": [8, 5]})   # sets figure size

# histplot is the replacement for the deprecated distplot. Its defaults
# (raw counts, no KDE curve) match the previous norm_hist=False / kde=False call;
# alpha=1 keeps the bars fully opaque.
sns.histplot(
    x=housing['SalePrice'], bins=20, alpha=1
).set(xlabel='Sale Price', ylabel='Count')
plt.show()

## Numerical variables

In [None]:
# One histogram per numerical variable, arranged on a 2x4 grid (pandas + matplotlib)
housing.hist(column=numerical, bins=15, figsize=(15, 6), layout=(2, 4))
plt.tight_layout()
plt.show()

In [None]:
# Create 'Age' variable (years between construction and sale) and replace 'YearBuilt' with it.
# assign() returns a new DataFrame instead of writing into the existing one, which
# avoids pandas' SettingWithCopyWarning on a frame obtained by column selection.
housing = housing.assign(Age=housing['YrSold'] - housing['YearBuilt'])

# Guard the list updates so that re-running this cell neither raises ValueError
# (list.remove on a missing item) nor appends 'Age' a second time.
if 'YearBuilt' in numerical:
    numerical.remove('YearBuilt')
if 'Age' not in numerical:
    numerical.append('Age')

# Replot numerical variables
housing[numerical].hist(bins=15, figsize=(15, 6), layout=(2, 4))
plt.tight_layout()
plt.show()

## Categorical variables

In [None]:
# Bar chart of how often each SaleCondition category occurs (pandas + matplotlib)
housing['SaleCondition'].value_counts().plot.bar(title='SaleCondition')
plt.show()

In [None]:
# Plot distribution of SaleCondition using seaborn
# The variable is passed by keyword: newer seaborn releases no longer accept it
# positionally as `x` (the first positional parameter is `data`).
sns.countplot(x=housing['SaleCondition'])
plt.show()

In [None]:
# Plot categorical variables using matplotlib + seaborn
fig, ax = plt.subplots(2, 4, figsize=(20, 10))
for variable, subplot in zip(categorical, ax.flatten()):
    # Pass the column by keyword: newer seaborn releases treat the first
    # positional argument as `data`, not `x`.
    sns.countplot(x=housing[variable], ax=subplot)
    # Rotate the category labels so long names do not overlap
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

# Since there are only 7 variables and 8 plots, we will remove this plot's axis
# You can remove this line to see how the figure would look otherwise
ax[-1,-1].axis('off')

fig.tight_layout()
plt.show()

In [None]:
# Write function that returns list of categories with at least 30 observations
def above30(series, threshold=30):
    """Return the categories of `series` that occur at least `threshold` times.

    Parameters
    ----------
    series : pandas.Series
        Values to count; missing values are not counted (value_counts default).
    threshold : int, optional
        Minimum number of observations a category needs in order to be kept.
        The comparison is inclusive, so a category with exactly `threshold`
        observations is returned. Defaults to 30.

    Returns
    -------
    list
        The qualifying categories, ordered from most to least frequent.
    """
    counts = series.value_counts()
    return list(counts[counts >= threshold].index)

In [None]:
# Apply function to each categorical variable.
# The mapping is built explicitly instead of via DataFrame.apply, whose result
# type changes (DataFrame vs. Series of lists) depending on whether the returned
# lists happen to have equal lengths.
categories_to_keep = pd.Series(
    {variable: above30(housing[variable]) for variable in categorical}
)
print(categories_to_keep)

# Keep only the rows whose value is a retained category (at least 30 observations
# in the unfiltered data) for every categorical variable.
keep_rows = pd.Series(True, index=housing.index)
for variable in categorical:
    keep_rows &= housing[variable].isin(categories_to_keep[variable])
housing = housing.loc[keep_rows]

In [None]:
# Print shape of new DataFrame as (rows, columns) to see how many
# observations remain after the filtering step above
housing.shape

In [None]:
# Plot categorical variables using matplotlib + seaborn
fig, ax = plt.subplots(2, 4, figsize=(20, 10))
for variable, subplot in zip(categorical, ax.flatten()):
    # Pass the column by keyword: newer seaborn releases treat the first
    # positional argument as `data`, not `x`.
    sns.countplot(x=housing[variable], ax=subplot)
    # Rotate the category labels so long names do not overlap
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

# Since there are only 7 variables and 8 plots, we will remove this plot's axis
# You can remove this line to see how the figure would look otherwise
ax[-1,-1].axis('off')

fig.tight_layout()
plt.show()

# Relationships between variables
## Scatter plots

In [None]:
# Scatter plot of first-floor area against sale price, drawn with matplotlib only
first_floor_column, price_column = '1stFlrSF', 'SalePrice'
plt.scatter(housing[first_floor_column], housing[price_column])
plt.xlabel(first_floor_column)
plt.ylabel(price_column)
plt.show()

In [None]:
# Same scatter plot drawn with seaborn, referring to the columns by name
sns.scatterplot(data=housing, x='1stFlrSF', y='SalePrice')
plt.show()

In [None]:
# Scatter plot of 1stFlrSF vs SalePrice with each variable's distribution on the margins
sns.jointplot(data=housing, x='1stFlrSF', y='SalePrice')
plt.show()

In [None]:
# Pairwise scatter plots for the first four numerical variables
# (SalePrice, LotArea, OverallQual, and OverallCond)
sns.pairplot(housing.loc[:, numerical[:4]])
plt.show()

In [None]:
# Pairwise scatter plots for SalePrice together with the remaining numerical
# variables (1stFlrSF, 2ndFlrSF, BedroomAbvGr, and Age)
sns.pairplot(housing[['SalePrice', *numerical[4:]]])
plt.show()

## Box plots

In [None]:
# Box plots of SalePrice against each categorical variable on a 3x3 grid
fig, ax = plt.subplots(3, 3, figsize=(15, 10))
panels = ax.flatten()
for var, subplot in zip(categorical, panels):
    sns.boxplot(data=housing, x=var, y='SalePrice', ax=subplot)
    # Rotate the category labels so they stay readable
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

# Only 7 of the 9 panels are used; hide the axes of the remaining ones
for unused_panel in panels[len(categorical):]:
    unused_panel.axis('off')

fig.tight_layout()
plt.show()

In [None]:
# Median SalePrice per neighborhood, ordered from the cheapest to the most expensive
median_price_by_nb = housing.groupby('Neighborhood')['SalePrice'].median()
sorted_nb = median_price_by_nb.sort_values(ascending=True)
print(sorted_nb)

In [None]:
# Box plot of SalePrice per Neighborhood, with the boxes ordered by median price
sns.boxplot(
    data=housing,
    x='Neighborhood',
    y='SalePrice',
    order=sorted_nb.index.tolist(),
)
plt.xticks(rotation=90)
plt.show()

## Conditional plots

In [None]:
# Reset seaborn defaults: plain white style, smaller font, larger figure size
sns.set(
    style='white',
    palette="deep",
    font_scale=0.9,
    rc={"figure.figsize": [20, 10]},
)

# One OverallQual-vs-SalePrice scatter plot per Neighborhood, four panels per row
cond_plot = sns.FacetGrid(housing, col='Neighborhood', col_wrap=4)
cond_plot.map(sns.scatterplot, 'OverallQual', 'SalePrice')
plt.show()

In [None]:
# Age-vs-SalePrice scatter plots on a grid: one column per YrSold, one row per
# SaleCondition, with points coloured by CentralAir
cond_plot = sns.FacetGrid(
    housing,
    row='SaleCondition',
    col='YrSold',
    hue='CentralAir',
)
cond_plot.map(sns.scatterplot, 'Age', 'SalePrice')
cond_plot.add_legend()
plt.tight_layout()
plt.show()

## TASK:

In [None]:
# TASK --- Create a complex conditional plot
