In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy import linalg
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, chi2
from sklearn.covariance import MinCovDet
from pandas.api.types import is_numeric_dtype
import utilities as utils

In [None]:
# Notebook-wide display settings: seaborn "whitegrid" theme, a 12x8 default
# figure size, and a filter that silences every warning raised from here on.
sns.set_theme(style="whitegrid")
plt.rc("figure", figsize=(12, 8))
warnings.filterwarnings(action='ignore')

In [None]:
# Input CSV, name of the response column, and the directory passed to the
# plotting helpers as their output location.
trainPath, target, outdir = 'train.csv', 'SalePrice', 'plots'

In [None]:
# Load the training table; the 'Id' column becomes the row index.
X = pd.read_csv(
    filepath_or_buffer=trainPath,
    index_col='Id',
)

In [None]:
# Hand the full table and the target column name to the project plotting
# helper, with `outdir` as its output location.
# NOTE(review): what gets drawn and written is decided inside utilities.py --
# presumably one plot per feature against the target; confirm there.
utils.plotAllFeatures(X, target, outdir=outdir)

### Correlation analysis
- Correlate each numerical feature with the target (`SalePrice`).
- Save p-value, N and correlation coefficient for later.

In [None]:
# List strongly correlated feature pairs: rows where neither side is the
# target, the two features differ, and |R| exceeds 0.6.
# `correlations` is built here because the cell that otherwise defines it comes
# after this one, so a fresh "Restart & Run All" would stop with a NameError.
# Once the cells are reordered, one of the two computeCorrelation calls can go.
correlations = utils.computeCorrelation(X)
strongPairs = correlations.loc[
    (correlations['feature1'] != target)
    & (correlations['feature2'] != target)
    & (correlations['feature1'] != correlations['feature2'])
    # abs() has to wrap R itself; taking abs() of the comparison result would
    # silently drop every strong negative correlation.
    & (correlations['R'].abs() > 0.6)]
print(strongPairs)

In [None]:
# Build the correlation table for X, then pass it to the pairwise plotting
# helper together with a PNG path under `outdir`.
correlations = utils.computeCorrelation(X)
pairwisePlotPath = f'{outdir}/pairwiseCorrelations.png'
utils.plotPairwiseCorrelation(correlations, out=pairwisePlotPath)

In [None]:
# Call the target-correlation plotting helper with a PNG path under `outdir`,
# keeping the figure and the two axes it returns.
targetPlotPath = f'{outdir}/{target}-correlations.png'
fig, (ax1, ax2) = utils.plotTargetCorrelation(
    correlations, target, out=targetPlotPath)

### Missing data assessment
- Compute NA frequency for each feature

In [None]:
# Per-column count of missing values, largest first, restricted to columns
# that have at least one, plus that count as a fraction of the row total.
naCounts = X.isnull().sum().sort_values(ascending=False)
nullOrd = (
    naCounts[naCounts > 0]
    .to_frame('TotalNA')
    .assign(PropNA=lambda frame: frame['TotalNA'] / len(X))
)
nullOrd

In [None]:
# Heatmap of missingness: one row per column listed in `nullOrd`, one heatmap
# column per row of X; x tick labels and the colour bar are switched off.
# The new figure is bound to `fig` so that tight_layout() acts on this heatmap
# rather than on a figure left over from an earlier cell.
fig, ax = plt.subplots()
naMask = X[nullOrd.index].isnull().T
sns.heatmap(naMask, yticklabels=1, cmap='viridis', cbar=False,
            xticklabels=False, ax=ax)
ax.set_xlabel('')
ax.tick_params(left=True)
fig.tight_layout()

## Missing values

### High proportion NA features
- Remove all features with > 15% missing values.
- No strong relationship with SalePrice anyway.

In [None]:
# Processing
# Replacement used for missing entries in each listed column: an explicit text
# label or the number 0. Columns are processed in the order written here.
naFillValues = {
    'Alley': 'No alley access',
    'BsmtQual': 'No Basement',
    'BsmtCond': 'No Basement',
    'BsmtExposure': 'No Basement',
    'BsmtFinType1': 'No Basement',
    'BsmtFinType2': 'No Basement',
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'BsmtUnfSF': 0,
    'Fence': 'No Fence',
    'FireplaceQu': 'No Fireplace',
    'Fireplaces': 0,
    'MiscFeature': 'No misc feature',
    'MiscVal': 0,
    'Functional': 'Typ',
    'GarageType': 'No garage',
    'GarageFinish': 'No garage',
    'GarageQual': 'No garage',
    'GarageCond': 'No garage',
    'GarageArea': 0,
    'GarageCars': 0,
    'PoolQC': 'No pool',
    'PoolArea': 0,
}
for column, fillValue in naFillValues.items():
    X[column] = X[column].fillna(fillValue)

### Save processed
- Re-merge the features and the target.
- Make sure to filter the target by the indexes remaining in the features, in case rows have been filtered out.

In [None]:
# Write the current state of X to disk as CSV.
X.to_csv(path_or_buf='trainFilt.csv')

## Inspecting categoricals

In [None]:
# Summarise every non-numeric column: describe() statistics ordered by the
# 'unique' row (descending), joined with the smallest value count per column.
catGroups = X.select_dtypes(exclude=['number'])
catDescribe = (
    catGroups.describe()
    .sort_values(by=['unique'], ascending=False, axis=1)
    .T
)
catSmallest = (
    catGroups.melt()
    .groupby('variable')
    .apply(lambda group: group.value_counts().min())
    .rename('smallestGroup')
)
catDescribe = catDescribe.merge(catSmallest, left_index=True, right_index=True)
catDescribe

In [None]:
# Replace Neighborhood values that appear in the index of the `countsSum`
# entries below 0.25 with the single label 'other'.
# NOTE(review): `countsSum` is not defined in any cell shown above -- confirm
# the cell that builds it exists and runs before this one.
rareNeighborhoods = countsSum[countsSum < 0.25].index
X['Neighborhood'] = X['Neighborhood'].replace(rareNeighborhoods, 'other')