# 03 - Data Cleaning

## Objectives

Generate a pipeline that performs the data cleaning
* Cleaning: Handle Missing Data
* Cleaning: Encode categorical variables

## Inputs

outputs/datasets/collection/house_prices_records.csv

## Outputs



## Additional Notes
### Data Cleaning - Imputation
##### Target: Missing Values
##### Used Strategies:
- Drop
- Median

# Change working directory

In [None]:
# Capture the directory the notebook is currently running from.
import os
current_dir = os.getcwd()
# Trailing expression: displayed as the cell output.
current_dir

In [None]:
# Switch the working directory to the parent of current_dir.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

In [None]:
# Re-read the working directory so the change can be confirmed in the output.
current_dir = os.getcwd()
current_dir

# Load Data

In [None]:
import pandas as pd
# Relative path: resolved against the current working directory.
dataset_raw_path = "outputs/datasets/collection/house_prices_records.csv"
dataset = pd.read_csv(dataset_raw_path)
# Preview the first three rows.
dataset.head(3)

# Data Cleaning - Encode Categorical Features

- Mapping specific categories of categorical variables to numerical values

In [None]:
# Per-feature mapping from category label to integer code.
dict_for_encoding = {
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0},
    'KitchenQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
}

# Encode on a copy so the raw dataset stays untouched.
dataset_2 = dataset.copy()
for col in dataset.columns[dataset.dtypes == 'object'].to_list():
    # A text column without a mapping is left as-is instead of raising KeyError.
    if col not in dict_for_encoding:
        continue
    dataset_2[col] = dataset_2[col].replace(dict_for_encoding[col])
dataset_2.head()

## Create a custom transformer for encoding 

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create the Class
class MyCustomEncoder(BaseEstimator, TransformerMixin):
    """Replace category labels with numeric codes in selected columns.

    Parameters
    ----------
    variables : str or list of str
        Column name(s) to encode. A value that is not a list is wrapped
        in a one-element list.
    dict_for_encoding : dict
        Maps each column name to a ``{category: code}`` dictionary.
    """

    def __init__(self, variables, dict_for_encoding):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.dict_for_encoding = dict_for_encoding

    def fit(self, X, y=None):
        """Return the encoder unchanged; nothing is computed from ``X``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns encoded.

        Columns whose dtype is not ``object`` are left unchanged and a
        warning is printed for each of them.

        Raises
        ------
        KeyError
            If a configured column is missing from ``X`` or has no entry
            in ``dict_for_encoding``.
        """
        # Work on a copy so the caller's DataFrame is never modified in place.
        X = X.copy()
        for col in self.variables:
            if X[col].dtype == 'object':
                # Use the mapping handed to this instance, not a module-level global.
                X[col] = X[col].replace(self.dict_for_encoding[col])
            else:
                print(f"Warning: {col} the Data Type has to be an object")

        return X


from sklearn.pipeline import Pipeline
# Single-step pipeline that only runs the custom encoder.
pipeline = Pipeline([
    (
        'custom_encoder',
        MyCustomEncoder(
            variables=['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual'],
            dict_for_encoding=dict_for_encoding,
        ),
    ),
])

# Fit and transform a copy of the raw dataset, then preview three rows.
dataset_2 = pipeline.fit_transform(dataset.copy())
dataset_2.head(3)

# Data Cleaning - Missing Data

### Custom function to display missing data levels in a DataFrame. It shows:
- Absolute levels
- Relative levels
- Data types

In [None]:
def EvaluateMissingData(dataset):
    """Summarise the columns of ``dataset`` that contain missing values.

    Returns a DataFrame indexed by column name with three columns: the
    number of missing entries, their share of all rows in percent (rounded
    to two decimals) and the column dtype. Only columns whose rounded
    percentage is above zero are kept, ordered by descending percentage.
    """
    n_missing = dataset.isnull().sum()
    pct_missing = (n_missing / len(dataset) * 100).round(2)

    summary = pd.DataFrame({
        "RowsWithMissingData": n_missing,
        "PercentageOfDataset": pct_missing,
        "DataType": dataset.dtypes,
    })
    summary = summary.sort_values(by=['PercentageOfDataset'], ascending=False)

    # Keep only the columns that actually have gaps.
    return summary[summary["PercentageOfDataset"] > 0]

Check missing data levels for the collected dataset.

In [None]:
# Missing-data summary for the raw dataset.
EvaluateMissingData(dataset)

### Check Features with Missing Values

In [None]:
# Names of all columns that hold at least one missing value.
vars_with_missing_data = dataset.columns[dataset.isna().any()].to_list()
vars_with_missing_data

## Imputations for Missing Values

#### Imputation - Drop Two Variables with approx. 90% missing values

In [None]:
# Features selected for removal.
variables_method = ['WoodDeckSF', 'EnclosedPorch' ]

# Report how many features will be dropped and which ones.
print(f"* {len(variables_method)} features to drop \n\n"
    f"{variables_method}")

In [None]:
from feature_engine.selection import DropFeatures

# Remove the two features from the raw dataset.
drop_features = DropFeatures(features_to_drop=['EnclosedPorch', 'WoodDeckSF'])

dataset_for_drop = drop_features.fit_transform(dataset)
dataset_for_drop.info()

# Confirm that neither feature is left among the columns.
if {'EnclosedPorch', 'WoodDeckSF'}.isdisjoint(dataset_for_drop.columns):
    print("Variables 'WoodDeckSF' and 'EnclosedPorch' have been successfully dropped.")
else:
    print("Variables could not be dropped.")

In [None]:
# Re-run the missing-data summary on the dataset without the dropped features.
EvaluateMissingData(dataset_for_drop)

In [None]:
# Display the list as computed from the raw dataset (not from dataset_for_drop).
vars_with_missing_data

### Remove the two dropped features from vars_with_missing_data and run the full cleaning pipeline (drop, encode, median imputation)

In [None]:
from feature_engine.imputation import MeanMedianImputer
# Variables handed to the median imputer.
vars_with_missing_data = [
    '2ndFlrSF',
    'BedroomAbvGr',
    'BsmtFinType1',
    'GarageFinish',
    'GarageYrBlt',
    'LotFrontage',
    'MasVnrArea',
]

# Cleaning pipeline, in order: drop features, encode categories, impute medians.
pipeline = Pipeline([
    (
        'drop_features',
        DropFeatures(features_to_drop=['EnclosedPorch', 'WoodDeckSF']),
    ),
    (
        'custom_encoder',
        MyCustomEncoder(
            variables=['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual'],
            dict_for_encoding=dict_for_encoding,
        ),
    ),
    (
        'median_imputer',
        MeanMedianImputer(imputation_method='median', variables=vars_with_missing_data),
    ),
])

# Run the pipeline on a copy of the raw dataset and preview five rows.
dataset_2 = dataset.copy()
dataset_transformed = pipeline.fit_transform(dataset_2)
dataset_transformed.head(5)

## Double-Check Missing Data

In [None]:
# Columns of the transformed dataset that still hold at least one missing value.
vars_with_missing_data = dataset_transformed.columns[dataset_transformed.isna().any()].to_list()
vars_with_missing_data

## Confirm Pipeline : Median Imputer

In [None]:
# Inspect the fitted imputer step; imputer_dict_ presumably holds the value
# used per variable (confirm against the feature_engine docs).
pipeline['median_imputer'].imputer_dict_

# Correlation and PPS Analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ppscore as pps


def heatmap_corr(dataset, threshold, figsize=(20, 12), font_annot=8):
    """Plot a correlation matrix as an annotated heatmap.

    Cells on or above the diagonal and cells whose absolute value is below
    ``threshold`` are masked out. Nothing is drawn when the matrix has
    fewer than two columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Square matrix of correlation values.
    threshold : float
        Minimum absolute value a cell needs in order to be displayed.
    figsize : tuple, default (20, 12)
        Figure size passed to ``plt.subplots``.
    font_annot : int, default 8
        Font size of the cell annotations.
    """
    if len(dataset.columns) > 1:
        # Builtin bool replaces the np.bool alias, which NumPy removed in 1.24.
        mask = np.zeros_like(dataset, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        mask[abs(dataset) < threshold] = True

        _, axes = plt.subplots(figsize=figsize)
        sns.heatmap(dataset, annot=True, xticklabels=True, yticklabels=True,
                    mask=mask, cmap='viridis', annot_kws={"size": font_annot}, ax=axes,
                    linewidth=0.5
                    )
        axes.set_yticklabels(dataset.columns, rotation=0)
        # Explicit y-limits; presumably a guard against clipped first/last rows (confirm).
        plt.ylim(len(dataset.columns), 0)
        plt.show()


def heatmap_pps(dataset, threshold, figsize=(20, 12), font_annot=8):
    """Plot a PPS matrix as an annotated heatmap.

    Cells whose absolute value is below ``threshold`` are masked out.
    Nothing is drawn when the matrix has fewer than two columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Matrix of predictive power scores.
    threshold : float
        Minimum absolute value a cell needs in order to be displayed.
    figsize : tuple, default (20, 12)
        Figure size passed to ``plt.subplots``.
    font_annot : int, default 8
        Font size of the cell annotations.
    """
    if len(dataset.columns) > 1:
        # Builtin bool replaces the np.bool alias, which NumPy removed in 1.24.
        mask = np.zeros_like(dataset, dtype=bool)
        mask[abs(dataset) < threshold] = True

        _, ax = plt.subplots(figsize=figsize)
        # Draw on the axes created above (made explicit via ax=).
        sns.heatmap(dataset, annot=True, xticklabels=True, yticklabels=True,
                    mask=mask, cmap='rocket_r', annot_kws={"size": font_annot},
                    linewidth=0.05, linecolor='grey', ax=ax)
        # Explicit y-limits; presumably a guard against clipped first/last rows (confirm).
        plt.ylim(len(dataset.columns), 0)
        plt.show()


def CalculateCorrAndPPS(dataset):
    """Compute Pearson and Spearman correlations and the PPS matrix.

    Correlations are computed on the numeric and boolean columns only; the
    PPS matrix is computed on the full ``dataset``. Summary statistics of
    all PPS scores below 1 are printed to help pick a heatmap threshold.

    Returns
    -------
    tuple
        ``(dataset_corr_pearson, dataset_corr_spearman, pps_matrix)``, where
        ``pps_matrix`` is pivoted with ``x`` as columns and ``y`` as index.
    """
    # pandas >= 2.0 raises on non-numeric columns instead of silently skipping
    # them, so the numeric subset is selected explicitly.
    numeric_dataset = dataset.select_dtypes(include=["number", "bool"])
    dataset_corr_spearman = numeric_dataset.corr(method="spearman")
    dataset_corr_pearson = numeric_dataset.corr(method="pearson")

    pps_matrix_raw = pps.matrix(dataset)
    pps_matrix = pps_matrix_raw.filter(['x', 'y', 'ppscore']).pivot(columns='x', index='y', values='ppscore')

    # Scores equal to 1 are excluded from the statistics.
    pps_score_stats = pps_matrix_raw.query("ppscore < 1").filter(['ppscore']).describe().T
    print("PPS threshold - check PPS score IQR to decide threshold for heatmap \n")
    print(pps_score_stats.round(3))

    return dataset_corr_pearson, dataset_corr_spearman, pps_matrix


def DisplayCorrAndPPS(dataset_corr_pearson, dataset_corr_spearman, pps_matrix, CorrThreshold, PPS_Threshold,
                      figsize=(20, 12), font_annot=8):
    """Print explanatory text and draw the three heatmaps in sequence.

    Order of output: Spearman correlation, Pearson correlation, then PPS.
    ``CorrThreshold`` is forwarded to both correlation heatmaps and
    ``PPS_Threshold`` to the PPS heatmap; ``figsize`` and ``font_annot``
    are forwarded to all three.
    """
    plot_options = {"figsize": figsize, "font_annot": font_annot}

    print("\n")
    print("* Analyse how the target variable for your ML models are correlated with other variables (features and target)")
    print("* Analyse multi-colinearity, that is, how the features are correlated among themselves")

    # (title, description, matrix) for each correlation heatmap, in display order.
    correlation_sections = (
        ("*** Heatmap: Spearman Correlation ***",
         "It evaluates monotonic relationship \n",
         dataset_corr_spearman),
        ("*** Heatmap: Pearson Correlation ***",
         "It evaluates the linear relationship between two continuous variables \n",
         dataset_corr_pearson),
    )
    for title, description, corr_matrix in correlation_sections:
        print("\n")
        print(title)
        print(description)
        heatmap_corr(dataset=corr_matrix, threshold=CorrThreshold, **plot_options)

    print("\n")
    print("*** Heatmap: Power Predictive Score (PPS) ***")
    print("PPS detects linear or non-linear relationships between two columns.\n"
          "The score ranges from 0 (no predictive power) to 1 (perfect predictive power) \n")
    heatmap_pps(dataset=pps_matrix, threshold=PPS_Threshold, **plot_options)

#### Calculate Correlations and Power Predictive Score

In [None]:
# NOTE(review): this runs on the raw `dataset`, not on `dataset_transformed` — confirm that is intended.
dataset_corr_pearson, dataset_corr_spearman, pps_matrix = CalculateCorrAndPPS(dataset)

Display the heatmaps

In [None]:
# Draw the heatmaps with a correlation threshold of 0.4 and a PPS threshold of 0.2.
DisplayCorrAndPPS(dataset_corr_pearson = dataset_corr_pearson,
                  dataset_corr_spearman = dataset_corr_spearman, 
                  pps_matrix = pps_matrix,
                  CorrThreshold = 0.4, PPS_Threshold =0.2,
                  figsize=(12,10), font_annot=10)