# **Data Cleaning Notebook**

## Objectives

- Assess and quantify missing data.
- Perform data cleaning to prepare for analysis and modeling.

## Inputs

- **Dataset**: outputs/datasets/collection/house_prices_records.csv

## Outputs

- Cleaned datasets ready for training and testing models.


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Capture the directory the notebook is currently running from
current_dir = os.getcwd()
# Bare expression so the notebook displays the path
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of the previously captured `current_dir` the working directory
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory to confirm the change took effect
current_dir = os.getcwd()
# Bare expression so the notebook displays the path
current_dir

# Load Data

In [None]:
import pandas as pd

# Load the house price records CSV and preview the first rows
df = pd.read_csv("outputs/datasets/collection/house_prices_records.csv")
df.head()

---

# Data Exploration

## Profile Report

We will explore the dataset for missing values. The code below will analyze the distribution and shape of a variable with missing data.

In [None]:
# Names of every column that contains at least one missing value
vars_with_missing_data = df.columns[df.isna().any()].tolist()
vars_with_missing_data

Now that we've isolated the variables with missing data, let's generate a profile report to further investigate the characteristics of this missing data.

In [None]:
from pandas_profiling import ProfileReport

# Only build the (minimal) profile report when at least one column has missing data
if not vars_with_missing_data:
    print("There are no variables with missing data")
else:
    profile = ProfileReport(df=df[vars_with_missing_data], minimal=True)
    profile.to_notebook_iframe()

## Correlation and PPS Analysis

We will conduct another round of correlation and Predictive Power Score (PPS) analysis to further explore and understand the relationships between the variables.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ppscore as pps

def heatmap_corr(df, threshold, figsize=(20, 12), font_annot=8):
    """Plot the lower triangle of a correlation matrix as a heatmap.

    Cells on or above the diagonal, and cells whose absolute value is below
    ``threshold``, are masked out. Nothing is drawn (and ``None`` is
    returned) when ``df`` has fewer than two columns.

    Args:
        df: 2-D matrix of scores as a DataFrame; callers in this notebook
            pass the output of ``DataFrame.corr``.
        threshold: Minimum absolute value a cell needs in order to be shown.
        figsize: Figure size forwarded to ``plt.subplots``.
        font_annot: Font size used for the cell annotations.
    """
    if len(df.columns) > 1:
        # The `np.bool` alias was removed from NumPy; the builtin `bool`
        # is the supported dtype spelling and works on every version.
        mask = np.zeros_like(df, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        mask[abs(df) < threshold] = True

        fig, axes = plt.subplots(figsize=figsize)
        sns.heatmap(df, annot=True, xticklabels=True, yticklabels=True,
                    mask=mask, cmap="viridis", annot_kws={"size": font_annot}, ax=axes,
                    linewidth=0.5
                    )
        axes.set_yticklabels(df.columns, rotation=0)
        # Put the first row at the top of the plot
        plt.ylim(len(df.columns), 0)
        plt.show()


def heatmap_pps(df, threshold, figsize=(20, 12), font_annot=8):
    """Plot a Predictive Power Score matrix as a heatmap.

    Cells whose absolute value is below ``threshold`` are masked out. The
    full matrix is considered (no triangle is hidden). Nothing is drawn
    (and ``None`` is returned) when ``df`` has fewer than two columns.

    Args:
        df: 2-D matrix of scores as a DataFrame.
        threshold: Minimum absolute value a cell needs in order to be shown.
        figsize: Figure size forwarded to ``plt.subplots``.
        font_annot: Font size used for the cell annotations.
    """
    if len(df.columns) > 1:
        # The `np.bool` alias was removed from NumPy; the builtin `bool`
        # is the supported dtype spelling and works on every version.
        mask = np.zeros_like(df, dtype=bool)
        mask[abs(df) < threshold] = True

        fig, ax = plt.subplots(figsize=figsize)
        ax = sns.heatmap(df, annot=True, xticklabels=True, yticklabels=True,
                         mask=mask, cmap="rocket_r", annot_kws={"size": font_annot},
                         linewidth=0.05, linecolor="grey")

        # Put the first row at the top of the plot
        plt.ylim(len(df.columns), 0)
        plt.show()



def CalculateCorrAndPPS(df):
    """Compute Pearson/Spearman correlations and the PPS matrix for ``df``.

    Also prints summary statistics of the PPS scores below 1, as an aid
    for choosing the heatmap threshold.

    Args:
        df: DataFrame to analyse. It may contain non-numeric columns; those
            are used for the PPS matrix but left out of the correlations.

    Returns:
        Tuple ``(df_corr_pearson, df_corr_spearman, pps_matrix)`` where
        ``pps_matrix`` is pivoted to have ``y`` as index and ``x`` as columns.
    """
    # Correlation is only defined for numeric data. Selecting the columns
    # explicitly keeps this working on recent pandas, where DataFrame.corr
    # raises on string columns instead of silently dropping them.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    df_corr_spearman = numeric_df.corr(method="spearman")
    df_corr_pearson = numeric_df.corr(method="pearson")

    pps_matrix_raw = pps.matrix(df)
    pps_matrix = pps_matrix_raw.filter(["x", "y", "ppscore"]).pivot(columns="x", index="y", values="ppscore")

    # Scores equal to 1 are left out of the summary statistics
    pps_score_stats = pps_matrix_raw.query("ppscore < 1").filter(["ppscore"]).describe().T
    print("PPS threshold - check PPS score IQR to decide the threshold for the heatmap \n")
    print(pps_score_stats.round(3))

    return df_corr_pearson, df_corr_spearman, pps_matrix


def DisplayCorrAndPPS(df_corr_pearson, df_corr_spearman, pps_matrix, CorrThreshold, PPS_Threshold,
                      figsize=(20, 12), font_annot=8):
    """Print guidance text and draw the Spearman, Pearson and PPS heatmaps.

    Args:
        df_corr_pearson: Pearson correlation matrix.
        df_corr_spearman: Spearman correlation matrix.
        pps_matrix: Predictive Power Score matrix.
        CorrThreshold: Threshold passed to ``heatmap_corr`` for both
            correlation heatmaps.
        PPS_Threshold: Threshold passed to ``heatmap_pps``.
        figsize: Figure size used for every heatmap.
        font_annot: Annotation font size used for every heatmap.
    """
    print("\n")
    print("* Analyze how the target variable for your ML models are correlated with other variables (features and target)")
    print("* Analyze multi colinearity, that is, how the features are correlated among themselves")

    # Both correlation heatmaps share the same layout; only the heading,
    # the explanation and the matrix differ. Spearman is shown first.
    correlation_sections = (
        ("*** Heatmap: Spearman Correlation ***",
         "It evaluates monotonic relationship \n",
         df_corr_spearman),
        ("*** Heatmap: Pearson Correlation ***",
         "It evaluates the linear relationship between two continuous variables \n",
         df_corr_pearson),
    )
    for heading, explanation, corr_matrix in correlation_sections:
        print("\n")
        print(heading)
        print(explanation)
        heatmap_corr(df=corr_matrix, threshold=CorrThreshold, figsize=figsize, font_annot=font_annot)

    print("\n")
    print("*** Heatmap: Predictive power Score (PPS) ***")
    print("PPS detects linear or non-linear relationships between two columns.\n"
          "The score ranges from 0 (no predictive power) to 1 (perfect predictive power) \n")
    heatmap_pps(df=pps_matrix, threshold=PPS_Threshold, figsize=figsize, font_annot=font_annot)

In [None]:
# Compute both correlation matrices and the PPS matrix once, so the display cell can reuse them
df_corr_pearson, df_corr_spearman, pps_matrix = CalculateCorrAndPPS(df)

In [None]:
# Draw the three heatmaps, hiding cells below the chosen thresholds
DisplayCorrAndPPS(
    df_corr_pearson=df_corr_pearson,
    df_corr_spearman=df_corr_spearman,
    pps_matrix=pps_matrix,
    CorrThreshold=0.4,
    PPS_Threshold=0.15,
    figsize=(10, 10),
    font_annot=8,
)

## Missing Values Analysis

We will conduct a concise evaluation of missing data, providing a shorter overview than the extensive report generated by pandas profiling.

In [None]:
def EvaluateMissingData(df):
    """Summarise the columns of ``df`` that contain missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with the number of missing rows,
        the percentage of the dataset they represent (two decimals) and the
        column dtype, ordered by percentage descending. When no column has
        missing data a message is printed and ``None`` is returned.
    """
    null_counts = df.isnull().sum()
    null_percentages = round(null_counts / len(df) * 100, 2)

    summary = pd.DataFrame(
        data={
            "RowsWithMissingData": null_counts,
            "PercentageOfDataset": null_percentages,
            "DataType": df.dtypes,
        }
    )
    # Order by share of missing data, then keep only the affected columns
    summary = summary.sort_values(by=["PercentageOfDataset"], ascending=False)
    summary = summary.query("PercentageOfDataset > 0")

    if not summary.empty:
        return summary

    print("There are no variables with missing data")
    return None

In [None]:
# Summarise missing data in the dataset as loaded, before any cleaning
EvaluateMissingData(df)

Here's a concise summary of the strategies for handling missing data for each variable in order of appearance:

- **EnclosedPorch** - (90.68% missing data): <br>
This variable will be dropped due to a high percentage of missing data.

- **WoodDeckSF** - (89.38% missing data): <br>
Similarly, this variable will be dropped due to the extensive missing data.

- **LotFrontage** - (17.74% missing data):
    - Exhibits a moderate correlation with Sale Price but lacks predictive power.
    - Imputation with the median (69) is considered, as 0 is not a logical fill for property street frontage. The mean is 70, showing a right skew.

- **GarageFinish** - (11.10% missing data):
    - This categorical variable includes values: Unfinished (unf), Rough Finished (rfn), Finished (fin), and None.
    - Missing values will be checked against garage area; "None" will be imputed if no garage area is present, otherwise most likely "Unfinished".

- **BsmtFinType1** - (7.81% missing data):
    - Similar to the approach for GarageFinish, inspect properties with missing values to determine if there is any basement area, and assign "None" if applicable.
    - Additional analysis will be necessary to address this variable further.

- **BedroomAbvGr** - (6.78% missing data):
    - Only 6 entries, representing 0.4% of the dataset, have a value of 0 for bedrooms above grade. This suggests that the missing data is unlikely to be attributable to the absence of bedrooms above grade.
    - Median and mean values are 3 and 2.9 respectively.

- **2ndFlrSF** - (5.89% missing data):
    - Zero can be assigned to properties with no second floor, as 53.5% of the dataset already has a value of 0 for this variable.

- **GarageYrBlt** - (5.55% missing data):
    - Highly correlated with the year built, suggesting it could often be the same as the construction year.
    - Missing values might be substituted with the year built where logical.
    - If both GarageFinish and GarageArea have a value of "None" and "0" respectively, it is likely that the property does not have a garage.

- **MasVnrArea** - (0.55% missing data):
    - The absence of values for MasVnrArea could indicate that there is no masonry veneer, especially since 59% of the values are 0, which is also the median value.

# Data Cleaning

## Create a Duplicate Dataset for Cleaning

First, we'll create a copy of the dataset and apply the cleaning procedures to this copy:

In [None]:
# Work on a copy so the original `df` stays available for the before/after comparison
df_cleaned = df.copy()
# Bare expression so the notebook displays the copied frame
df_cleaned

## Handling Missing Data

In this section, we will examine and address missing data by analyzing its relationship with other relevant variables and making appropriate adjustments.

### **EnclosedPorch** and **WoodDeckSF**

As mentioned in the initial analysis, a significant amount of data for these variables is missing (90.68% and 89.38% respectively). Therefore we will drop these variables.

In [None]:
# Drop the two columns that are missing in roughly 90% of rows; too little data remains to impute them
df_cleaned = df_cleaned.drop(columns=["EnclosedPorch", "WoodDeckSF"])

We verify that the specified variables have been successfully removed from the dataframe with the code below:

In [None]:
# Re-run the missing-data summary on the cleaned frame; the dropped columns should no longer be listed
EvaluateMissingData(df_cleaned)

As illustrated, the variables **EnclosedPorch** and **WoodDeckSF** no longer appear in the list of variables with missing data, confirming their successful removal from the dataset.

### **LotFrontage**

**LotFrontage** measures the linear feet of street frontage connected to a property, and logically, this value cannot be zero. A review of the dataset confirms there are no zero values.

The following code block is designed to identify and count the rows where **LotFrontage** is missing in the dataset:

In [None]:
# Rows where "LotFrontage" has no recorded value
missing_lotfrontage = df_cleaned.loc[df_cleaned["LotFrontage"].isna()]
print(f"Amount of rows with missing data: {missing_lotfrontage.shape[0]}")
missing_lotfrontage

The right-skewed distribution shown in the histogram from the Pandas Profile Report suggests that using the median for imputation is more appropriate than the mean.

In [None]:
# Calculate the median value of "LotFrontage" from the cleaned data
median_lot_frontage = df_cleaned["LotFrontage"].median()
print(f"Median value is: {median_lot_frontage}")

# Fill missing values in "LotFrontage" with the median value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the chained in-place form is
# deprecated in pandas and leaves the DataFrame unchanged under Copy-on-Write.
df_cleaned["LotFrontage"] = df_cleaned["LotFrontage"].fillna(median_lot_frontage)
print("Filled the missing data with the median value")

# Verify the changes by checking for any remaining null values in the "LotFrontage" column
print(f'Variables with missing "Lot Frontage" value: {df_cleaned["LotFrontage"].isnull().sum()}')

### **GarageFinish**

**GarageFinish** refers to the interior finish of the garage, categorized by values such as "Unf" (Unfinished), "RFn" (Rough Finished), "Fin" (Finished), and "None" (No Garage).

The following code block is designed to identify and count the rows where **GarageFinish** is missing in the dataset:

In [None]:
# Rows where "GarageFinish" has no recorded value, shown with the related garage columns
missing_garage_finish = df_cleaned.loc[df_cleaned["GarageFinish"].isna()]
print(f"Amount of rows with missing data: {missing_garage_finish.shape[0]}")
missing_garage_finish.loc[:, ["GarageFinish", "GarageYrBlt", "GarageArea"]]

We will initially inspect properties to identify those that lack a garage, indicated by a **GarageArea** of 0. <br>

In [None]:
# Rows with a missing "GarageFinish" and a "GarageArea" of 0 (no garage).
# Both conditions are taken from df_cleaned, the frame being filtered, rather
# than mixing in the original df.
df_no_garage_area = df_cleaned[(df_cleaned["GarageFinish"].isnull()) & (df_cleaned["GarageArea"] == 0)]
print(f"Amount of rows with missing data: {len(df_no_garage_area)}")
df_no_garage_area[["GarageFinish", "GarageYrBlt", "GarageArea"]]

For these properties, we will impute the **GarageFinish** attribute as "None".

In [None]:
# How many rows have no garage ("GarageArea" is 0) and no recorded "GarageFinish"?
initial_count = len(df_cleaned[df_cleaned["GarageArea"].eq(0) & df_cleaned["GarageFinish"].isna()])
print(f'Initial number of rows with "GarageFinish" missing and no "GarageArea" value: {initial_count}')

# Label exactly those rows with the explicit "None" category
df_cleaned.loc[df_cleaned["GarageArea"].eq(0) & df_cleaned["GarageFinish"].isna(), "GarageFinish"] = "None"

# Re-evaluate the same condition after the update to confirm nothing is left
remaining_count = len(df_cleaned[df_cleaned["GarageArea"].eq(0) & df_cleaned["GarageFinish"].isna()])
print(f'Remaining number of rows with "GarageFinish" missing and no "GarageArea" value after update: {remaining_count}')

For the remaining properties that have a specified **GarageArea** but missing **GarageFinish**, we will impute the value "Unf", which is the most frequently occurring category for this variable.

In [None]:
# Report how many "GarageFinish" values are still missing at this point
print(f'Initial number of rows with "GarageFinish" missing: {df_cleaned["GarageFinish"].isna().sum()}')

# Every remaining gap is filled with "Unf" (Unfinished)
df_cleaned["GarageFinish"] = df_cleaned["GarageFinish"].fillna("Unf")

# Confirm that no missing "GarageFinish" entries are left
print(f'Remaining missing "GarageFinish" entries: {df_cleaned["GarageFinish"].isna().sum()}')

### **BsmtFinType1**

**BsmtFinType1** refers to the quality of the basement finish, categorized by values such as "GLQ" (Good Living Quarters), "ALQ" (Average Living Quarters), "BLQ" (Below Average Living Quarters), "Rec" (Average Recreational Room), "LwQ" (Low Quality), "Unf" (Unfinished), and "None" (No Basement).


The following code block is designed to identify and count the rows where **BsmtFinType1** is missing in the dataset:

In [None]:
# Rows of the cleaned DataFrame where "BsmtFinType1" has no recorded value
missing_bsmtfin_type1 = df_cleaned.loc[df_cleaned["BsmtFinType1"].isna()]

# Report how many such rows exist
print(f"Amount of rows with missing data: {missing_bsmtfin_type1.shape[0]}")

# Show them alongside the basement area columns used to decide the imputation
missing_bsmtfin_type1.loc[:, ["BsmtFinType1", "TotalBsmtSF", "BsmtUnfSF", "BsmtFinSF1"]]

Evaluate the missing values for **BsmtFinType1** where **TotalBsmtSF** is zero. Properties with a basement square footage of zero indicate no basement present, which justifies setting **BsmtFinType1** to "None".

In [None]:
# Rows with a missing "BsmtFinType1" and a "TotalBsmtSF" of 0 (no basement).
# Both conditions are taken from df_cleaned, the frame being filtered, rather
# than from the original df.
df_basement_none = df_cleaned[(df_cleaned["BsmtFinType1"].isnull()) & (df_cleaned["TotalBsmtSF"] == 0)]

# Printing the number of rows
print(f"Amount of rows with missing data: {len(df_basement_none)}")
df_basement_none[["BsmtFinType1", "TotalBsmtSF", "BsmtUnfSF", "BsmtFinSF1", "BsmtExposure"]]

The rows above display zero values for finished, unfinished, and total basement area, and also lack **BsmtExposure**. This confirms the absence of a basement, allowing us to confidently impute "None" for **BsmtFinType1**.

Let's proceed to impute these values now:

In [None]:
# Where there is no basement ("TotalBsmtSF" is 0), record "BsmtFinType1" as "None"
df_cleaned.loc[df_cleaned["TotalBsmtSF"].eq(0) & df_cleaned["BsmtFinType1"].isna(), "BsmtFinType1"] = "None"

# Report how many rows still lack a "BsmtFinType1" value
print(f'Amount of rows left with missing "BsmtFinType1" data: {df_cleaned["BsmtFinType1"].isna().sum()}')

There are 108 rows remaining with missing **BsmtFinType1** data.

The values below show rows where **BsmtUnfSF** is 0 square feet (no unfinished area) while **TotalBsmtSF** is greater than 0, which suggests the basement is finished but no category has been assigned.

Because we cannot determine whether the finished area is a recreation room or living quarters, nor its quality, we will create a new category: "Fin" (Finished).

In [None]:
# Rows where "BsmtFinType1" is missing and "BsmtUnfSF" equals 0.
# Both conditions are taken from df_cleaned, the frame being filtered, rather
# than mixing in the original df.
df_basement_finished = df_cleaned[(df_cleaned["BsmtFinType1"].isnull()) & (df_cleaned["BsmtUnfSF"] == 0)]

# Printing the count of rows where basement finishing type is missing but there is no unfinished space
print(f"Amount of rows with missing data: {len(df_basement_finished)}")

# Displaying the selected columns for the filtered rows
df_basement_finished[["BsmtFinType1", "TotalBsmtSF", "BsmtUnfSF", "BsmtFinSF1"]]

Let's proceed to impute "Fin" value now to these rows:

In [None]:
# Update "BsmtFinType1" to "Fin" for rows where there is a value of "0" for "BsmtUnfSF".
# The row selector is built entirely from df_cleaned, the frame being
# modified, rather than mixing in the original df.
df_cleaned.loc[(df_cleaned["BsmtFinType1"].isnull()) & (df_cleaned["BsmtUnfSF"] == 0), "BsmtFinType1"] = "Fin"

# Display the remaining number of rows that still have missing "BsmtFinType1" data
print(f'Amount of rows left with missing "BsmtFinType1" data: {df_cleaned["BsmtFinType1"].isnull().sum()}')

There are still 102 rows missing data for **BsmtFinType1**.

In cases where the unfinished basement square footage (**BsmtUnfSF**) is greater than 0, we can reasonably impute these entries as 'Unfinished'.

In [None]:
# Select rows where "BsmtFinType1" is missing and "BsmtUnfSF" is greater than 0.
# Both conditions are taken from df_cleaned, the frame being filtered, rather
# than mixing in the original df.
df_basement_unfinished = df_cleaned[(df_cleaned["BsmtFinType1"].isnull()) & (df_cleaned["BsmtUnfSF"] > 0)]

# Print the number of rows found with these conditions
print(f"Amount of rows with missing data: {len(df_basement_unfinished)}")

# Display the relevant columns for these rows to review the data
df_basement_unfinished[["BsmtFinType1", "TotalBsmtSF", "BsmtUnfSF", "BsmtFinSF1"]]

The DataFrame above indicates that all remaining entries with missing data have a **BsmtUnfSF** (unfinished basement square footage) value greater than 0.

This suggests that these basements are indeed unfinished. We can confidently impute the category "Unf" (Unfinished) to all these instances.

In [None]:
# Update "BsmtFinType1" to "Unf" for rows where there is a value greater than "0" for "BsmtUnfSF".
# The row selector is built entirely from df_cleaned, the frame being
# modified, rather than mixing in the original df.
df_cleaned.loc[(df_cleaned["BsmtFinType1"].isnull()) & (df_cleaned["BsmtUnfSF"] > 0), "BsmtFinType1"] = "Unf"

# Print the count of remaining missing values in "BsmtFinType1" to verify all necessary changes were applied
print(f'Amount of rows left with missing "BsmtFinType1" data: {df_cleaned["BsmtFinType1"].isnull().sum()}')

We now perform a final check to ensure there are no remaining rows with missing data in the **BsmtFinType1** column.

In [None]:
# Final check: any rows of the cleaned DataFrame still lacking "BsmtFinType1"?
missing_bsmtfin_type1 = df_cleaned.loc[df_cleaned["BsmtFinType1"].isna()]

# Report the number of rows still affected and show them
print(f"Amount of rows with missing data: {missing_bsmtfin_type1.shape[0]}")
missing_bsmtfin_type1.loc[:, ["BsmtFinType1", "TotalBsmtSF", "BsmtUnfSF", "BsmtFinSF1"]]

### **BedroomAbvGr**

**BedroomAbvGr** refers to the number of bedrooms above the ground level. This numeric variable quantifies the number of bedrooms in a property, excluding any potential basement bedrooms.

The following code block is designed to identify and count the rows where **BedroomAbvGr** is missing in the dataset:

In [None]:
# Rows of the cleaned DataFrame where "BedroomAbvGr" has no recorded value
missing_bedroom_abv_gr = df_cleaned.loc[df_cleaned["BedroomAbvGr"].isna()]

# Report how many such rows exist and show them
print(f"Amount of rows with missing data: {missing_bedroom_abv_gr.shape[0]}")
missing_bedroom_abv_gr

As indicated by our initial analysis, only 0.4% of the values for **BedroomAbvGr** are recorded as 0, suggesting that instances of zero bedrooms are rare.

Given this distribution, it is reasonable to fill missing values with the median, which is 3, as this represents a typical configuration for properties in this dataset.

In [None]:
# Calculate the median value of "BedroomAbvGr" from the cleaned data
median_bedroom_abv_gr = df_cleaned["BedroomAbvGr"].median()

# Print the median value used for imputation
print(f"Median number of bedrooms above grade used for imputation: {median_bedroom_abv_gr}")

# Print the amount of missing data in "BedroomAbvGr" before imputation
print(f'Amount of rows with missing "BedroomAbvGr" data before imputation: {df_cleaned["BedroomAbvGr"].isnull().sum()}')

# Impute missing values with the calculated median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the chained in-place form is
# deprecated in pandas and leaves the DataFrame unchanged under Copy-on-Write.
df_cleaned["BedroomAbvGr"] = df_cleaned["BedroomAbvGr"].fillna(median_bedroom_abv_gr)

# Re-check and print the amount of missing data in "BedroomAbvGr" to ensure no missing values remain
missing_after_imputation = df_cleaned["BedroomAbvGr"].isnull().sum()
print(f'Amount of rows with missing "BedroomAbvGr" data after imputation: {missing_after_imputation}')


### **2ndFlrSF**

**2ndFlrSF** refers to the square footage of the second floor of the house. This variable quantifies the area of any living space above the first floor.

The following code block is designed to identify and count the rows where **2ndFlrSF** is missing in the dataset:

In [None]:
# Rows of the cleaned DataFrame where "2ndFlrSF" has no recorded value
missing_2nd_flr_sf = df_cleaned.loc[df_cleaned["2ndFlrSF"].isna()]

# Report how many such rows exist
print(f"Amount of rows with missing data: {missing_2nd_flr_sf.shape[0]}")

# Show the affected rows
missing_2nd_flr_sf

As indicated in our initial analysis, properties with no second floor can be assigned a value of zero for **2ndFlrSF**, as 53.5% of the dataset already has a value of zero for this variable.

Let's proceed to impute zero for the missing values in **2ndFlrSF**:

In [None]:
# Print the number of missing entries before imputation
initial_missing = df_cleaned["2ndFlrSF"].isnull().sum()
print(f'Amount of rows with missing "2ndFlrSF" data before imputation: {initial_missing}')

# Impute 0 for all missing values in "2ndFlrSF".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the chained in-place form is
# deprecated in pandas and leaves the DataFrame unchanged under Copy-on-Write.
df_cleaned["2ndFlrSF"] = df_cleaned["2ndFlrSF"].fillna(0)

# Check and print the amount of missing data in "2ndFlrSF" after imputation to ensure no missing values remain
print(f'Amount of rows with missing "2ndFlrSF" data after imputation: {df_cleaned["2ndFlrSF"].isnull().sum()}')

### **GarageYrBlt**

**GarageYrBlt** refers to the year the garage was built.

The following code block is designed to identify and count the rows where **GarageYrBlt** is missing in the dataset:

In [None]:
# Rows of the cleaned DataFrame where "GarageYrBlt" has no recorded value
missing_garage_yr_built = df_cleaned.loc[df_cleaned["GarageYrBlt"].isna()]

# Report how many such rows exist
print(f"Amount of rows with missing data: {missing_garage_yr_built.shape[0]}")

# Show them with the garage and construction-year columns used for comparison
missing_garage_yr_built.loc[:, ["GarageYrBlt", "GarageFinish", "GarageArea", "YearBuilt", "YearRemodAdd"]]

Next, we will check how many of the missing values in **GarageYrBlt** have a **GarageArea** equal to 0, which would indicate that the missing value is due to the absence of a garage:

In [None]:
# Rows where "GarageYrBlt" is missing and "GarageArea" is 0, i.e. there is no garage
df_garage_year_none = df_cleaned.loc[df_cleaned["GarageYrBlt"].isna() & df_cleaned["GarageArea"].eq(0)]

# Report how many rows satisfy both conditions
print(f"Amount of rows with missing data: {df_garage_year_none.shape[0]}")

# Show them with the garage and construction-year columns used for comparison
df_garage_year_none.loc[:, ["GarageYrBlt", "GarageFinish", "GarageArea", "YearBuilt", "YearRemodAdd"]]

All missing values in **GarageYrBlt** correspond to properties without a garage, as indicated by a zero in the GarageArea.

The **GarageYrBlt** variable has significant Predictive Power Scores (PPS) of 0.6 with **YearBuilt** and 0.4 with **YearRemodAdd**, strongly suggesting that garages are typically constructed at the same time as the main house or during major renovations. This substantial overlap implies that **GarageYrBlt** does not add unique value beyond what is already conveyed by **YearBuilt** or **YearRemodAdd**.

Considering this redundancy, although we could impute a value of 0 for properties without garages to indicate that no garage was built, this is not ideal because it would record the garage as having been built in year 0.
It is more prudent to remove **GarageYrBlt** from the dataset entirely.

In [None]:
# Remove "GarageYrBlt" from the cleaned dataframe
df_cleaned = df_cleaned.drop(columns=["GarageYrBlt"])

We verify that the specified variable has been successfully removed from the dataframe with the code below:

In [None]:
# List every remaining column name, one per line
for column_name in df_cleaned.columns:
    print(column_name)

# Summarise the columns that still contain missing data after the removal
EvaluateMissingData(df_cleaned)

As illustrated, the variable **GarageYrBlt** no longer appears in the list of variables and in the list of variables with missing data, confirming the successful removal from the dataset.

### **MasVnrArea**

**MasVnrArea** stands for Masonry Veneer Area and measures the square footage of masonry veneer applied to the house.

The following code block is designed to identify and count the rows where **MasVnrArea** is missing in the dataset:

In [None]:
# Rows of the cleaned DataFrame where "MasVnrArea" has no recorded value
missing_mas_vnr_area = df_cleaned.loc[df_cleaned["MasVnrArea"].isna()]

# Report how many such rows exist
print(f"Amount of rows with missing data: {missing_mas_vnr_area.shape[0]}")

# Show the affected rows
missing_mas_vnr_area

The missing values for the **MasVnrArea** variable will be filled with zero, which is its median value. Imputing zero is meaningful in this context, as it indicates properties that do not have any masonry veneer area.

In [None]:
# Print the number of missing entries before imputation
initial_missing = df_cleaned["MasVnrArea"].isnull().sum()
print(f'Amount of rows with missing "MasVnrArea" data before imputation: {initial_missing}')

# Impute 0 for all missing values in "MasVnrArea".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the chained in-place form is
# deprecated in pandas and leaves the DataFrame unchanged under Copy-on-Write.
df_cleaned["MasVnrArea"] = df_cleaned["MasVnrArea"].fillna(0)

# Check and print the amount of missing data in "MasVnrArea" after imputation to ensure no missing values remain
print(f'Amount of rows with missing "MasVnrArea" data after imputation: {df_cleaned["MasVnrArea"].isnull().sum()}')

# Data Cleaning Summary

We will now use a custom function from Code Institute (CI) to compare the differences between the dataframe before and after cleaning.

In [None]:
import seaborn as sns
sns.set(style="whitegrid")
import matplotlib.pyplot as plt

def DataCleaningEffect(df_original, df_cleaned, variables_applied_with_method):
    """Plot, per variable, the distribution before and after cleaning.

    Variables that are non-numeric in ``df_original`` are drawn as a count
    plot; all others as overlaid histograms. One figure is shown per
    variable, numbered from 1 in the order given.

    Args:
        df_original: DataFrame before cleaning.
        df_cleaned: DataFrame after cleaning.
        variables_applied_with_method: Names of the variables to compare.
    """
    # Columns that are not numeric in the original data get a bar plot
    non_numeric_columns = df_original.select_dtypes(exclude=["number"]).columns

    print("\n=====================================================================================")
    print("* Distribution Effect Analysis After Data Cleaning Method in the following variables:")
    print(f"{variables_applied_with_method} \n\n")

    for plot_number, var in enumerate(variables_applied_with_method, start=1):
        if var not in non_numeric_columns:
            # Numerical variable: overlay the two histograms on one axis
            fig, axes = plt.subplots(figsize=(10, 5))
            layers = (
                (df_original, "#432371", "Original"),
                (df_cleaned, "#FAAE7B", "Cleaned"),
            )
            for frame, colour, label in layers:
                sns.histplot(data=frame, x=var, color=colour, label=label,
                             kde=True, element="step", ax=axes)
        else:
            # Categorical variable: stack both versions and count per value
            original_values = pd.DataFrame({"Type": "Original", "Value": df_original[var]})
            cleaned_values = pd.DataFrame({"Type": "Cleaned", "Value": df_cleaned[var]})
            stacked = pd.concat([original_values, cleaned_values], axis=0)
            fig, axes = plt.subplots(figsize=(15, 5))
            sns.countplot(hue="Type", data=stacked, x="Value", palette=["#432371", "#FAAE7B"])
            plt.xticks(rotation=90)

        axes.set(title=f"Distribution Plot {plot_number}: {var}")
        plt.legend()
        plt.show()



In [None]:
# Variables whose distributions we compare before and after cleaning
variables_to_analyze = [
    "GarageFinish",
    "BsmtFinType1",
    "LotFrontage",
    "BedroomAbvGr",
    "2ndFlrSF",
    "MasVnrArea",
]

# df is the DataFrame before any cleaning was applied and df_cleaned the one
# after the cleaning operations; one comparison plot is shown per variable
DataCleaningEffect(
    df_original=df,
    df_cleaned=df_cleaned,
    variables_applied_with_method=variables_to_analyze,
)

Now, we will verify that there are no remaining missing data points in our cleaned dataframe.

In [None]:
# Final check on the cleaned frame; prints a message instead of a table when nothing is missing
EvaluateMissingData(df_cleaned)

Since there is no more missing data, we can progress to the next step of splitting the cleaned dataset into training and test sets.

# Push files to Repo

* If you do not need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [None]:
import os
try:
    # create here your folder
    # os.makedirs(name="")
    # A `try` body made only of comments is a syntax error, so keep an
    # explicit no-op until a real folder name is filled in above.
    pass
except Exception as e:
    print(e)
