## Import Required Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning messages,
# which are the early signal for API changes affecting the cleaning cells below.
warnings.filterwarnings("ignore")

## Loading the Dataset


In [3]:
import kagglehub

# Fetch the Ames housing CSV from Kaggle (no version is pinned in the handle)
# and keep the local path that kagglehub hands back.
path = kagglehub.dataset_download(
    "shashanknecrothapa/ames-housing-dataset",
    path="AmesHousing.csv",
)

# Parse the downloaded CSV into the working DataFrame used by every later cell.
df = pd.read_csv(path)

In [None]:
# Dataset shape as a (rows, columns) tuple
df.shape

In [None]:
# Dataset sample: the first rows of the frame, as a quick sanity check of the load
df.head()


In [None]:
# Column labels of the loaded frame (the names used to index df in later cells)
df.columns

## Exploratory Analysis Before Cleaning

In [None]:
# Concise summary of the frame: per-column dtype and non-null count
df.info()

In [None]:
# Are there any missing values? One boolean per column (True = at least one NaN)
df.isna().any()

In [None]:
# How many missing values? Count of NaN entries per column
df.isna().sum()

In [None]:
# Denominator: number of rows in the frame
total_rows = df.shape[0]

# Numerator: NaN count for each column
missing_values = df.isna().sum()

# Fraction of rows that are missing, column by column (0.0 = complete, 1.0 = empty)
missing_value_proportion = missing_values.div(total_rows)
print(missing_value_proportion)

## Visualizations

In [None]:
# Sale price distribution as a 30-bin histogram.
# The overlaid KDE curve makes the overall shape (skew, tails) easier to read.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='SalePrice', kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Sale Price Values')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Horizontal boxplot of sale prices: shows the median, the quartile box,
# and the points beyond the whiskers that are candidate outliers.
fig, ax = plt.subplots(figsize=(10, 3))
sns.boxplot(data=df, x='SalePrice', ax=ax)
ax.set_title('Sale Price Boxplot')
ax.set_xlabel('Sale Prices')
plt.show()

In [None]:
# A line plot showing how the average sale price has varied over time.
# Helps identify trends or seasonal patterns in the data.

# Aggregate once, explicitly: one row per year with the mean sale price.
grouped_by_year = df.groupby('Yr Sold', as_index=False).agg({'SalePrice': 'mean'})

plt.figure(figsize=(12, 5))
# Plot the precomputed yearly means. Passing the raw frame instead would make
# seaborn re-aggregate internally (with a bootstrapped confidence band) and
# leave grouped_by_year unused.
sns.lineplot(x='Yr Sold', y='SalePrice', data=grouped_by_year, marker='o')
# Years are discrete, so pin the ticks to the observed values and avoid
# fractional tick labels on the x axis.
plt.xticks(grouped_by_year['Yr Sold'])
plt.title('Sale Price Values Over Time')
plt.xlabel('Year Sold')
plt.ylabel('Sale Price')
plt.show()

In [None]:
# Bar chart of how many houses were sold for each house style.
# Useful for seeing which styles dominate the transactions.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='House Style', ax=ax)
ax.set_title('Count of House Sold by Style')
ax.set_xlabel('Style')
ax.set_ylabel('Count')
plt.show()

## Handling Missing Values

**1. Handling Missing Values for Numeric Variables**

For numeric variables, a common approach is to replace missing values with the mean or median of the column. The choice between the mean and the median typically depends on the data distribution: the median is more robust when the distribution is skewed or contains outliers.

In [None]:
# Any missing Lot Frontage values? (NaN count before imputation)
df['Lot Frontage'].isna().sum()

In [None]:
# Lot Frontage distribution (30 bins plus KDE); its shape informs the
# mean-versus-median choice for imputation.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='Lot Frontage', kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Lot Frontage Values')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Mean of Lot Frontage (candidate fill value; compare with the median)
df['Lot Frontage'].mean()

In [None]:
# Median of Lot Frontage (candidate fill value; compare with the mean)
df['Lot Frontage'].median()

In [19]:
# Replace missing Lot Frontage values with the column median.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on df['Lot Frontage']: the inplace form operates on an intermediate Series
# and, under pandas Copy-on-Write, leaves df unchanged.
df['Lot Frontage'] = df['Lot Frontage'].fillna(df['Lot Frontage'].median())

In [None]:
# Any missing Lot Frontage values left? (should be 0 if the median fill took effect)
df['Lot Frontage'].isna().sum()

**2. Handling Missing Values for Categorical Variables**

For categorical variables, replace missing values with the most frequent value (mode) or assign them to a separate category such as 'Unknown' or 'Other'.


In [None]:
# Any missing values in Fence? (NaN count before imputation)
df['Fence'].isna().sum()

In [None]:
# Calculate the mode: mode() returns a Series of the most frequent value(s),
# so [0] picks the first one.
df['Fence'].mode()[0]

In [23]:
# Replace missing values in Fence with the mode (most frequent category).
# Assign the result back rather than using fillna(inplace=True) on the selected
# column: the inplace form acts on an intermediate Series and does not update
# df under pandas Copy-on-Write.
# NOTE(review): NaN in Fence may encode "no fence" rather than "unknown" -
# confirm against the data dictionary before relying on mode imputation.
df['Fence'] = df['Fence'].fillna(df['Fence'].mode()[0])

In [None]:
# Any missing values left in Fence? (should be 0 if the mode fill took effect)
df['Fence'].isna().sum()

In [None]:
# Any missing values in Misc Feature? (NaN count before imputation)
df['Misc Feature'].isna().sum()

In [26]:
# Fill missing Misc Feature values with the explicit category 'Other'.
# Assign the result back rather than using fillna(inplace=True) on the selected
# column: the inplace form acts on an intermediate Series and does not update
# df under pandas Copy-on-Write.
df['Misc Feature'] = df['Misc Feature'].fillna('Other')

In [None]:
# Any missing values left in Misc Feature? (should be 0 if the 'Other' fill took effect)
df['Misc Feature'].isna().sum()

In [None]:
# Any missing values in Alley? (NaN count before imputation)
df['Alley'].isna().sum()

In [29]:
# Fill missing Alley values with bfill (backward fill).
# Each missing value takes the next valid value further down the same column;
# missing values after the last valid entry have nothing to copy and stay NaN.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and the result
# is assigned back because an inplace call on df['Alley'] does not update df
# under pandas Copy-on-Write.
df['Alley'] = df['Alley'].bfill()

In [30]:
# Fill missing Alley values with ffill (forward fill).
# Each missing value takes the last valid value above it in the same column.
# The original cell repeated bfill here (a no-op on a second run) while
# describing forward fill; ffill matches that description and also covers
# missing values at the end of the column, which a backward fill cannot reach.
# The result is assigned back because an inplace call on df['Alley'] does not
# update df under pandas Copy-on-Write.
df['Alley'] = df['Alley'].ffill()

In [None]:
# Any missing values left in Alley? (NaN count after the fill cells above)
df['Alley'].isna().sum()

## Outlier Treatment


Why?
1. Outliers distort essential statistical metrics like the mean and standard deviation, leading to inaccurate summaries of the data.
2. In predictive modeling, outliers can influence model parameters, leading to poor generalization.
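3. Caveat: removing or capping outliers may itself result in a loss of valuable information, so they should be treated deliberately rather than dropped by default.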


In [None]:
# Sale price boxplot on the unfiltered data, as the "before" reference
# for the outlier treatments that follow.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Price values')
ax.set_xlabel('Values')
plt.show()

In [33]:
# IQR-based outlier treatment for Sale Price.
# The interquartile range spans the 25th to the 75th percentile; any point more
# than 1.5 * IQR below Q1 or above Q3 is treated as an outlier.

sale_price = df['SalePrice']

# Quartiles and the spread between them
Q1, Q3 = sale_price.quantile(0.25), sale_price.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Keep the rows that are neither below the lower fence nor above the upper one
within_limits = ~(sale_price < lower_limit) & ~(sale_price > upper_limit)
df_filtered_1 = df[within_limits]

In [None]:
# Sale price boxplot after dropping the IQR-flagged outliers
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_filtered_1, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Price values')
ax.set_xlabel('Values')
plt.show()

In [37]:
# Z-score-based outlier treatment for Sale Price.
# A Z-score is the distance of a point from the mean in standard deviations.
# Points whose Z-score is above or below the threshold are flagged as outliers.
# Best suited to data that is roughly normally distributed.

# Number of standard deviations tolerated on either side of the mean
threshold = 3

sale_price = df['SalePrice']
price_mean = sale_price.mean()
price_std = sale_price.std()

# Cut-offs at mean +/- threshold standard deviations
upper_limit = price_mean + threshold * price_std
lower_limit = price_mean - threshold * price_std

# Drop every row whose sale price falls outside the cut-offs
is_outlier = (sale_price < lower_limit) | (sale_price > upper_limit)
df_filtered_2 = df[~is_outlier]

In [None]:
# Sale price boxplot after dropping the Z-score-flagged outliers
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_filtered_2, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Price values')
ax.set_xlabel('Values')
plt.show()