## Import Required Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import skew
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation / future-behaviour notices from the libraries used below.
warnings.filterwarnings("ignore")

## Loading the Dataset


In [None]:
import kagglehub

# Fetch only the CSV file we need from the latest published dataset version;
# the returned value is handed straight to pandas as the file location.
path = kagglehub.dataset_download(
    "shashanknecrothapa/ames-housing-dataset",
    path="AmesHousing.csv",
)

# Read the downloaded CSV into the DataFrame used throughout the notebook
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Dataset shape as a (rows, columns) tuple
df.shape

In [None]:
# Dataset sample: preview the first rows of the DataFrame
df.head()


In [None]:
# Columns: list every column label in the dataset
df.columns

## Exploratory Analysis Before Cleaning

In [None]:
# Info: concise summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Are there any missing values? (one True/False flag per column)
df.isna().any()

In [None]:
# How many missing values are there in each column?
df.isna().sum()

In [None]:
# Count of missing entries for every column
missing_values = df.isna().sum()

# Number of rows in the dataset (the denominator for the proportions below)
total_rows = df.shape[0]

# Fraction of each column that is missing: per-column count divided by row count
missing_value_proportion = missing_values.div(total_rows)
print(missing_value_proportion)

## Visualizations

In [None]:
# Histogram of the sale price values.
# The overlaid density curve (KDE) makes the shape of the distribution easier to read.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='SalePrice', kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Sale Price Values')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Boxplot of the sale price values: shows the median, the quartiles,
# and any points drawn beyond the whiskers (potential outliers).
fig, ax = plt.subplots(figsize=(10, 3))
sns.boxplot(data=df, x='SalePrice', ax=ax)
ax.set_title('Sale Price Boxplot')
ax.set_xlabel('Sale Prices')
plt.show()

In [None]:
# Line plot of the average sale price for each year of sale.
# Helps identify trends over time in the data.
plt.figure(figsize=(12, 5))

# Aggregate first so that the plot shows exactly one mean value per year
grouped_by_year = df.groupby('Yr Sold', as_index=False).agg({'SalePrice': 'mean'})

# Plot the aggregated frame (one point per year), not the raw rows
sns.lineplot(x='Yr Sold', y='SalePrice', data=grouped_by_year, marker='o')

# Years are whole numbers: pin the ticks to the observed years so that
# matplotlib does not label fractional values such as 2006.5
plt.xticks(grouped_by_year['Yr Sold'])
plt.title('Sale Price Values Over Time')
plt.xlabel('Year Sold')
plt.ylabel('Sale Price')
plt.show()

In [None]:
# Bar chart with the number of houses sold per house style.
# Useful for seeing which house styles are most common among the transactions.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='House Style', ax=ax)
ax.set_title('Count of House Sold by Style')
ax.set_xlabel('Style')
ax.set_ylabel('Count')
plt.show()

## Handling Missing Values

**1. Handling Missing Values for Numeric Variables**

For numeric variables, a common approach is to replace missing values with the mean or median of the column. The choice between mean or median typically depends on the data distribution.

In [None]:
# Number of missing Lot Frontage values before imputation
df['Lot Frontage'].isna().sum()

In [None]:
# Distribution of Lot Frontage values (histogram with a KDE overlay);
# its shape informs the choice between mean and median imputation.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='Lot Frontage', kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Lot Frontage Values')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Mean of the Lot Frontage values (missing values are skipped by default)
df['Lot Frontage'].mean()

In [None]:
# Median of the Lot Frontage values (missing values are skipped by default)
df['Lot Frontage'].median()

In [19]:
# Replace missing Lot Frontage values with the column median.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df['Lot Frontage'] acts on an intermediate
# object and is not guaranteed to update df (it does not under pandas
# Copy-on-Write).
df['Lot Frontage'] = df['Lot Frontage'].fillna(df['Lot Frontage'].median())

In [None]:
# Number of missing Lot Frontage values after imputation (expected to be 0)
df['Lot Frontage'].isna().sum()

**2. Handling Missing Values for Categorical Variables**

For categorical variables, replace missing values with the most frequent value (mode) or categorize them as 'Unknown'.


In [None]:
# Number of missing values in Fence before imputation
df['Fence'].isna().sum()

In [None]:
# Calculate the mode; mode() returns a Series (there may be ties),
# so [0] picks its first entry
df['Fence'].mode()[0]

In [23]:
# Replace missing values in Fence with the mode (most frequent category).
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df['Fence'] acts on an intermediate object and
# is not guaranteed to update df (it does not under pandas Copy-on-Write).
# NOTE(review): a missing Fence value may mean "no fence" rather than an
# unknown fence type; confirm against the dataset's data dictionary before
# relying on mode imputation outside of this demonstration.
df['Fence'] = df['Fence'].fillna(df['Fence'].mode()[0])

In [None]:
# Number of missing values in Fence after imputation (expected to be 0)
df['Fence'].isna().sum()

In [None]:
# Number of missing values in Misc Feature before imputation
df['Misc Feature'].isna().sum()

In [26]:
# Fill missing Misc Feature values with the explicit category 'Other'.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df['Misc Feature'] acts on an intermediate
# object and is not guaranteed to update df (it does not under pandas
# Copy-on-Write).
df['Misc Feature'] = df['Misc Feature'].fillna('Other')

In [None]:
# Number of missing values in Misc Feature after imputation (expected to be 0)
df['Misc Feature'].isna().sum()

In [None]:
# Number of missing values in Alley before imputation
df['Alley'].isna().sum()

In [29]:
# Fill missing Alley values with bfill (backward fill).
# Each missing value takes the next valid value found further down the same
# column; missing values after the last valid entry stay missing.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and the
# result is assigned back because an in-place call on df['Alley'] is not
# guaranteed to update df (it does not under pandas Copy-on-Write).
df['Alley'] = df['Alley'].bfill()

In [30]:
# Fill missing Alley values with ffill (forward fill).
# Each missing value takes the last valid observation found above it in the
# same column, which also covers gaps at the end of the column that a
# backward fill cannot reach.
# The result is assigned back because an in-place call on df['Alley'] is not
# guaranteed to update df (it does not under pandas Copy-on-Write).
df['Alley'] = df['Alley'].ffill()

In [None]:
# Number of missing values in Alley after the fill steps above
df['Alley'].isna().sum()

## Outlier Treatment


Why?
1. Outliers distort essential statistical metrics like the mean and standard deviation, leading to inaccurate summaries of the data.
2. In predictive modeling, outliers can influence model parameters, leading to poor generalization.
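3. Caveat: outliers are not always errors. Removing them indiscriminately may result in a loss of valuable information, so inspect them before deciding how to treat them.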


In [None]:
# Boxplot of the Sale Price values before any outlier treatment
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Price values')
ax.set_xlabel('Values')
plt.show()

In [33]:
# IQR-based outlier treatment for the Sale Price variable.
# The IQR is the distance between the 25th and 75th percentiles; any value
# more than 1.5 * IQR below Q1 or above Q3 is treated as an outlier.

# First and third quartiles of the sale price
Q1 = df['SalePrice'].quantile(0.25)
Q3 = df['SalePrice'].quantile(0.75)

# Interquartile range
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Flag the rows outside the fences, then keep everything that is not flagged
is_iqr_outlier = (df['SalePrice'] < lower_limit) | (df['SalePrice'] > upper_limit)
df_filtered_1 = df[~is_iqr_outlier]

In [None]:
# Boxplot of the Sale Price values once the IQR-based outliers are removed
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_filtered_1, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Price values')
ax.set_xlabel('Values')
plt.show()

In [35]:
# Z-score based outlier treatment for the Sale Price variable.
# The Z-score measures how many standard deviations a data point lies from
# the mean; points whose Z-score is above or below the threshold are flagged
# as outliers. Best suited to data that follow a normal distribution.

# Number of standard deviations tolerated on either side of the mean
threshold = 3

# Centre and spread of the sale price
sale_price_mean = df['SalePrice'].mean()
sale_price_std = df['SalePrice'].std()

# Limits beyond which a value counts as an outlier
# (these reuse, and therefore overwrite, the names set by the IQR cell)
upper_limit = sale_price_mean + threshold * sale_price_std
lower_limit = sale_price_mean - threshold * sale_price_std

# Flag the rows outside the limits, then keep everything that is not flagged
is_zscore_outlier = (df['SalePrice'] < lower_limit) | (df['SalePrice'] > upper_limit)
df_filtered_2 = df[~is_zscore_outlier]

In [None]:
# Boxplot of the Sale Price values once the Z-score based outliers are removed
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_filtered_2, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Price values')
ax.set_xlabel('Values')
plt.show()

## Exploratory Analysis After Cleaning

### Univariate Exploratory Analysis


In [None]:
# Dataset info after cleaning (df_filtered_1 is the IQR-filtered frame)
df_filtered_1.info()

In [None]:
# Switch seaborn to the white-grid theme; this applies to this and all later plots
sns.set_theme(style="whitegrid")

# Histogram (with KDE overlay) of the sale prices in the IQR-filtered data
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df_filtered_1, x='SalePrice', kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Sale Price Values')
ax.set_xlabel('Values')
ax.set_ylabel('Frequency')
plt.show()

**Histogram Interpretation**:

Interpreting a histogram is essential for understanding the data distribution. A histogram is a chart that shows the frequency with which values in a dataset occur within specific intervals or 'bins'. Here are the steps to interpret it:

**1. Observe the Overall Shape**

The shape of the histogram can reveal a lot about the nature of the data.

Symmetrical Distribution: If the chart is symmetrical around a central value, it indicates a normal or nearly normal distribution.

Skewed Distribution: If the tail of the chart extends more to the left or right, it indicates skewness. A longer tail on the left indicates negative skewness; a longer tail on the right indicates positive skewness.


In [None]:
# Sample skewness of the sale prices in the IQR-filtered data
filtered_sale_prices = df_filtered_1['SalePrice']
skewness = skew(filtered_sale_prices)
print(f"The skewness of the distribution of values is: {skewness}")

- A skewness value of zero suggests that the distribution is perfectly symmetrical.
- A positive skewness value indicates a distribution with a heavier tail to the right.
- A negative skewness value indicates a distribution with a heavier tail to the left.


**2. Identify Peaks and Valleys**

Presence of multiple peaks: This may indicate that the data has multiple groups or modes.

- Peaks (Modes): The highest points of the histogram are the modes, indicating the most common values in the data.
- Valleys: These are the areas between peaks, which may indicate a separation between different groups in the data.

**3. Check the Bin Width**

The width of the intervals (bins) can affect the appearance of the histogram. Very wide bins can hide important details, while very narrow bins can show too much random variation.

**4. Observe Outliers**

Outliers may appear as isolated bars distant from the rest of the data, indicating atypical values.

**5. Analyze the X and Y Axes**

- X-axis (horizontal): Shows the value intervals of the data.
- Y-axis (vertical): Shows the frequency or count of occurrences in each bin.

In [None]:
# Boxplot of the sale prices in the IQR-filtered data
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_filtered_1, x='SalePrice', ax=ax)
ax.set_title('Boxplot of Sale Price Values')
ax.set_xlabel('Values')
plt.show()

**Boxplot Interpretation:**

Interpreting a boxplot, also known as a box-and-whisker plot, is essential to understand the data’s distribution, central tendency, and variability. The boxplot provides a concise visual representation that highlights the quartiles and outliers within the data. Here are the main components of a boxplot and how to interpret them:

**1. Quartiles**

The boxplot is divided into quartiles:

- **Q1 (First Quartile):** Represents the value below which 25% of the data lies.
- **Q2 (Median):** Divides the data in half, with 50% of values below and 50% above it.
- **Q3 (Third Quartile):** Represents the value below which 75% of the data lies.

**2. Interquartile Range (IQR)**

The **IQR** is the distance between Q1 and Q3. It represents the range of the central 50% of the data. A larger IQR indicates greater spread within this central portion.

**3. Whiskers**

The "whiskers" extend from Q1 and Q3 to show the variation outside the central 50%. Typically, each whisker ends at the most extreme data point that still lies within 1.5 * IQR of the box (below Q1 or above Q3).

Data points beyond the whiskers are considered potential outliers.

**4. Outliers**

Outliers are data points that fall outside the whiskers. These are often represented as individual dots or circles on the plot.

**5. Symmetry**

If the median is centered within the box and the whiskers are similar in length, the data is more symmetrical.

If the median is closer to Q1 or Q3, or if one whisker is significantly longer, the data is skewed.

A longer box indicates greater variability in the central data. The presence of outliers may suggest extreme variations or issues with the data.

### Multivariate Exploratory Analysis

### Analyzing and Interpreting Correlation Maps

In [None]:
# Display the first few rows of the filtered dataset (IQR-filtered frame)
df_filtered_1.head()

In [None]:
# Display information about the filtered dataset (columns, dtypes, non-null counts)
df_filtered_1.info()

In [None]:
# Pairwise correlation between SalePrice and Lot Area in the filtered data
correlation_matrix = df_filtered_1.loc[:, ['SalePrice', 'Lot Area']].corr()

# Draw the matrix as an annotated heatmap (coefficients shown with two decimals)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

**The Correlation Map (or Heatmap)** is a visual tool used to represent the strength and direction of the correlation between two or more variables. Here’s how to interpret it:

**Values in the Matrix:**

Each cell in the matrix shows the correlation coefficient between two variables. The correlation coefficient ranges from -1 to 1.

- **1** indicates a perfect positive correlation (when one variable increases, the other also increases).
- **-1** indicates a perfect negative correlation (when one variable increases, the other decreases).
- **0** indicates no correlation (the variables have no linear relationship).

**Colors:**

The colors represent the strength of the correlation, usually following a gradient.

**Interpretation:**

- **Strong Positive Correlation (values close to 1):** When one variable increases, the other also tends to increase. For example, in sales data, there may be a strong positive correlation between advertising spending and revenue.
- **Strong Negative Correlation (values close to -1):** When one variable increases, the other tends to decrease. For example, in health data, there may be a strong negative correlation between exercise quantity and body weight.
- **Weak or No Correlation (values close to 0):** The variables have no clear linear relationship. One variable does not reliably predict the other.

**Considerations:**

Correlation does not imply causation. Even if two variables are strongly correlated, it does not mean one causes the other. Other factors may influence the variables. Therefore, more in-depth analyses, such as causality analysis, may be necessary to understand relationships between variables.

### Analyzing and Interpreting Scatter Plots

In [None]:
# Scatter plot of the filtered data: SalePrice on the x-axis, Lot Area on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_filtered_1, x='SalePrice', y='Lot Area', ax=ax)
ax.set_title('SalePrice vs Lot Area')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Lot Area')
plt.show()

**Overall Pattern and Direction of Relationship:**

- **Positive:** If the points tend to rise from left to right, this indicates a positive correlation; as one variable increases, the other also tends to increase.
- **Negative:** If the points tend to fall from left to right, this indicates a negative correlation; as one variable increases, the other tends to decrease.
- **No Trend:** If the points are randomly scattered without a clear direction, this suggests little or no linear correlation between the variables.

**Strength of Relationship:**

- **Strong:** Points close to an imaginary line indicate a strong linear relationship.
- **Weak:** Points widely scattered around the line indicate a weaker linear relationship.
- **Outliers:** Points that do not follow the general pattern may indicate outliers or anomalies, which may warrant further investigation.

**Homoscedasticity vs. Heteroscedasticity:**

- **Homoscedasticity:** The vertical spread of the points stays roughly constant across the whole range of the x-axis variable.
- **Heteroscedasticity:** The spread of the points changes along the x-axis (for example, a funnel shape), which indicates that the variability of one variable depends on the level of the other.