## Import Required Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Loading the Dataset


In [None]:
import kagglehub

# Fetch the CSV file from the latest version of the Kaggle dataset.
# The value returned by the download call is used below as the file location.
path = kagglehub.dataset_download(
    "shashanknecrothapa/ames-housing-dataset",
    path="AmesHousing.csv",
)

# Read the downloaded CSV into a pandas DataFrame
df = pd.read_csv(path)

In [None]:
# Dataset shape as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Dataset sample: preview the first few rows
df.head()


In [None]:
# Column labels of the dataset
df.columns

## Exploratory Analysis Before Cleaning

In [None]:
# Concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Are there any missing values? (one True/False flag per column)
df.isna().any()

In [None]:
# How many missing values are there in each column?
df.isna().sum()

In [None]:
# Number of rows in the dataset (the denominator for the proportions)
total_rows = df.shape[0]

# Count of missing entries in each column
missing_values = df.isna().sum()

# Fraction of each column's entries that are missing
missing_value_proportion = missing_values.div(total_rows)
print(missing_value_proportion)

## Visualizations

In [None]:
# Distribution of sale prices: a 30-bin histogram with a KDE curve overlaid
# to make the overall shape of the distribution easier to read.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['SalePrice'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Sale Price Values')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Horizontal boxplot of sale prices: shows the median, the quartiles,
# and points that may be outliers.
fig, ax = plt.subplots(figsize=(10, 3))
sns.boxplot(x=df['SalePrice'], ax=ax)
ax.set_title('Sale Price Boxplot')
ax.set_xlabel('Sale Prices')
plt.show()

In [None]:
# A line plot showing how the average sale price has varied by year sold.
# Helps identify year-over-year trends in the data.
plt.figure(figsize=(12, 5))

# Aggregate to one row per sale year holding that year's mean sale price.
grouped_by_year = df.groupby('Yr Sold', as_index=False).agg({'SalePrice': 'mean'})

# Plot the yearly means directly. There is exactly one point per year,
# so the line is the plain average with no confidence band around it.
sns.lineplot(x='Yr Sold', y='SalePrice', data=grouped_by_year)
plt.title('Sale Price Values Over Time')
plt.xlabel('Year Sold')
plt.ylabel('Sale Price')
plt.show()

In [None]:
# Bar chart of the number of houses sold for each house style.
# Useful for seeing which house styles are most common among the transactions.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='House Style', data=df, ax=ax)
ax.set_title('Count of House Sold by Style')
ax.set_xlabel('Style')
ax.set_ylabel('Count')
plt.show()