In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings

# Notebook-wide configuration.
# Silence only the deprecation-style categories rather than every warning, so
# genuine problems (e.g. pandas DtypeWarning on load, NumPy RuntimeWarning)
# remain visible in the cell output.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Show every column of wide frames; cap the number of displayed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)


In [None]:
# Load the Ames Housing dataset
df = pd.read_csv('../extracted_data/AmesHousing.csv')
# Report the shape on its own line and keep the preview as the cell's last
# expression: a bare DataFrame is rendered as a rich HTML table, whereas a
# (shape, frame) tuple falls back to a plain-text repr.
print(f'Loaded {df.shape[0]} rows x {df.shape[1]} columns')
df.head()


In [None]:
# Column dtypes and non-null counts (printed by .info()), followed by summary
# statistics for every column, transposed so each feature is one row.
df.info()
df.describe(include='all').transpose()


In [None]:
# Nullity matrix: one column per feature, gaps mark missing entries
msno.matrix(df)
plt.title('Missing Value Matrix')
plt.show()

# Share of missing entries per column, in percent; keep only columns that
# actually have gaps and list the worst offenders first
missing_pct = (
    df.isna()
    .mean()
    .mul(100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)
missing_pct


In [None]:
# One histogram per numeric column (30 bins each) on a shared 16x12 canvas.
# `numerical_cols` is reused by later cells.
numerical_cols = df.select_dtypes(include='number').columns
df.loc[:, numerical_cols].hist(bins=30, figsize=(16, 12))
plt.tight_layout()
plt.show()


In [None]:
# Count plots for the first five object-dtype columns (in column order),
# with categories sorted from most to least frequent
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols[:5]:
    by_frequency = df[col].value_counts().index
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.countplot(y=df[col], order=by_frequency, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap (values shown to two decimals)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    df[numerical_cols].corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

# Scatter of 'Gr Liv Area' (x) against 'SalePrice' (y)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='Gr Liv Area', y='SalePrice', ax=ax)
ax.set_title('Gr Liv Area vs SalePrice')
plt.show()


In [None]:
# SalePrice distribution within each 'Overall Qual' level, as box plots
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Overall Qual', y='SalePrice', ax=ax)
ax.set_title('Overall Qual vs SalePrice')
plt.show()


In [None]:
# Pairwise scatter/histogram grid for a hand-picked subset of columns
selected_features = [
    'SalePrice',
    'Gr Liv Area',
    'Overall Qual',
    'Total Bsmt SF',
    'Year Built',
]
sns.pairplot(df.loc[:, selected_features])
# y slightly above 1 places the figure-level title above the grid
plt.suptitle('Multivariate Pairplot', y=1.02)
plt.show()


## Summary Insights
- Higher `Overall Qual` and `Gr Liv Area` are positively correlated with `SalePrice`.
- Some columns contain missing values (see the per-column missing percentages computed above); these may need imputation or removal before modeling.
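- `Year Built` and `Total Bsmt SF` are also associated with `SalePrice`; note that this exploratory analysis shows association, not a causal effect on the final price.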
- Categorical features such as `Neighborhood` could be analyzed further to see how `SalePrice` varies across their groups.
