# Task 1: Comprehensive Data Analysis Report
# Ames Housing Dataset - Exploratory Data Analysis

This notebook provides a complete data analysis report on the Ames Housing dataset with 79 explanatory variables.

## Table of Contents
1. Data Loading and Overview
2. Data Quality Assessment
3. Numerical Features Analysis
4. Categorical Features Analysis
5. Target Variable Analysis (SalePrice)
6. Correlation Analysis
7. Feature Relationships with Price
8. Key Insights and Findings

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by the libraries above --
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

# Set visualization style
# Seaborn's white grid theme, plus a 12x6 inch default canvas for any figure
# that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Import custom modules
# The path is appended before the import below so that a module located in
# ../src (relative to the working directory) can be resolved.
import sys
sys.path.append('../src')
# NOTE(review): neither imported name is referenced by the cells visible in
# this notebook -- confirm they are still needed.
from data_preprocessing import DataPreprocessor, get_feature_info

# Reaching this line means every import above succeeded.
print("Libraries imported successfully!")

## 1. Data Loading and Overview

In [None]:
# Load the dataset
# `df` is the frame that the later cells in this notebook read from.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../data/train.csv')

# df.shape is a (rows, columns) tuple.
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
# Left as the bare last expression so the notebook renders it as a table.
df.head()

In [None]:
# Basic information
# Heading and a 60-character rule, one per line, followed by the frame's
# column/dtype/non-null report. df.info() prints the report itself, so it is
# invoked as a plain statement rather than displayed.
print("Dataset Info:", "=" * 60, sep="\n")
df.info()

In [None]:
# Statistical summary
# describe() is called with its defaults on the whole frame; it is left as
# the bare last expression so the notebook renders the result as a table.
print("Statistical Summary of Numerical Features:")
df.describe()

## 2. Data Quality Assessment

In [None]:
# Missing values analysis
# Null count per column, ordered from the most to the least incomplete.
missing_data = df.isna().sum().sort_values(ascending=False)
# The same counts expressed as a percentage of all rows.
missing_percent = missing_data.div(len(df)).mul(100)

# Combine both measures and keep only the columns that actually have gaps;
# filtering preserves the descending order established above.
missing_df = pd.DataFrame(
    {'Missing_Count': missing_data, 'Percentage': missing_percent}
).loc[lambda frame: frame['Missing_Count'] > 0]

print(f"Total features with missing values: {len(missing_df)}")
print("\nTop 20 features with missing values:")
missing_df.head(20)

In [None]:
# Visualize missing values
# Horizontal bars for the 20 most incomplete columns of `missing_df`.
fig, ax = plt.subplots(figsize=(12, 8))
missing_df['Percentage'].head(20).plot(kind='barh', ax=ax)
ax.set_xlabel('Percentage of Missing Values')
ax.set_title('Top 20 Features with Missing Values')
# A horizontal bar chart draws the first row at the bottom; flipping the axis
# puts the most incomplete column at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Identify feature types
# Split the columns into two complementary groups by dtype family.
#   * 'number' matches every numeric dtype (int32, float32, ...), not only
#     the int64/float64 pair.
#   * The categorical group is the complement, so text columns are picked up
#     whether pandas stores them as `object` or as a dedicated string dtype.
# Because the groups are complementary, the two counts always add up to the
# total column count printed below. Any non-numeric, non-text column (bool,
# datetime) would also land in `categorical_features`.
numeric_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(exclude='number').columns.tolist()

# df.shape[1] counts every column of the loaded frame.
print(f"Total Features: {df.shape[1]}")
print(f"Numerical Features: {len(numeric_features)}")
print(f"Categorical Features: {len(categorical_features)}")

## 3. Numerical Features Analysis

In [None]:
# Distribution of key numerical features
key_numeric = ['LotArea', 'GrLivArea', 'TotalBsmtSF', 'GarageArea', 'YearBuilt']

# Resolve which of the requested columns exist up front, so the histograms
# fill the grid contiguously instead of leaving a hole where a column is
# missing.
available_numeric = [col for col in key_numeric if col in df.columns]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One histogram per available column; nulls are dropped before binning.
for ax, col in zip(axes, available_numeric):
    ax.hist(df[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# The 2x3 grid has more panels than there are columns to plot; hide the
# leftovers so the figure does not show empty framed axes.
for ax in axes[len(available_numeric):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Box plots for outlier detection
key_numeric_subset = ['LotArea', 'GrLivArea', 'TotalBsmtSF']

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Pair each panel with its column; a column that is absent from the frame
# leaves its panel untouched.
for ax, col in zip(axes, key_numeric_subset):
    if col not in df.columns:
        continue
    ax.boxplot(df[col].dropna())
    ax.set(title=f'Box Plot: {col}', ylabel=col)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Categorical Features Analysis

In [None]:
# Analyze key categorical features
key_categorical = ['Neighborhood', 'BldgType', 'HouseStyle', 'SaleCondition']

# Print the ten most frequent levels of every requested column that exists in
# the frame; value_counts() already orders levels by descending frequency.
present_categoricals = [name for name in key_categorical if name in df.columns]
for cat in present_categoricals:
    level_counts = df[cat].value_counts()
    print(f"\n{cat} - Value Counts:")
    print(level_counts.iloc[:10])

In [None]:
# Visualize categorical distributions
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# One bar chart per column (at most four, matching the 2x2 grid). Bars are
# placed at integer positions and then labelled with the category names.
for ax, cat in zip(axes, key_categorical[:4]):
    if cat not in df.columns:
        continue
    top_categories = df[cat].value_counts().head(10)
    positions = range(len(top_categories))
    ax.bar(positions, top_categories.values)
    ax.set_xticks(positions)
    ax.set_xticklabels(top_categories.index, rotation=45, ha='right')
    ax.set_title(f'Top 10 Categories in {cat}')
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 5. Target Variable Analysis (SalePrice)

In [None]:
# Sale Price statistics
sale_price = df['SalePrice']
price_low, price_high = sale_price.min(), sale_price.max()

print("Sale Price Statistics:")
print("=" * 50)

# Labels carry their own trailing padding so the dollar amounts line up in a
# single column; every amount is printed with thousands separators and two
# decimals.
summary_rows = [
    ("Mean:     ", sale_price.mean()),
    ("Median:   ", sale_price.median()),
    ("Std Dev:  ", sale_price.std()),
    ("Min:      ", price_low),
    ("Max:      ", price_high),
    ("Range:    ", price_high - price_low),
]
for label, amount in summary_rows:
    print(f"{label}${amount:,.2f}")

In [None]:
# Sale Price distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
raw_ax, log_ax = axes
sale_price = df['SalePrice']
hist_style = dict(bins=50, edgecolor='black', alpha=0.7)

# Histogram
# Raw prices, with dashed reference lines at the mean and the median.
raw_ax.hist(sale_price, **hist_style)
raw_ax.set_xlabel('Sale Price ($)')
raw_ax.set_ylabel('Frequency')
raw_ax.set_title('Distribution of Sale Prices')
for position, colour, label in (
    (sale_price.mean(), 'red', 'Mean'),
    (sale_price.median(), 'green', 'Median'),
):
    raw_ax.axvline(position, color=colour, linestyle='--', label=label)
raw_ax.legend()
raw_ax.grid(True, alpha=0.3)

# Log transformation
# Same histogram on the natural-log scale, for side-by-side comparison.
log_ax.hist(np.log(sale_price), **hist_style)
log_ax.set_xlabel('Log(Sale Price)')
log_ax.set_ylabel('Frequency')
log_ax.set_title('Distribution of Log(Sale Price)')
log_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Check for normality
# Both moments are kept as notebook-level names because the insights cell at
# the end of the notebook reads `skewness`.
sale_price = df['SalePrice']
skewness = sale_price.skew()
kurtosis = sale_price.kurt()

print(f"Skewness: {skewness:.4f}")
print(f"Kurtosis: {kurtosis:.4f}")

# QQ Plot
# probplot draws the ordered sample against normal quantiles on the given
# axes; its default title is then replaced.
fig, ax = plt.subplots(figsize=(8, 6))
stats.probplot(sale_price, dist="norm", plot=ax)
ax.set_title('QQ Plot - Sale Price')
ax.grid(True, alpha=0.3)
plt.show()

## 6. Correlation Analysis

In [None]:
# Calculate correlation matrix for numerical features
# 'number' selects every numeric dtype rather than only int64/float64.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# Top correlations with SalePrice
# Rank by the *magnitude* of the correlation so that a strongly negative
# predictor is not pushed to the bottom of the list, while keeping the signed
# values for display. SalePrice correlates 1.0 with itself, so it stays in
# first position; the cells that skip or include the first entry rely on that.
signed_corr = correlation_matrix['SalePrice']
ranking = signed_corr.abs().sort_values(ascending=False).index
price_corr = signed_corr.loc[ranking]
print("Top 15 Features Correlated with Sale Price:")
print("=" * 50)
print(price_corr.head(16))  # 16 to include SalePrice itself

In [None]:
# Visualize correlation heatmap (top features)
# The first 11 entries of the ranking are SalePrice itself plus ten features.
top_features = price_corr.index[:11]  # Top 10 + SalePrice
top_corr_matrix = df[top_features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
# Diverging palette centred on zero, with each cell annotated to two decimals.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
sns.heatmap(top_corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Heatmap - Top 10 Features vs Sale Price')
fig.tight_layout()
plt.show()

## 7. Feature Relationships with Price

In [None]:
# Scatter plots for top numerical features
top_numeric_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF']

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

# One panel per feature; a feature missing from the frame leaves its panel
# untouched.
for ax, feat in zip(axes, top_numeric_features):
    if feat not in df.columns:
        continue

    ax.scatter(df[feat], df['SalePrice'], alpha=0.5, edgecolors='k', linewidth=0.5)
    ax.set_xlabel(feat)
    ax.set_ylabel('Sale Price ($)')
    ax.set_title(f'{feat} vs Sale Price')
    ax.grid(True, alpha=0.3)

    # Add correlation coefficient
    # Taken from the 2x2 correlation matrix of the pair and shown in a box
    # anchored to the panel's top-left corner (axes coordinates).
    corr = df[[feat, 'SalePrice']].corr().loc[feat, 'SalePrice']
    ax.text(
        0.05,
        0.95,
        f'Correlation: {corr:.3f}',
        transform=ax.transAxes,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
    )

plt.tight_layout()
plt.show()

In [None]:
# Box plots for categorical features vs Price
categorical_for_analysis = ['OverallQual', 'Neighborhood']

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
quality_ax, neighborhood_ax = axes

# Overall Quality
# Price distribution for each quality level; the labels and title set here
# replace the ones pandas generates for a grouped box plot.
if 'OverallQual' in df.columns:
    df.boxplot(column='SalePrice', by='OverallQual', ax=quality_ax)
    quality_ax.set_xlabel('Overall Quality')
    quality_ax.set_ylabel('Sale Price ($)')
    quality_ax.set_title('Sale Price by Overall Quality')
    plt.setp(quality_ax.get_xticklabels(), rotation=0)

# Neighborhood (top 10)
# Restricted to the ten neighborhoods with the most rows so the panel stays
# readable.
if 'Neighborhood' in df.columns:
    top_neighborhoods = df['Neighborhood'].value_counts().head(10).index
    in_top_neighborhoods = df['Neighborhood'].isin(top_neighborhoods)
    df[in_top_neighborhoods].boxplot(
        column='SalePrice', by='Neighborhood', ax=neighborhood_ax
    )
    neighborhood_ax.set_xlabel('Neighborhood')
    neighborhood_ax.set_ylabel('Sale Price ($)')
    neighborhood_ax.set_title('Sale Price by Neighborhood (Top 10)')
    plt.setp(neighborhood_ax.get_xticklabels(), rotation=45, ha='right')

fig.suptitle('')  # Remove auto-generated title
fig.tight_layout()
plt.show()

In [None]:
# Year Built vs Sale Price
fig, (scatter_ax, trend_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: every sale as a point.
scatter_ax.scatter(df['YearBuilt'], df['SalePrice'], alpha=0.5, edgecolors='k', linewidth=0.5)
scatter_ax.set_xlabel('Year Built')
scatter_ax.set_ylabel('Sale Price ($)')
scatter_ax.set_title('Year Built vs Sale Price')
scatter_ax.grid(True, alpha=0.3)

# Right panel: mean sale price per construction year, in chronological order.
year_price = df.groupby('YearBuilt')['SalePrice'].mean().sort_index()
trend_ax.plot(year_price.index, year_price.values, marker='o', linewidth=2)
trend_ax.set_xlabel('Year Built')
trend_ax.set_ylabel('Average Sale Price ($)')
trend_ax.set_title('Average Sale Price by Year Built')
trend_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 8. Key Insights and Findings

In [None]:
# Text report that pulls together values computed by earlier cells:
# `missing_df`, `skewness`, `price_corr`, `numeric_features` and
# `categorical_features` must already exist in the kernel.
print("KEY INSIGHTS FROM DATA ANALYSIS")
print("=" * 70)
print()

print("1. DATA QUALITY:")
print(f"   - Total features: {df.shape[1]}")
print(f"   - Features with missing values: {len(missing_df)}")
# `missing_df` is ordered most-incomplete first, so its first row is the worst
# column. A fully populated dataset leaves it empty; guard the lookup so the
# report still prints instead of raising IndexError.
if len(missing_df) > 0:
    print(f"   - Most missing feature: {missing_df.index[0]} ({missing_df.iloc[0]['Percentage']:.1f}%)")
else:
    print("   - Most missing feature: none (no missing values)")
print()

print("2. TARGET VARIABLE (SalePrice):")
print(f"   - Mean price: ${df['SalePrice'].mean():,.2f}")
print(f"   - Median price: ${df['SalePrice'].median():,.2f}")
print(f"   - Price range: ${df['SalePrice'].min():,.2f} - ${df['SalePrice'].max():,.2f}")
print(f"   - Distribution: {'Right-skewed' if skewness > 0 else 'Left-skewed'} (skewness: {skewness:.2f})")
print()

print("3. TOP PREDICTIVE FEATURES (by correlation):")
# Position 0 of the ranking is SalePrice itself; take the next five entries.
# .iloc makes it explicit that this is a positional slice, not a label lookup.
top_5_features = price_corr.iloc[1:6]  # Exclude SalePrice itself
for i, (feature, corr) in enumerate(top_5_features.items(), 1):
    print(f"   {i}. {feature}: {corr:.3f}")
print()

print("4. FEATURE CATEGORIES:")
print(f"   - Numerical features: {len(numeric_features)}")
print(f"   - Categorical features: {len(categorical_features)}")
print()

# Sections 5 and 6 are fixed narrative text; nothing in them is computed from
# the data, so they need to be revisited by hand if the dataset changes.
print("5. DATA CHARACTERISTICS:")
print("   - Overall Quality is the strongest predictor")
print("   - Living Area (GrLivArea) shows strong positive correlation")
print("   - Garage and basement features are important")
print("   - Newer houses tend to have higher prices")
print("   - Neighborhood significantly affects price")
print()

print("6. RECOMMENDATIONS FOR MODELING:")
print("   - Handle missing values appropriately (many are 'None' indicators)")
print("   - Consider log transformation for SalePrice (reduces skewness)")
print("   - Engineer features from existing ones (e.g., total area, age)")
print("   - Use ensemble methods to capture complex relationships")
print("   - Pay special attention to quality, area, and location features")

## Summary

This comprehensive analysis has revealed:
- The dataset contains 79 explanatory features (plus the `SalePrice` target) with varying levels of completeness
- Sale prices are right-skewed, suggesting log transformation may be beneficial
- Overall Quality, Living Area, Garage, and Basement features are strong predictors
- Both numerical and categorical features contribute to price variation
- Feature engineering and proper handling of missing values will be crucial for modeling