# House Price Prediction - Data Exploration

This notebook explores the Kaggle House Prices dataset and performs initial analysis.

**Dataset**: House Prices - Advanced Regression Techniques  
**Goal**: Predict house sale prices based on various features

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: seaborn grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# pandas display: no cap on displayed columns, at most 100 displayed rows
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("✅ Libraries imported successfully!")

## 2. Load Data

In [None]:
# Read the train and test CSV files from the sibling data directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

n_samples, n_columns = train_df.shape

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
# One column is subtracted from the feature count (presumably the SalePrice target)
print(f"\nTotal features: {n_columns - 1}")
print(f"Total samples: {n_samples}")

## 3. Initial Data Inspection

In [None]:
# Preview the first rows of the training data (head() defaults to 5 rows).
# Left as the last expression so the notebook renders it as a rich table.
train_df.head()

In [None]:
# Data types and missing values.
# info() prints each column's dtype and non-null count; a non-null count below
# the total number of entries means that column has missing values.
train_df.info()

In [None]:
# Statistical summary.
# With default arguments describe() covers only the numeric columns of a
# mixed-type frame: count, mean, std, min, quartiles and max.
train_df.describe()

## 4. Target Variable Analysis (SalePrice)

In [None]:
# Summary statistics for the target column
sale_price = train_df['SalePrice']

print("SalePrice Statistics:")
print(sale_price.describe())

# Median and mean printed together for a quick comparison
median_price = sale_price.median()
mean_price = sale_price.mean()
print(f"\nMedian Price: ${median_price:,.0f}")
print(f"Mean Price: ${mean_price:,.0f}")

In [None]:
# Two views of the SalePrice distribution: histogram (left) and box plot (right)
sale_price = train_df['SalePrice']
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: histogram with dashed reference lines at the mean and median
ax_hist.hist(sale_price, bins=50, color='skyblue', edgecolor='black')
ax_hist.set(
    xlabel='Sale Price ($)',
    ylabel='Frequency',
    title='Distribution of House Prices',
)
for stat_value, line_color, stat_label in (
    (sale_price.mean(), 'red', 'Mean'),
    (sale_price.median(), 'green', 'Median'),
):
    ax_hist.axvline(stat_value, color=line_color, linestyle='--', label=stat_label)
ax_hist.legend()

# Right panel: box plot of the same column
ax_box.boxplot(sale_price)
ax_box.set(ylabel='Sale Price ($)', title='Box Plot of House Prices')

fig.tight_layout()
plt.show()

## 5. Missing Values Analysis

In [None]:
# Null count per column, keeping only columns with at least one null,
# ordered from most to fewest missing
null_counts = train_df.isnull().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)

# Share of training rows that are null for each of those columns, in percent
missing_percent = (missing / len(train_df)) * 100

# Combine counts and percentages into one table indexed by column name
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_percent)

print("Features with Missing Values:")
print(missing_df)
print(f"\nTotal features with missing values: {len(missing_df)}")

In [None]:
# Horizontal bar chart of the first 20 rows of the missing-value table;
# skipped entirely when no column has missing data
if not missing_df.empty:
    fig, ax = plt.subplots(figsize=(12, 6))
    missing_df['Percentage'].iloc[:20].plot.barh(ax=ax, color='coral')
    ax.set_xlabel('Percentage Missing (%)')
    ax.set_title('Top 20 Features with Missing Values')
    fig.tight_layout()
    plt.show()

## 6. Correlation Analysis

In [None]:
# Correlation of every numeric column with SalePrice, sorted from the largest
# (most positive) coefficient downwards
numerical_features = train_df.select_dtypes(include='number').columns
corr_matrix = train_df[numerical_features].corr()
correlations = corr_matrix['SalePrice'].sort_values(ascending=False)

print("Top 15 Features Correlated with SalePrice:")
print(correlations.iloc[:15])

In [None]:
# Bar chart of the ten coefficients that follow the first entry of the sorted
# series (position 0 is SalePrice's correlation with itself)
fig, ax = plt.subplots(figsize=(10, 8))
top_corr = correlations.iloc[1:11]
top_corr.plot.barh(ax=ax, color='steelblue')
ax.set_xlabel('Correlation with SalePrice')
ax.set_title('Top 10 Features Correlated with House Price')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation heatmap over the first 11 entries of the sorted
# correlation series (SalePrice plus the ten columns ranked after it)
top_features = correlations.index[:11]
top_corr_matrix = train_df[top_features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    top_corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title('Correlation Heatmap - Top Features')
fig.tight_layout()
plt.show()

## 7. Feature Relationships with SalePrice

In [None]:
# Scatter plots of selected numerical features against SalePrice, each with a
# least-squares linear trend line
top_num_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF']

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

for ax, feature in zip(axes, top_num_features):
    # Keep only rows where both the feature and the target are present.
    # Filling missing feature values with 0 would add artificial points at
    # x=0 and pull the fitted line towards them.
    paired = train_df[[feature, 'SalePrice']].dropna()
    x_values = paired[feature]
    y_values = paired['SalePrice']

    ax.scatter(x_values, y_values, alpha=0.5, color='steelblue')
    ax.set_xlabel(feature)
    ax.set_ylabel('SalePrice')
    ax.set_title(f'{feature} vs SalePrice')

    # A degree-1 fit is only meaningful with at least two distinct x values
    if x_values.nunique() >= 2:
        slope, intercept = np.polyfit(x_values, y_values, 1)
        # A straight line is fully determined by its two end points
        x_line = np.array([x_values.min(), x_values.max()])
        ax.plot(x_line, slope * x_line + intercept, "r--", alpha=0.8)

plt.tight_layout()
plt.show()

## 8. Categorical Features Analysis

In [None]:
# Columns stored with object dtype are treated as the categorical features
categorical_features = train_df.select_dtypes(include='object').columns
n_categorical = len(categorical_features)

print(f"Number of categorical features: {n_categorical}")
print(f"\nCategorical features: {categorical_features.tolist()}")

In [None]:
# Mean SalePrice per category for four selected categorical columns,
# one panel per column
key_categorical = ['Neighborhood', 'BldgType', 'HouseStyle', 'SaleCondition']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for ax, feature in zip(axes, key_categorical):
    # A column that is absent from the frame leaves its panel empty
    if feature not in train_df.columns:
        continue

    avg_price = (
        train_df.groupby(feature)['SalePrice']
        .mean()
        .sort_values(ascending=False)
    )
    avg_price.plot.barh(ax=ax, color='teal')
    ax.set_xlabel('Average Sale Price ($)')
    ax.set_title(f'Average Price by {feature}')

fig.tight_layout()
plt.show()

## 9. Key Insights Summary

In [None]:
# Text recap of the quantities computed in the earlier cells: dataset size,
# target statistics, missing values and the strongest correlations.
banner = "=" * 60
print(banner)
print("KEY INSIGHTS FROM DATA EXPLORATION")
print(banner)

target = train_df['SalePrice']

# numerical_features was selected from the full frame, so it still contains the
# SalePrice target; leave it out so the numerical and categorical counts add up
# to the "Features" line (which also excludes the target column).
n_numerical = sum(1 for column in numerical_features if column != 'SalePrice')

print("\n1. Dataset Size:")
print(f"   - Training samples: {train_df.shape[0]}")
print(f"   - Features: {train_df.shape[1] - 1}")
print(f"   - Numerical features: {n_numerical}")
print(f"   - Categorical features: {len(categorical_features)}")

print("\n2. Target Variable (SalePrice):")
print(f"   - Mean: ${target.mean():,.0f}")
print(f"   - Median: ${target.median():,.0f}")
print(f"   - Range: ${target.min():,.0f} - ${target.max():,.0f}")

print("\n3. Missing Values:")
print(f"   - Features with missing data: {len(missing_df)}")
# Build the "most missing" text with an explicit branch: the table may be
# empty, and a conditional cannot span two separate f-string fields.
if len(missing_df) > 0:
    worst_feature = missing_df.index[0]
    worst_percent = missing_df['Percentage'].iloc[0]
    most_missing = f"{worst_feature} ({worst_percent:.1f}%)"
else:
    most_missing = "None (0%)"
print(f"   - Most missing: {most_missing}")

print("\n4. Top Correlated Features:")
# Position 0 of the sorted series is SalePrice itself, so list positions 1-5
for rank, (feature, corr) in enumerate(correlations.iloc[1:6].items(), 1):
    print(f"   {rank}. {feature}: {corr:.3f}")

print("\n5. Next Steps:")
for step in (
    "Handle missing values",
    "Encode categorical variables",
    "Scale numerical features",
    "Engineer new features",
    "Train machine learning models",
):
    print(f"   ✓ {step}")

print("\n" + banner)

## 10. Conclusion

This exploratory analysis revealed:

1. **Data Quality**: The dataset has some missing values that need to be handled
2. **Strong Predictors**: Features like OverallQual, GrLivArea, and GarageCars show strong correlation with price
3. **Feature Types**: Mix of numerical and categorical features requiring different preprocessing
4. **Target Distribution**: SalePrice shows a right-skewed distribution

**Next**: Proceed to model training with Ridge Regression and Random Forest algorithms.