In [None]:
#!/usr/bin/env python
# coding: utf-8

# Exploratory Data Analysis for HDB Flat Resale Price Prediction<br>
<br>
This notebook contains comprehensive exploratory data analysis of the HDB flat resale price dataset.

Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import yaml

Set display options

In [None]:
# Show every column when a frame is displayed, but cap displayed rows at 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

Set style for plots

In [None]:
# Configure plot appearance in a single seaborn call.
# sns.set() is an alias of sns.set_theme(), which applies its own default style
# ("darkgrid") to matplotlib's rcParams. Calling it after
# plt.style.use('seaborn-v0_8-whitegrid') therefore replaced the whitegrid look.
# Passing style='whitegrid' explicitly makes the intended style take effect
# together with the enlarged font scale.
sns.set_theme(style='whitegrid', font_scale=1.2)

## 1. Load and Examine Data

Load configuration

In [None]:
# Read the project configuration (data paths and feature/target column names).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default encoding.
# NOTE(review): the path is relative, so this cell presumably expects the
# kernel's working directory to be the project root — confirm.
with open('src/config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

Load data

In [None]:
# Read the training CSV whose location is given in the configuration.
train_path = config['data']['train_path']
df = pd.read_csv(train_path)

Display basic information

In [None]:
# Report the frame's dimensions, then preview its first five rows.
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:", df.head(), sep="\n")

In [None]:
# List the dtype pandas inferred for each column.
column_dtypes = df.dtypes
print("\nData types:", column_dtypes, sep="\n")

In [None]:
# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print("\nSummary statistics:", summary_stats, sep="\n")

## 2. Data Quality Assessment

Check for missing values

In [None]:
# Per-column count of null entries, and the same expressed as a percentage of
# the total number of rows.
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

In [None]:
# Print one line per column: name, null count, and null percentage (2 decimals).
print("Missing values per column:")
for (col, count), percentage in zip(missing_values.items(), missing_percentage):
    print(f"{col}: {count} ({percentage:.2f}%)")

Visualize missing values

In [None]:
# Bar chart of the percentage of missing entries in each column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=df.columns, y=missing_percentage, ax=ax)
ax.set_title('Missing Values Percentage by Column')
ax.set_ylabel('Percentage')
# Slant the column names so long labels do not run into each other.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 3. Target Variable Analysis

In [None]:
# Name of the target column, as declared in the configuration.
feature_config = config['features']
target_col = feature_config['target_column']

Distribution of target variable

In [None]:
# Histogram of the target with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df[target_col], kde=True, ax=ax)
ax.set_title(f'Distribution of {target_col}')
ax.set_xlabel(target_col)
ax.set_ylabel('Count')
plt.show()

Box plot to identify outliers

In [None]:
# Vertical box plot of the target; points beyond the whiskers are candidate outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y=df[target_col], ax=ax)
ax.set_title(f'Box Plot of {target_col}')
plt.show()

Summary statistics of target variable

In [None]:
# Descriptive statistics for the target column only.
target_series = df[target_col]
print(f"Summary statistics of {target_col}:")
print(target_series.describe())

## 4. Numerical Features Analysis

In [None]:
# Names of the numerical feature columns, as declared in the configuration.
feature_config = config['features']
numerical_cols = feature_config['numerical_columns']

Correlation matrix

In [None]:
# Pairwise correlations among the numerical features and the target, drawn as
# an annotated heatmap (values shown to two decimals).
corr_columns = numerical_cols + [target_col]
correlation_matrix = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
fig.tight_layout()
plt.show()

Pair plot for numerical features

In [None]:
# Scatter-plot matrix of the numerical features plus the target, with KDE
# curves on the diagonal. The title is lifted slightly (y=1.02) above the grid.
pairplot_columns = numerical_cols + [target_col]
sns.pairplot(df[pairplot_columns], diag_kind='kde')
plt.suptitle('Pair Plot of Numerical Features', y=1.02)
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Names of the categorical feature columns, as declared in the configuration.
feature_config = config['features']
categorical_cols = feature_config['categorical_columns']

Count plots for categorical features

In [None]:
# One bar chart per categorical column showing how often each category occurs.
for col in categorical_cols:
    # value_counts() sorts by frequency, so head(10) keeps the ten most common
    # categories and is a no-op when there are ten or fewer.
    top_counts = df[col].value_counts().head(10)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=top_counts.index, y=top_counts.values, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

Box plots for categorical features vs target

In [None]:
# One box plot per categorical column: target distribution within each category.
for col in categorical_cols:
    # Default to the full frame; narrow to the ten most frequent categories
    # only when the column has more than ten distinct values.
    plot_data = df
    if df[col].nunique() > 10:
        top_categories = df[col].value_counts().head(10).index
        plot_data = df[df[col].isin(top_categories)]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=col, y=target_col, data=plot_data, ax=ax)
    ax.set_title(f'{target_col} by {col}')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## 6. Feature Engineering Insights

Convert lease_commence_date to datetime

In [None]:
# Convert lease_commence_date to a datetime column.
# pd.to_datetime() interprets plain numbers as nanoseconds since the Unix epoch,
# so a numeric year such as 1979 would become 1970-01-01 00:00:00.000001979 and
# every derived `.dt.year` would be 1970. When the column is numeric it is
# therefore parsed explicitly as a four-digit year.
# NOTE(review): assumes a numeric column holds bare years — confirm against the CSV.
lease_commence = df['lease_commence_date']
if pd.api.types.is_numeric_dtype(lease_commence):
    # Int64 drops any ".0" left by float parsing; the nullable string dtype
    # keeps missing entries as NA so they become NaT rather than raising.
    lease_year_text = lease_commence.astype('Int64').astype('string')
    df['lease_commence_date'] = pd.to_datetime(lease_year_text, format='%Y')
else:
    # Strings and already-converted datetimes go through the default parser,
    # which also keeps this cell safe to re-run.
    df['lease_commence_date'] = pd.to_datetime(lease_commence)

Calculate the age of each flat in years, measured from its lease commencement year to the current calendar year

In [None]:
# Age in years = current calendar year minus the lease commencement year.
# NOTE(review): the result depends on the date the notebook is run, so values
# shift from year to year — confirm this is intended.
current_year = pd.Timestamp.now().year
lease_year = df['lease_commence_date'].dt.year
df['flat_age'] = current_year - lease_year

Calculate price per square meter

In [None]:
# Unit price: the target column divided element-wise by 'floor_area_sqm'.
floor_area = df['floor_area_sqm']
df['price_per_sqm'] = df[target_col].div(floor_area)

Plot flat age vs resale price

In [None]:
# Scatter of the target against flat age; semi-transparent markers (alpha=0.5)
# keep dense regions readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='flat_age', y=target_col, data=df, alpha=0.5, ax=ax)
ax.set_title('Resale Price vs Flat Age')
ax.set_xlabel('Flat Age (years)')
ax.set_ylabel(target_col)
plt.show()

Plot price per square meter distribution

In [None]:
# Histogram of the derived price-per-square-meter column with a KDE overlay.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['price_per_sqm'], kde=True, ax=ax)
ax.set_title('Distribution of Price per Square Meter')
ax.set_xlabel('Price per Square Meter')
ax.set_ylabel('Count')
plt.show()

## 7. Summary of Findings

### Key Insights:<br>
<br>
1. **Data Quality**:<br>
   - **TODO:** fill in after running the notebook<br>
<br>
2. **Target Variable (Resale Price)**:<br>
   - **TODO:** fill in after running the notebook<br>
<br>
3. **Feature Relationships**:<br>
   - **TODO:** fill in after running the notebook<br>
<br>
4. **Feature Engineering Opportunities**:<br>
   - **TODO:** fill in after running the notebook<br>
<br>
5. **Model Selection Considerations**:<br>
   - **TODO:** fill in after running the notebook