# China Real Estate Demand Prediction - EDA

This notebook explores the training data: distributions, missingness, correlations, and time series patterns.

- Load all CSVs from `data/raw/train`
- Parse `month` and `sector` into structured fields
- Visualize distributions and trends
- Identify leakage risks (features for a given month must not use information from later months)



In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
import plotly.express as px

import sys


def _find_project_root(start: Path, marker: str = 'src') -> Path:
    """Return the closest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory to begin the upward search from.
        marker: Name of a sub-directory that identifies the project root.

    Returns:
        The first directory (``start`` itself or one of its ancestors) that
        has a ``marker`` sub-directory.

    Raises:
        FileNotFoundError: If no ancestor of ``start`` contains ``marker``.
    """
    start = start.resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    raise FileNotFoundError(
        f"Could not find a '{marker}' directory at or above {start}; "
        "start the notebook from inside the project checkout."
    )


# The earlier hard-coded `Path('..').resolve().parent` depended on the kernel's
# working directory and the recorded run failed with
# "ModuleNotFoundError: No module named 'src'". Searching upward for the `src`
# directory makes the import work wherever inside the project the kernel starts.
PROJECT_ROOT = _find_project_root(Path.cwd())
if str(PROJECT_ROOT) not in sys.path:  # avoid stacking duplicates on cell re-runs
    sys.path.append(str(PROJECT_ROOT))

from src.data import DatasetPaths, load_all_training_tables, split_month_sector
from src.features import aggregate_monthly_totals

# root_dir receives the same directory that was put on sys.path, as before.
# NOTE(review): assumes the data root is the directory that contains `src` - confirm.
paths = DatasetPaths(root_dir=str(PROJECT_ROOT))
train = load_all_training_tables(paths)

# Pull the new-house-transactions table out of the loaded tables and pass it
# through split_month_sector to get the augmented frame used by later cells.
nht = train['new_house_transactions']
nht_aug = split_month_sector(nht)

# Bare last expression so the notebook renders the preview as a rich table.
nht_aug.head()



ModuleNotFoundError: No module named 'src'

: 

In [None]:
# Side-by-side histograms (50 bins each) of the two transaction columns
hist_columns = ['amount_new_house_transactions', 'price_new_house_transactions']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for panel, column in zip(axes, hist_columns):
    sns.histplot(nht_aug[column], bins=50, ax=panel)
    panel.set_title(f'Distribution of {column}')
fig.tight_layout()
plt.show()

# Line chart of the output of aggregate_monthly_totals, plotted against `time`
# (presumably totals across sectors - verify against src.features)
monthly_total = aggregate_monthly_totals(nht)
trend_fig = px.line(
    monthly_total,
    x='time',
    y='amount_new_house_transactions',
    title='Total amount per time',
)
trend_fig.show()



In [None]:
# Share of missing values per column, highest first; show the first 20 entries
na_share = nht_aug.isna().mean()
miss = na_share.sort_values(ascending=False)
print(miss.iloc[:20])



In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
# with the colour scale centred on zero
numeric_part = nht_aug.select_dtypes(include=['number'])
num_cols = numeric_part.columns
corr = nht_aug[num_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation heatmap (new_house_transactions)')
plt.show()

