In [12]:
import pandas as pd
import numpy as np

# Locations of the raw inputs and of the cleaned outputs written by this step.
RAW_TRAIN_PATH = 'D:/HousePricePredict/data/raw/train.csv'
RAW_TEST_PATH = 'D:/HousePricePredict/data/raw/test.csv'
CLEAN_TRAIN_PATH = 'D:/HousePricePredict/data/processed/train_clean.csv'
CLEAN_TEST_PATH = 'D:/HousePricePredict/data/processed/test_clean.csv'

# Categorical columns whose missing values are replaced by the literal
# category 'None' instead of being left as NaN.
NONE_FILL_COLS = ['PoolQC', 'Alley', 'Fence', 'MiscFeature', 'GarageType']


def clean_housing_frame(df, lot_frontage_fill):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Fill missing values in ``NONE_FILL_COLS`` with the string 'None'.
      2. Fill missing 'LotFrontage' values with ``lot_frontage_fill``.
      3. Convert 'MSSubClass' to string.
      4. Drop exact duplicate rows (the first occurrence is kept).

    Args:
        df: Frame containing the ``NONE_FILL_COLS`` columns plus
            'LotFrontage' and 'MSSubClass'.
        lot_frontage_fill: Value used for missing 'LotFrontage' entries.
            Pass the statistic computed on the training data so that every
            frame is imputed with the same value.

    Returns:
        A new DataFrame with the steps above applied.

    Raises:
        KeyError: If one of the required columns is absent from ``df``.
    """
    cleaned = df.copy()

    for col in NONE_FILL_COLS:
        cleaned[col] = cleaned[col].fillna('None')

    cleaned['LotFrontage'] = cleaned['LotFrontage'].fillna(lot_frontage_fill)
    cleaned['MSSubClass'] = cleaned['MSSubClass'].astype(str)

    return cleaned.drop_duplicates()


if __name__ == "__main__":
    # Load datasets
    train_df = pd.read_csv(RAW_TRAIN_PATH)
    test_df = pd.read_csv(RAW_TEST_PATH)

    # The imputation value is learned from the training data only and then
    # reused for the test data, so both frames are preprocessed identically
    # and no test-set statistic influences the preprocessing.
    lot_frontage_median = train_df['LotFrontage'].median()

    train_df = clean_housing_frame(train_df, lot_frontage_median)
    test_df = clean_housing_frame(test_df, lot_frontage_median)

    # Save cleaned files
    train_df.to_csv(CLEAN_TRAIN_PATH, index=False)
    test_df.to_csv(CLEAN_TEST_PATH, index=False)
