In [None]:
# %% [markdown]
# # House Price Prediction - Data Preprocessing
# ## Task 1: Data Preprocessing for Machine Learning

# %% [markdown]
# ### Import Required Libraries

# %%
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Display options: never truncate the column list, and let pandas pick the width
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# %% [markdown]
# ### 1. Load and Explore the Dataset

# %%
# Load the dataset: a whitespace-delimited text file without a header row.
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` option.
df = pd.read_csv('../data/house_prediction.csv', header=None, sep=r'\s+')

# Add column names (based on Boston Housing dataset).
# Assigning after the read keeps the length check: a file whose column count
# differs from this list raises instead of being silently mislabelled.
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.columns = column_names

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

print("\nBasic Statistics:")
print(df.describe())

# %% [markdown]
# ### 2. Check for Missing Data

# %%
# Count the missing entries per column and keep only the affected columns
missing_data = df.isnull().sum()
columns_with_gaps = missing_data[missing_data > 0]

print("Missing values in each column:")
if columns_with_gaps.empty:
    print("No missing values found!")
else:
    print(columns_with_gaps)

# Heatmap of the null mask: one row per record, one column per feature
null_mask = df.isnull()
plt.figure(figsize=(10, 6))
sns.heatmap(null_mask, cmap='viridis', yticklabels=False, cbar=False)
plt.title('Missing Data Heatmap')
plt.show()

# %% [markdown]
# ### 3. Handle Missing Data (if any)

# %%
# Strategy 1: Fill missing entries with the column mean
df_mean_filled = df.copy()
for col in df_mean_filled.columns:
    if df_mean_filled[col].isnull().any():
        col_mean = df_mean_filled[col].mean()
        # Assign the result back instead of calling fillna(inplace=True) on
        # df_mean_filled[col]: that call acts on a temporary Series and, under
        # pandas Copy-on-Write, leaves the DataFrame unchanged.
        df_mean_filled[col] = df_mean_filled[col].fillna(col_mean)
        print(f"Filled {col} with mean: {col_mean:.2f}")

# Strategy 2: Drop rows with missing values
df_dropped = df.dropna()

# Compare shapes
print(f"\nOriginal shape: {df.shape}")
print(f"After dropping missing values: {df_dropped.shape}")
print(f"After filling with mean: {df_mean_filled.shape}")

# %% [markdown]
# ### 4. Check for Categorical Variables

# %%
# Show the dtype pandas inferred for every column
print("Data types:")
print(df.dtypes)

# Columns stored with the generic object dtype are reported as categorical
categorical_cols = list(df.select_dtypes(include=['object']).columns)
print(f"\nCategorical columns: {categorical_cols or 'None found'}")

# Number of distinct values per column
print("\nUnique values in each column:")
for col, distinct_count in df.nunique().items():
    print(f"{col}: {distinct_count} unique values")

# For this dataset, CHAS and RAD might be categorical
categorical_cols_to_encode = ['CHAS', 'RAD']
print(f"\nColumns to encode: {categorical_cols_to_encode}")

# %% [markdown]
# ### 5. Encode Categorical Variables

# %%
# Create a copy for encoding.
# NOTE: df_encoded itself is not modified in this cell; the encoded variants
# are df_one_hot and df_label_encoded created below.
df_encoded = df.copy()

# Method 1: One-Hot Encoding.
# Driven by categorical_cols_to_encode so the column list lives in one place.
df_one_hot = pd.get_dummies(
    df_encoded,
    columns=categorical_cols_to_encode,
    prefix=categorical_cols_to_encode,
)
print("After One-Hot Encoding:")
print(f"Shape: {df_one_hot.shape}")
print(df_one_hot.head())

# Method 2: Label Encoding (each distinct value becomes an integer code 0..n-1).
# A separate encoder is fitted per column and kept in label_encoders, so each
# column's value-to-code mapping stays available; refitting one shared encoder
# would keep only the mapping of the last column.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# OrdinalEncoder is the feature-side equivalent - consider switching.
df_label_encoded = df_encoded.copy()
label_encoders = {}
for col in categorical_cols_to_encode:
    label_encoder = LabelEncoder()
    df_label_encoded[col] = label_encoder.fit_transform(df_label_encoded[col])
    label_encoders[col] = label_encoder
print("\nAfter Label Encoding:")
print(df_label_encoded[categorical_cols_to_encode].head())

# %% [markdown]
# ### 6. Check for Outliers

# %%
# Visualize distributions and outliers: one box plot per column.
# The grid is sized from the number of columns, so no column is silently
# skipped when the frame has more columns than a fixed grid would hold.
plot_columns = list(df_encoded.columns)
n_plot_cols = 5
n_plot_rows = max(1, int(np.ceil(len(plot_columns) / n_plot_cols)))
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(20, 4 * n_plot_rows))
axes = axes.ravel()

for ax, col in zip(axes, plot_columns):
    # Drop NaNs first: with NaNs present the quartiles come out as NaN and
    # matplotlib draws an empty panel instead of a box.
    ax.boxplot(df_encoded[col].dropna())
    ax.set_title(col)
    ax.set_ylabel('Value')

# Hide the panels left over when the grid has more cells than columns
for ax in axes[len(plot_columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Use IQR method to detect outliers
def detect_outliers_iqr(data, column):
    """Count the values of one column that fall outside the 1.5 * IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column to inspect.

    Returns
    -------
    tuple
        ``(outlier_count, lower_bound, upper_bound)`` where the bounds are
        ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` and ``outlier_count`` is the
        number of rows strictly below the lower or strictly above the upper
        bound.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_offset = 1.5 * (third_quartile - first_quartile)

    lower_bound = first_quartile - fence_offset
    upper_bound = third_quartile + fence_offset

    # Strict comparisons: values exactly on a fence are not counted
    beyond_fences = (values < lower_bound) | (values > upper_bound)
    return int(beyond_fences.sum()), lower_bound, upper_bound

# Report, for every column, how many values lie outside its IQR fences
header_rule = "-" * 50
print("Outlier Detection using IQR method:")
print(header_rule)
for feature in df_encoded.columns:
    n_flagged, lower_fence, upper_fence = detect_outliers_iqr(df_encoded, feature)
    print(f"{feature}: {n_flagged} outliers (bounds: {lower_fence:.2f} - {upper_fence:.2f})")

# %% [markdown]
# ### 7. Normalize/Standardize Numerical Features

# %%
# Separate features and target
X = df_encoded.drop(columns='MEDV')
y = df_encoded['MEDV']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

# NOTE: both scalers below are fitted on all rows of X.

# Method 1: Standardization (Z-score normalization).
# index=X.index keeps the scaled frame aligned with y; without it the frame
# gets a fresh 0..n-1 index and a later column-wise concat with y would
# misalign whenever X does not carry the default index.
scaler_standard = StandardScaler()
X_standardized = pd.DataFrame(
    scaler_standard.fit_transform(X), columns=X.columns, index=X.index
)

print("\nAfter Standardization (mean=0, std=1):")
print("Mean values:")
print(X_standardized.mean().round(2))
print("\nStd values:")
print(X_standardized.std().round(2))

# Method 2: Min-Max Normalization (same index handling as above)
scaler_minmax = MinMaxScaler()
X_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X), columns=X.columns, index=X.index
)

print("\nAfter Min-Max Normalization (range 0-1):")
print("Min values:")
print(X_normalized.min())
print("\nMax values:")
print(X_normalized.max())

# %% [markdown]
# ### 8. Split Dataset into Training and Testing Sets

# %%
# Split the raw features first, then standardize with training-set statistics.
# Splitting features that were standardized on the full dataset would let the
# test rows influence the mean/std applied to the training rows (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# Fit on the training rows only; the test rows are transformed with the same
# fitted parameters. Column names and row index are carried over.
split_scaler = StandardScaler()
X_train = pd.DataFrame(
    split_scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

print("Dataset Split:")
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")
print(f"Training target size: {y_train.shape}")
print(f"Testing target size: {y_test.shape}")

# Check distribution of target in train and test
print("\nTarget variable statistics:")
print("Training set - MEDV:")
print(y_train.describe())
print("\nTesting set - MEDV:")
print(y_test.describe())

# %% [markdown]
# ### 9. Save Processed Data

# %%
# Save the processed datasets.
# The output directory is created first so the cell also works on a fresh
# checkout where '../output' does not exist yet.
output_dir = Path('../output')
output_dir.mkdir(parents=True, exist_ok=True)

X_train.to_csv(output_dir / 'X_train.csv', index=False)
X_test.to_csv(output_dir / 'X_test.csv', index=False)
y_train.to_csv(output_dir / 'y_train.csv', index=False)
y_test.to_csv(output_dir / 'y_test.csv', index=False)

# Save the full processed dataset (standardized features plus the target)
full_processed = pd.concat([X_standardized, y], axis=1)
full_processed.to_csv(output_dir / 'house_prices_processed.csv', index=False)

print("All processed data saved successfully!")

# %% [markdown]
# ### 10. Summary Report

# %%
# Create a preprocessing summary.
# The status entries are derived from the data that was actually produced,
# so the report cannot claim a step that the processed features do not reflect.

# Missing values: compare what was found in the raw data with what is left
remaining_missing = int(full_processed.isnull().sum().sum())
if missing_data.sum() == 0:
    missing_status = 'No missing values found'
elif remaining_missing == 0:
    missing_status = 'Handled (no missing values remain in processed data)'
else:
    missing_status = f'Not handled ({remaining_missing} missing values remain in processed data)'

# Encoding: one-hot columns produced by get_dummies are named '<prefix>_<value>'
one_hot_in_features = any(
    str(name).startswith(('CHAS_', 'RAD_')) for name in X.columns
)
encoding_status = (
    'One-Hot Encoding used'
    if one_hot_in_features
    else 'None applied to features (CHAS and RAD kept as numeric)'
)

# Split ratio computed from the actual set sizes
total_samples = len(X_train) + len(X_test)
train_pct = round(100 * len(X_train) / total_samples)
split_status = f'{train_pct}-{100 - train_pct} split'

summary = {
    'Original Shape': df.shape,
    'Processed Shape': full_processed.shape,
    'Features': list(X.columns),
    'Target': 'MEDV',
    'Missing Values Handled': missing_status,
    'Categorical Encoding': encoding_status,
    'Scaling Method': 'Standardization (Z-score)',
    'Train-Test Split': split_status,
    'Training Samples': len(X_train),
    'Testing Samples': len(X_test)
}

print("\n" + "="*50)
print("PREPROCESSING SUMMARY")
print("="*50)
for key, value in summary.items():
    print(f"{key}: {value}")