# House Price Prediction - Team Sindorai (EM06)

This notebook trains and compares several regression models (linear, decision-tree, and ensemble) to predict house prices, tunes the best performer, and writes predictions for the test set to a CSV file.

## Team Information
- **Team Name:** Sindorai
- **Team Code:** EM06
- **Task:** Task 2 - House Price Prediction (Regression)

## 1. Import Required Libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Notebook-wide display and plotting defaults
pd.options.display.max_columns = None  # show all columns of wide frames
plt.style.use('ggplot')
sns.set_palette('husl')

## 2. Load and Explore the Dataset

In [None]:
# Read the training and test splits from the working directory
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Report the shapes first, then the column lists, for both splits
datasets = (("Training", train_df), ("Test", test_df))
for label, frame in datasets:
    print(f"{label} dataset shape: {frame.shape}")
for label, frame in datasets:
    print(f"\n{label} dataset columns: {list(frame.columns)}")

In [None]:
# Display basic information about the training dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Training Dataset Info:")
train_df.info()
print("\nFirst 5 rows:")
train_df.head()

In [None]:
# Descriptive statistics of the training dataset, shown as the cell's output
summary_stats = train_df.describe()
print("Statistical Summary:")
summary_stats

In [None]:
# Count nulls per column in each split and list only the affected columns
missing_train = train_df.isna().sum()
missing_test = test_df.isna().sum()

null_reports = (
    ("Missing values in training dataset:", missing_train),
    ("\nMissing values in test dataset:", missing_test),
)
for heading, null_counts in null_reports:
    print(heading)
    print(null_counts.loc[null_counts > 0])

## 3. Data Exploration and Visualization

In [None]:
# Compare the raw and the log-transformed target distribution side by side
prices = train_df['target_price']
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 4))

ax_raw.hist(prices, bins=50, alpha=0.7)
ax_raw.set_title('Distribution of House Prices')
ax_raw.set_xlabel('Price')
ax_raw.set_ylabel('Frequency')

ax_log.hist(np.log(prices), bins=50, alpha=0.7)
ax_log.set_title('Distribution of Log House Prices')
ax_log.set_xlabel('Log Price')
ax_log.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Headline statistics of the target
print(f"Price statistics:")
price_stats = (
    ("Mean", prices.mean()),
    ("Median", prices.median()),
    ("Min", prices.min()),
    ("Max", prices.max()),
)
for label, value in price_stats:
    print(f"{label}: ${value:,.2f}")

In [None]:
# Pairwise correlations between all numeric columns of the training data
numeric_columns = train_df.select_dtypes(include=[np.number]).columns
correlation_matrix = train_df[numeric_columns].corr()

plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.show()

# Rank features by absolute correlation with the target. The target's own row
# is dropped first: its self-correlation of 1.0 would otherwise always take the
# top slot and push a real feature out of the top-10 listing.
price_corr = (
    correlation_matrix['target_price']
    .drop('target_price')
    .abs()
    .sort_values(ascending=False)
)
print("Features most correlated with price:")
print(price_corr.head(10))

## 4. Data Preprocessing and Feature Engineering

In [None]:
# Work on copies so the frames loaded from disk stay untouched
train_processed = train_df.copy()
test_processed = test_df.copy()

# Reference year used for the age features below
current_year = 2025

# Apply identical feature engineering to both splits
for frame in (train_processed, test_processed):
    # Binary flag: renovated_year > 0 means the house was renovated.
    # A missing value compares False, so it is flagged as not renovated.
    frame['is_renovated'] = (frame['renovated_year'] > 0).astype(int)

    # Missing renovated_year is treated like 0 ("never renovated").
    # The filled column is assigned back instead of calling
    # frame['renovated_year'].fillna(0, inplace=True): that form works on a
    # temporary Series and leaves the frame unchanged when pandas
    # Copy-on-Write is active, so the NaNs would survive into modeling.
    frame['renovated_year'] = frame['renovated_year'].fillna(0)

    # Age of the house relative to the reference year
    frame['house_age'] = current_year - frame['built_year']

    # Years since the last renovation; houses that were never renovated
    # fall back to their full age
    frame['years_since_renovation'] = np.where(
        frame['renovated_year'] > 0,
        current_year - frame['renovated_year'],
        frame['house_age']
    )

# Price per unit of living area. It is derived from the target, so it exists
# for the training split only and must not be used as a model input (it is
# excluded when the feature list is built).
train_processed['price_per_sqft'] = train_processed['target_price'] / train_processed['living_area']

print("Feature engineering completed.")
print(f"New features created: {['is_renovated', 'house_age', 'years_since_renovation']}")

In [None]:
# Replace sale_date with numeric year / month / quarter columns in both splits
date_parts = ('year', 'month', 'quarter')

for frame in (train_processed, test_processed):
    frame['sale_date'] = pd.to_datetime(frame['sale_date'])
    for part in date_parts:
        frame[f'sale_{part}'] = getattr(frame['sale_date'].dt, part)
    # The raw timestamp is no longer needed once its parts are extracted
    frame.drop(columns='sale_date', inplace=True)

print("Temporal features extracted from sale_date.")

In [None]:
# Everything except the identifier, the target and the target-derived
# price_per_sqft column is used as a model input
excluded_columns = {'house_id', 'target_price', 'price_per_sqft'}
feature_columns = [
    column for column in train_processed.columns
    if column not in excluded_columns
]

X = train_processed[feature_columns]
y = train_processed['target_price']
X_test = test_processed[feature_columns]

print(f"Features for modeling: {feature_columns}")
shape_report = (
    ("Training features", X),
    ("Test features", X_test),
    ("Target", y),
)
for label, data in shape_report:
    print(f"{label} shape: {data.shape}")

## 5. Model Building and Training

In [None]:
# Hold out 20% of the labelled data for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for label, subset in (("Training", X_train), ("Validation", X_val)):
    print(f"{label} set shape: {subset.shape}")

In [None]:
# Candidate regressors, keyed by display name; all seeded identically
seed = 42
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=1.0, random_state=seed)),
    ('Lasso Regression', Lasso(alpha=1.0, random_state=seed)),
    ('Decision Tree', DecisionTreeRegressor(random_state=seed)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=seed)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=seed)),
])

def calculate_metrics(y_true, y_pred):
    """Score predictions against ground truth.

    Parameters:
        y_true: observed target values.
        y_pred: predicted target values, in the same order as ``y_true``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and the R² score.
    """
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Fit every candidate on the training split and score it on both splits
model_results = {}

# (display label, result-key suffix, number format) for the printed report
report_layout = (
    ('RMSE', 'rmse', ',.2f'),
    ('MAE', 'mae', ',.2f'),
    ('R²', 'r2', '.4f'),
)
splits = (('train', X_train, y_train), ('val', X_val, y_val))

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Collect the fitted model and its train_* / val_* metrics
    scores = {'model': model}
    for prefix, features, target in splits:
        rmse, mae, r2 = calculate_metrics(target, model.predict(features))
        scores[f'{prefix}_rmse'] = rmse
        scores[f'{prefix}_mae'] = mae
        scores[f'{prefix}_r2'] = r2
    model_results[name] = scores

    for label, key, spec in report_layout:
        print(f"  Training {label}: {scores['train_' + key]:{spec}}")
        print(f"  Validation {label}: {scores['val_' + key]:{spec}}")
    print("-" * 50)

In [None]:
# One row per model; table column name -> key in each model_results entry
metric_columns = {
    'Train_RMSE': 'train_rmse',
    'Val_RMSE': 'val_rmse',
    'Train_MAE': 'train_mae',
    'Val_MAE': 'val_mae',
    'Train_R2': 'train_r2',
    'Val_R2': 'val_r2',
}
comparison_rows = [
    {'Model': name, **{column: scores[key] for column, key in metric_columns.items()}}
    for name, scores in model_results.items()
]
results_df = pd.DataFrame(comparison_rows, columns=['Model', *metric_columns])

# Best model first: lower validation RMSE is better
results_df = results_df.sort_values('Val_RMSE')
print("Model Comparison Results:")
print(results_df)

## 6. Model Hyperparameter Tuning

In [None]:
# The comparison table is sorted by validation RMSE, so row 0 is the winner
best_model_name = results_df['Model'].iloc[0]
print(f"Best performing model: {best_model_name}")

# Search spaces for the model families that get tuned, keyed by the name
# fragment that identifies them; any other model is used as already fitted
tunable_models = {
    'Random Forest': (RandomForestRegressor, {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    'Gradient Boosting': (GradientBoostingRegressor, {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.05, 0.1, 0.15],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0]
    }),
}
matched_family = next(
    (family for family in tunable_models if family in best_model_name),
    None
)

if matched_family is None:
    # No search space defined: keep the model fitted during the comparison
    base_model = model_results[best_model_name]['model']
    param_grid = None
    best_model = base_model
    print(f"Using original {best_model_name} model.")
else:
    estimator_class, param_grid = tunable_models[matched_family]
    base_model = estimator_class(random_state=42)

    print(f"Performing hyperparameter tuning for {best_model_name}...")
    # 5-fold grid search on the training split, scored by negative MSE
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")

## 7. Final Model Evaluation

In [None]:
# Measure performance on the hold-out validation split BEFORE refitting.
# best_model has only been fitted on X_train here (by the comparison loop or
# by GridSearchCV's refit), so X_val is unseen data. Scoring the refitted model
# on the data it was trained on would report optimistic in-sample numbers.
y_val_final_pred = best_model.predict(X_val)
final_rmse, final_mae, final_r2 = calculate_metrics(y_val, y_val_final_pred)

# Refit a fresh copy on the complete training set for the test predictions.
# clone() leaves best_model untouched, so re-running this cell still evaluates
# a model that has never seen the validation rows.
print("Training final model on complete training dataset...")
final_model = clone(best_model)
final_model.fit(X, y)

# In-sample predictions of the refitted model, used by the diagnostic plots
y_train_final_pred = final_model.predict(X)

print("\n" + "="*60)
print(f"FINAL MODEL PERFORMANCE: {type(final_model).__name__}")
print("Evaluated on the hold-out validation split (model fitted on X_train only)")
print("="*60)
print(f"RMSE (Root Mean Squared Error): {final_rmse:,.2f}")
print(f"MAE (Mean Absolute Error): {final_mae:,.2f}")
print(f"R² Score: {final_r2:.4f}")
print("="*60)

# Store metrics for screenshot
final_metrics = {
    'RMSE': final_rmse,
    'MAE': final_mae,
    'R2': final_r2
}

In [None]:
# Diagnostic plots for the final model's predictions on the training data
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: predictions against actual prices, with the identity line
ax_fit.scatter(y, y_train_final_pred, alpha=0.5)
price_range = [y.min(), y.max()]
ax_fit.plot(price_range, price_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Price')
ax_fit.set_ylabel('Predicted Price')
ax_fit.set_title('Actual vs Predicted Prices')

# Right panel: residuals against predictions, with a zero reference line
residuals = y - y_train_final_pred
ax_resid.scatter(y_train_final_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Price')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')

plt.tight_layout()
plt.show()

## 8. Feature Importance Analysis

In [None]:
# Only estimators exposing feature_importances_ can be ranked this way
if not hasattr(final_model, 'feature_importances_'):
    print("Feature importance not available for this model type.")
else:
    importance_by_feature = pd.DataFrame({
        'feature': feature_columns,
        'importance': final_model.feature_importances_
    })
    feature_importance = importance_by_feature.sort_values('importance', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(data=feature_importance.head(15), x='importance', y='feature', ax=ax)
    ax.set_title('Top 15 Feature Importances')
    ax.set_xlabel('Importance')
    plt.tight_layout()
    plt.show()

    print("Top 10 Most Important Features:")
    print(feature_importance.head(10))

## 9. Generate Predictions for Test Dataset

In [None]:
# Score the test split with the final model
print("Generating predictions for test dataset...")
test_predictions = final_model.predict(X_test)

# Submission table: one predicted price per house_id
submission_df = test_processed[['house_id']].assign(predicted_price=test_predictions)

print(f"Generated {len(submission_df)} predictions.")
print("\nFirst 10 predictions:")
print(submission_df.head(10))

# Summary statistics of the predicted prices
predicted_prices = submission_df['predicted_price']
prediction_stats = (
    ("Mean", predicted_prices.mean()),
    ("Median", predicted_prices.median()),
    ("Min", predicted_prices.min()),
    ("Max", predicted_prices.max()),
)
print(f"\nPrediction statistics:")
for label, value in prediction_stats:
    print(f"{label} predicted price: ${value:,.2f}")

In [None]:
# Write the submission table to disk without the index column
submission_filename = 'EM06_Sindorai_Task2_Predictions.csv'
submission_df.to_csv(submission_filename, index=False)
print(f"Predictions saved to {submission_filename}")

# Read the file back to confirm its row count and columns
verification_df = pd.read_csv(submission_filename)
row_count = len(verification_df)
column_names = list(verification_df.columns)
print(f"\nVerification - File contains {row_count} rows")
print(f"Columns: {column_names}")

## 10. Summary and Conclusions

In [None]:
# Assemble the closing report line by line and emit it in a single print
banner = "=" * 80
summary_lines = [
    "\n" + banner,
    "HOUSE PRICE PREDICTION MODEL SUMMARY",
    banner,
    "Team: Sindorai (EM06)",
    f"Final Model: {type(final_model).__name__}",
    f"Training Dataset Size: {len(train_df)} houses",
    f"Test Dataset Size: {len(test_df)} houses",
    f"Number of Features: {len(feature_columns)}",
    "\nFINAL PERFORMANCE METRICS:",
    f"  RMSE: {final_metrics['RMSE']:,.2f}",
    f"  MAE:  {final_metrics['MAE']:,.2f}",
    f"  R²:   {final_metrics['R2']:.4f}",
    "\nOUTPUT FILES GENERATED:",
    "  1. Notebook: EM06_Sindorai_Task2_HousePrice.ipynb",
    "  2. Predictions: EM06_Sindorai_Task2_Predictions.csv",
    "  3. Metrics Screenshot: EM06_Sindorai_Task2_Metrics.png (to be captured)",
    banner,
]
print("\n".join(summary_lines))