# In [None]:
# Mumbai House Price Prediction - Model Training
# Save this as: mumbai_house_price_model.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

# Load the housing data.
# Note: California housing data stands in for Mumbai housing data here.

from sklearn.datasets import fetch_california_housing

california_housing = fetch_california_housing()

# Original dataset column -> name used throughout the rest of this notebook.
# Dict order defines the final column order of the frame.
column_names = {
    'MedInc': 'Income',
    'HouseAge': 'HouseAge',
    'AveRooms': 'AvgRooms',
    'AveBedrms': 'AvgBedrooms',
    'Population': 'Population',
    'AveOccup': 'AvgOccupancy',
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
    'target': 'Price',
}

# Put features and target in one frame, keep only the mapped columns
# (in mapping order), then relabel them.
df = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
df['target'] = california_housing.target
df = df[list(column_names)]
df.columns = list(column_names.values())

# Quick structural overview of the assembled frame: shape, a sample of rows,
# dtypes / non-null counts, summary statistics and per-column missing counts.
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nBasic Statistics:")
print(df.describe())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Exploratory figure on a 2x3 grid: price histogram, correlation heatmap and
# three feature-vs-price scatter plots (the sixth slot is left empty).
plt.figure(figsize=(15, 10))

# Slot 1: how the target is distributed
plt.subplot(2, 3, 1)
plt.hist(df['Price'], bins=30, edgecolor='black')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.title('Distribution of House Prices')

# Slot 2: pairwise correlations between all columns
plt.subplot(2, 3, 2)
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')

# Slots 3-5: one scatter panel per key feature, all against Price.
# Each entry is (grid slot, column, x-axis label, panel title).
scatter_panels = [
    (3, 'Income', 'Median Income', 'Income vs Price'),
    (4, 'HouseAge', 'House Age', 'House Age vs Price'),
    (5, 'AvgRooms', 'Average Rooms', 'Average Rooms vs Price'),
]
for slot, column, x_label, panel_title in scatter_panels:
    plt.subplot(2, 3, slot)
    plt.scatter(df[column], df['Price'], alpha=0.6)
    plt.xlabel(x_label)
    plt.ylabel('Price')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# Separate the target column from the predictors.
y = df['Price']
X = df.drop(columns='Price')

print("\nFeatures shape:", X.shape)
print("Target shape:", y.shape)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Standardise the features. The scaler learns its statistics from the
# training split only and then applies them unchanged to the test split.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

# Fit an ordinary least-squares regression on the scaled training data.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both splits, used by the evaluation that follows.
y_train_pred, y_test_pred = (
    model.predict(scaled_split) for scaled_split in (X_train_scaled, X_test_scaled)
)

# Score the model on each split: squared error (plus its root), R² and
# absolute error.

# Training split
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)

# Held-out test split
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Report, training value first and test value second for every metric.
separator = "=" * 50
print(f"\n{separator}")
print("MODEL EVALUATION RESULTS")
print(separator)
print(f"Training MSE: {train_mse:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Training RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")
print(f"Training MAE: {train_mae:.4f}")
print(f"Test MAE: {test_mae:.4f}")

# Rank features by the magnitude of their fitted coefficients. The model was
# trained on standardised inputs, so the coefficients are on a common scale.
coefficients = model.coef_
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients),
})
# Largest absolute coefficient first
feature_importance = feature_importance.sort_values(by='Abs_Coefficient', ascending=False)

print("\nFeature Importance (Coefficients):")
print(feature_importance)

# Diagnostic figure, three panels side by side: predicted vs actual prices,
# residuals, and coefficient magnitudes.
plt.figure(figsize=(15, 5))

# Panel 1: test predictions against the truth; the dashed red diagonal marks
# where a perfect prediction would fall.
plt.subplot(1, 3, 1)
plt.scatter(y_test, y_test_pred, alpha=0.6)
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.title('Actual vs Predicted Prices')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')

# Panel 2: residuals (actual minus predicted) against the predicted value,
# with a dashed zero line for reference.
residuals = y_test - y_test_pred
plt.subplot(1, 3, 2)
plt.scatter(y_test_pred, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals Plot')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')

# Panel 3: horizontal bars of the absolute coefficient per feature
plt.subplot(1, 3, 3)
plt.barh(feature_importance['Feature'], feature_importance['Abs_Coefficient'])
plt.title('Feature Importance')
plt.xlabel('Absolute Coefficient Value')

plt.tight_layout()
plt.show()

# Persist everything needed at inference time in a single pickle: the fitted
# model, the fitted scaler and the feature order the model was trained on.
model_bundle = {
    'model': model,
    'scaler': scaler,
    'feature_names': list(X.columns),
}
with open('mumbai_house_price_model.pkl', 'wb') as f:
    pickle.dump(model_bundle, f)

banner = "=" * 50
print(f"\n{banner}")
print("MODEL SAVED SUCCESSFULLY!")
print(banner)
print("Files created:")
print("- mumbai_house_price_model.pkl (contains model, scaler, and feature names)")
print("\nModel is ready for deployment in Flask app!")

# Round-trip check: read back the pickle written by this notebook.
# (Only unpickle files from a trusted source.)
print("\nTesting model loading...")
with open('mumbai_house_price_model.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# Pull the three stored pieces out of the bundle.
loaded_model, loaded_scaler, loaded_features = (
    loaded_data[key] for key in ('model', 'scaler', 'feature_names')
)

print("Model loaded successfully!")
print(f"Feature names: {loaded_features}")

# Smoke-test the reloaded artifacts on the first test row. The slice keeps
# the row as a one-row DataFrame, which is what the scaler is given.
# NOTE(review): the data is the California set used as a stand-in, so the
# ₹ / lakh labels below are nominal - confirm units before presenting them.
sample_input = X_test.iloc[:1]
sample_scaled = loaded_scaler.transform(sample_input)
sample_prediction = loaded_model.predict(sample_scaled)

predicted_price = sample_prediction[0]
actual_price = y_test.iloc[0]
prediction_error = abs(predicted_price - actual_price)

print("\nSample Prediction Test:")
print(f"Input features: {sample_input.values[0]}")
print(f"Predicted price: ₹{predicted_price:.2f} lakh")
print(f"Actual price: ₹{actual_price:.2f} lakh")
print(f"Prediction error: ₹{prediction_error:.2f} lakh")
