# House Price Prediction - Model Analysis

This notebook contains the exploratory data analysis (EDA) and model comparison for the Tirupati House Price Prediction project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Set style for plots.
# The Matplotlib style name 'seaborn' was deprecated in Matplotlib 3.6 and later
# removed, so plt.style.use('seaborn') fails on current installs. Applying the
# theme through seaborn itself works regardless of the Matplotlib version.
sns.set_theme(style='darkgrid')
sns.set_palette('husl')

## 1. Data Loading and Exploration

In [None]:
# Read the house listings CSV into the notebook's working DataFrame
df = pd.read_csv('data/tirupati_houses.csv')

# Schema overview (df.info() prints its own report) under a dashed rule
divider = "-" * 50
print("Dataset Info:", divider, sep="\n")
df.info()

# Preview the first rows; as the cell's last expression it is rendered as output
print("\nSample Data:", divider, sep="\n")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Histogram of the target column, 30 bins
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], bins=30, ax=ax)
ax.set_title('Distribution of House Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
plt.show()

# Scatter of price against area (axis labels come from the column names)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='area', y='price', ax=ax)
ax.set_title('Price vs Area')
plt.show()

# Mean price for each property type, drawn as a bar chart
mean_price_by_type = df.groupby('property_type')['price'].mean()
fig, ax = plt.subplots(figsize=(12, 6))
mean_price_by_type.plot(kind='bar', ax=ax)
ax.set_title('Average Price by Property Type')
# Tilt the category labels so longer names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## 3. Feature Engineering

In [None]:
# Calculate price per square foot.
# NOTE: this column is derived from the target ('price'), so it is suitable for
# exploration but must not be fed to a model that predicts 'price'.
df['price_per_sqft'] = df['price'] / df['area']

# Create feature for total rooms (sum of the four room-count columns)
df['total_rooms'] = df['bedRoom'] + df['bathroom'] + df['balcony'] + df['additionalRoom']

# Display correlation matrix.
# Correlation is only defined for numeric data. pandas >= 2.0 raises if
# DataFrame.corr() meets a non-numeric column ('property_type' is presumably
# text - confirm against the CSV), so restrict the frame to numeric columns.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

## 4. Model Training and Comparison

In [None]:
# Prepare features.
# 'price_per_sqft' is deliberately NOT used as a feature: it is computed as
# price / area, so together with 'area' it lets any model reconstruct the
# target exactly (target leakage), which would make every score below
# meaningless.
feature_columns = ['area', 'bedRoom', 'bathroom', 'balcony', 'additionalRoom', 'total_rooms']
X = df[feature_columns]
y = df['price']

# Split data: hold out 20% of the rows for evaluation; the fixed seed keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: fit the scaler on the training split only, then apply the
# same transform to the test split so no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors, keyed by the label shown in the results table.
# Stochastic models share one fixed seed for reproducibility.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
}

# Fit every candidate on the scaled training split and score it on the
# held-out split; one record per model is collected for the summary table
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # RMSE is the square root of the mean squared error; R² comes from sklearn
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results.append({'Model': name, 'RMSE': rmse, 'R2 Score': r2})

# Tabulate the metrics and print them under a dashed rule
results_df = pd.DataFrame(results)
print("Model Performance Comparison:", "-" * 50, results_df, sep="\n")

# Bar chart of R² per model for a quick visual comparison
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=results_df, x='Model', y='R2 Score', ax=ax)
ax.set_title('Model R² Score Comparison')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()