In [10]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Read the input tables: the train/test listings first, then the two
# per-location lookup tables.
train_data, test_data = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
avg_rent_data = pd.read_csv('avg_rent.csv')
dist_data = pd.read_csv('dist_from_city_centre.csv')

# Data Exploration
# The training frame gets a full summary (head + info); the remaining
# frames only show their first rows.
print("Train Data Overview:")
print(train_data.head())
print(train_data.info())

for title, frame in (
    ("\nTest Data Overview:", test_data),
    ("\nAvg Rent Data Overview:", avg_rent_data),
    ("\nDistance Data Overview:", dist_data),
):
    print(title)
    print(frame.head())

# Data Cleaning
# Handle missing values
print("\nMissing Values in Train Data:")
print(train_data.isnull().sum())

# Fill missing numerical columns with their median.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning on recent pandas and no longer updates the parent
# DataFrame once Copy-on-Write is active.
for col in ['bath', 'balcony']:
    train_data[col] = train_data[col].fillna(train_data[col].median())

# Dropping 'society' due to high cardinality and missing values
train_data.drop(columns=['society'], inplace=True)

# Clean 'total_sqft' (convert range values to average)
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Accepted forms:
      * a value that is already numeric (int/float),
      * a plain number as text, e.g. ``"1200"``,
      * a range ``"low - high"``, which becomes the midpoint of the
        first two parts,
      * a number followed by a space and text containing ``sqft``,
        e.g. ``"1200 sqft"``.

    Returns ``None`` for missing values (``None``/NaN) and for anything
    that cannot be parsed, such as entries in other units.
    """
    if not isinstance(x, str):
        # Numeric cells (e.g. when the CSV column was parsed as numbers)
        # must pass through as floats instead of being treated as
        # unparseable. Booleans are not measurements, so reject them.
        if x is None or isinstance(x, bool):
            return None
        try:
            value = float(x)
        except (TypeError, ValueError, OverflowError):
            return None
        # NaN is the only float that differs from itself.
        return None if value != value else value

    try:
        if '-' in x:
            parts = x.split('-')
            return (float(parts[0]) + float(parts[1])) / 2
        if 'sqft' in x.lower():
            return float(x.split(' ')[0])
        return float(x)
    except ValueError:
        # Only the float() conversions can fail for a str input; catching
        # ValueError alone keeps KeyboardInterrupt/SystemExit propagating.
        return None

# Normalise 'total_sqft' to numbers in both the training and the test frame
for frame in (train_data, test_data):
    frame['total_sqft'] = frame['total_sqft'].apply(convert_sqft_to_num)

# Extract numerical value from 'size'
def extract_bhk(x):
    """Return the leading integer of a ``size`` entry such as ``"2 BHK"``.

    The first whitespace-separated token is parsed as an int, so
    ``"4 Bedroom"`` gives 4. Leading or repeated whitespace is tolerated.

    Returns ``None`` when the value is not a string (e.g. NaN for a
    missing cell), is empty, or does not start with an integer.
    """
    if not isinstance(x, str):
        return None
    tokens = x.split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        # Narrow handler: only a non-integer first token is expected here.
        return None

# Derive the numeric 'BHK' column from 'size' on both frames, then discard
# 'size', which is redundant once the count has been extracted
for frame in (train_data, test_data):
    frame['BHK'] = frame['size'].apply(extract_bhk)
    frame.drop(columns=['size'], inplace=True)

# Merge additional datasets based on 'location'.
# Left joins keep every listing row; a location missing from a lookup
# table leaves NaN in the merged columns.
train_data = train_data.merge(avg_rent_data, on='location', how='left')
train_data = train_data.merge(dist_data, on='location', how='left')

test_data = test_data.merge(avg_rent_data, on='location', how='left')
test_data = test_data.merge(dist_data, on='location', how='left')

# Fill missing values in merged columns with each frame's own median.
# Assign the result back instead of using fillna(inplace=True) on the
# selected column: the chained in-place form warns on recent pandas and
# does not modify the DataFrame under Copy-on-Write.
for col in ['avg_2bhk_rent', 'dist_from_city']:
    train_data[col] = train_data[col].fillna(train_data[col].median())
    test_data[col] = test_data[col].fillna(test_data[col].median())

# One-hot encoding for 'area_type' and 'availability'
categorical_cols = ['area_type', 'availability']
train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)

# Encode the test frame WITHOUT drop_first. drop_first removes whichever
# category sorts first in the frame being encoded; if train and test do not
# share the same first category, test would drop a column that the model was
# trained on, and those rows would silently look like the training baseline.
# Keeping every test dummy and aligning to the training columns below makes
# the training frame the single source of truth for the encoding.
test_data = pd.get_dummies(test_data, columns=categorical_cols)

# Ensure test has exactly the training columns (minus the target), in the
# same order: columns unseen in test are created as 0, and test-only columns
# (including the baseline dummies dropped from train) are discarded.
test_data = test_data.reindex(columns=train_data.columns.drop('price'), fill_value=0)

# Feature Engineering
# Build the SAME ratio features on both frames, from inputs that are known at
# prediction time. No feature may be derived from 'price': such a column
# leaks the target into training and cannot be reproduced for the test set.
#   sqft_per_rent    = total_sqft / avg_2bhk_rent
#   rent_to_distance = avg_2bhk_rent / dist_from_city
for frame in (train_data, test_data):
    sqft_per_rent = frame['total_sqft'] / frame['avg_2bhk_rent']
    rent_to_distance = frame['avg_2bhk_rent'] / frame['dist_from_city']
    # A zero denominator yields +/-inf, which scikit-learn estimators reject.
    # Turn those into NaN so the median imputation applied to the feature
    # matrices later in the script handles them like other missing values.
    frame['sqft_per_rent'] = sqft_per_rent.replace([np.inf, -np.inf], np.nan)
    frame['rent_to_distance'] = rent_to_distance.replace([np.inf, -np.inf], np.nan)

# Feature and Target Separation
# Columns that must not reach the model:
#   - 'price'          the target itself
#   - 'location'       raw text, already used as the merge key
#   - 'ID'             a row identifier, not a property of the listing
#   - 'price_per_sqft' if present: it is computed from 'price', so keeping
#                      it would leak the target into the features
# errors='ignore' lets the list name columns that may be absent.
non_feature_cols = ['price', 'location', 'ID', 'price_per_sqft']
X = train_data.drop(columns=non_feature_cols, errors='ignore')
y = np.log1p(train_data['price'])  # Log-transform the target variable

# Fill missing values in X with the per-column median
X = X.fillna(X.median(numeric_only=True))

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
# Random forest tuned with a randomized search: 20 sampled settings,
# 3-fold CV each, ranked by negative MSE.
rf = RandomForestRegressor(random_state=42)
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the winning estimator from the search
best_rf = random_search.best_estimator_

# Validation
# Predictions and targets are both passed through expm1 first, so the two
# hold-out metrics below are computed on the back-transformed values.
val_predictions = np.expm1(best_rf.predict(X_val))
val_actuals = np.expm1(y_val)
val_rmse = np.sqrt(mean_squared_error(val_actuals, val_predictions))
print("Validation RMSE:", val_rmse)
val_r2 = r2_score(val_actuals, val_predictions)
print("Validation R2 Score:", val_r2)

# Cross-validation
# Scored directly against y with no expm1 step, so this RMSE is in the
# units of y and is not directly comparable to the hold-out RMSE above.
cv_scores = cross_val_score(best_rf, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores.mean())
print("Cross-Validation RMSE:", cv_rmse)

# Try XGBoost
# Fixed (untuned) gradient-boosting model, scored on the same hold-out
# split after reversing the log transform on its predictions.
xgb = XGBRegressor(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
)
xgb.fit(X_train, y_train)
xgb_val_predictions = np.expm1(xgb.predict(X_val))
xgb_val_rmse = np.sqrt(mean_squared_error(val_actuals, xgb_val_predictions))
print("XGBoost Validation RMSE:", xgb_val_rmse)

# Predictions on Test Data
# Capture the listing identifiers before the frame is cut down to the model
# features. The positional index is only a row counter (the merges above
# rebuild it), so it matches the real identifiers only by coincidence.
# NOTE(review): assumes test.csv carries an 'ID' column like train.csv does -
# confirm. If the column is absent or not unique (e.g. a zero-filled
# placeholder), fall back to the row position.
if 'ID' in test_data.columns and test_data['ID'].is_unique:
    test_ids = test_data['ID'].to_numpy()
else:
    test_ids = test_data.index

# Fill missing values in numeric columns only
numeric_cols = test_data.select_dtypes(include=[np.number]).columns
test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].median())

# Ensure feature alignment for test data: same columns, same order as the
# training matrix; any feature missing from test is created as 0.
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

# Make predictions (reverse the log1p transform applied to the target)
test_predictions = np.expm1(best_rf.predict(test_data))

# Save predictions to CSV
output = pd.DataFrame({'Id': test_ids, 'Predicted Price': test_predictions})
output.to_csv('Submission.csv', index=False)

print("Test predictions saved to 'Submission.csv'")


Train Data Overview:
   ID             area_type   availability                  location  \
0   0  Super built-up  Area         19-Dec  Electronic City Phase II   
1   1            Plot  Area  Ready To Move          Chikka Tirupathi   
2   2        Built-up  Area  Ready To Move               Uttarahalli   
3   3  Super built-up  Area  Ready To Move        Lingadheeranahalli   
4   4  Super built-up  Area  Ready To Move                  Kothanur   

        size  society total_sqft  bath  balcony   price  
0      2 BHK  Coomee        1056   2.0      1.0   39.07  
1  4 Bedroom  Theanmp       2600   5.0      3.0  120.00  
2      3 BHK      NaN       1440   2.0      3.0   62.00  
3      3 BHK  Soiewre       1521   3.0      1.0   95.00  
4      2 BHK      NaN       1200   2.0      1.0   51.00  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10656 entries, 0 to 10655
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0 