<a href="https://colab.research.google.com/github/Raghuram1999/IT1703/blob/main/Assignment5_Raghu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the data and splitting into training and testing sets:

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Dataset location and split settings, named in one place so they are easy to
# find and change instead of being buried in the calls below.
# NOTE(review): '/content/...' is presumably the Colab working directory (the
# notebook carries a Colab badge) - adjust DATA_PATH when running elsewhere.
DATA_PATH = '/content/real_estate_value.csv'
TARGET_COLUMN = 'UnitPrice'
TEST_SIZE = 0.2       # fraction of rows held out for the final evaluation
RANDOM_STATE = 42     # fixed seed so the split is reproducible across runs

data = pd.read_csv(DATA_PATH)

# Features are every column except the target; the target is kept as a Series.
X = data.drop(columns=TARGET_COLUMN)
y = data[TARGET_COLUMN]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Examining the data and preprocessing:

In [2]:
# --- Exploratory checks on the full dataset ---------------------------------

# Number of missing entries in each column
print(data.isna().sum())

# Storage type of each column
print(data.dtypes)

# Summary statistics (count, mean, std, min, quartiles, max) per column,
# used here as a first look at spread and possible outliers
print(data.describe())

# Pairwise correlations between all columns, target included
print(data.corr())

# --- Preprocessing ------------------------------------------------------------

# Standardize numerical features: the scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

HouseAge         0
DistanceToMRT    0
NoOfStores       0
Latitude         0
Longitude        0
UnitPrice        0
dtype: int64
HouseAge         float64
DistanceToMRT    float64
NoOfStores         int64
Latitude         float64
Longitude        float64
UnitPrice        float64
dtype: object
         HouseAge  DistanceToMRT  NoOfStores    Latitude   Longitude  \
count  414.000000     414.000000  414.000000  414.000000  414.000000   
mean    17.712560    1083.885689    4.094203   24.969030  121.533361   
std     11.392485    1262.109595    2.945562    0.012410    0.015347   
min      0.000000      23.382840    0.000000   24.932070  121.473530   
25%      9.025000     289.324800    1.000000   24.963000  121.528085   
50%     16.100000     492.231300    4.000000   24.971100  121.538630   
75%     28.150000    1454.279000    6.000000   24.977455  121.543305   
max     43.800000    6488.021000   10.000000   25.014590  121.566270   

        UnitPrice  
count  414.000000  
mean    37.980193  


# Justification for preprocessing steps:
a) Checking for missing values: Ensure data completeness.

b) Checking data types: Confirm all features are numerical.

c) Outlier detection: Identify potential anomalies that might affect model performance.

d) Correlation analysis: Understand relationships between features and the target variable.

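e) Standardization: Scale features to have zero mean and unit variance, which is important for many machine learning algorithms, especially when features are on different scales. Note that the tree-based models tuned below (Decision Tree and Random Forest) split on feature thresholds and are therefore largely insensitive to this kind of scaling; the step is kept so the same preprocessed data can be reused with scale-sensitive models.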



# Fine-tuning Decision Tree and Random Forest models:

In [3]:
# Cross-validation settings shared by both searches, so the two models are
# tuned and compared under identical conditions.
CV_FOLDS = 5
CV_SCORING = 'neg_mean_squared_error'
# Run the candidate fits in parallel on all available cores. Every estimator
# below is seeded with random_state, so parallelism only changes wall-clock
# time, not the scores or the selected hyper-parameters.
CV_N_JOBS = -1

# Decision Tree: 5 x 3 x 3 = 45 candidate settings
dt_params = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt = DecisionTreeRegressor(random_state=42)
dt_grid = GridSearchCV(
    dt, dt_params, cv=CV_FOLDS, scoring=CV_SCORING, n_jobs=CV_N_JOBS
)
dt_grid.fit(X_train_scaled, y_train)

# Random Forest: 3 x 4 x 3 x 3 = 108 candidate settings (the expensive search)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(
    rf, rf_params, cv=CV_FOLDS, scoring=CV_SCORING, n_jobs=CV_N_JOBS
)
rf_grid.fit(X_train_scaled, y_train)

# Evaluate models
def evaluate_model(model, X, y, model_name):
    """Print the mean squared error and R2 score of a fitted model.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        X: Feature matrix to predict on.
        y: True target values corresponding to ``X``.
        model_name: Label placed at the start of the printed line.

    Returns:
        None. The metrics are only printed, each rounded to two decimals.
    """
    predictions = model.predict(X)
    mse, r2 = mean_squared_error(y, predictions), r2_score(y, predictions)
    print(f"{model_name} - MSE: {mse:.2f}, R2: {r2:.2f}")

# For each tuned model, report the winning hyper-parameters followed by its
# metrics on the held-out test split (Decision Tree first, then Random Forest).
tuned_searches = (
    ("Decision Tree - Best parameters:", "Decision Tree", dt_grid),
    ("Random Forest - Best parameters:", "Random Forest", rf_grid),
)
for params_heading, model_label, search in tuned_searches:
    print(params_heading, search.best_params_)
    evaluate_model(search.best_estimator_, X_test_scaled, y_test, model_label)

Decision Tree - Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Decision Tree - MSE: 45.66, R2: 0.73
Random Forest - Best parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Random Forest - MSE: 32.73, R2: 0.80
