# ML WORKSHOP DAY 2: MODEL TRAINING AND ANALYSIS
### **Objectives :**
- Understand train test split
- Understand various regression models : Linear, Decision Tree, Random Forest, KNN, SVM, Naive Bayes
- Evaluate and compare the performance of the models using various evaluation metrics

In [None]:
#Importing the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
#Importing the required models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
#To import the pre processed dataset

# google.colab is only importable inside a Colab runtime. On any other kernel
# the upload step is skipped so the notebook still runs top to bottom; the
# dataset file must then already be in the working directory.
try:
    from google.colab import files
except ImportError:
    # Keep `uploaded` defined so code that inspects it does not hit a NameError.
    uploaded = {}
    print("google.colab not available - skipping upload; "
          "expecting 'preprocessed_data.csv' in the working directory.")
else:
    uploaded = files.upload()

In [None]:
# Read the preprocessed CSV into a pandas DataFrame, then display its
# (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')
df.shape

Now in our dataset, `price` is the target, i.e. the value we are predicting.  
So we will separate the `price` column from the other columns.

In [None]:
# Preview the first five rows of the loaded dataset
df.head(n=5)

In [None]:
# 'size' is a text column whose numerical information has already been
# extracted into other features, so it is removed here. The result is a new
# DataFrame; `df` itself is left untouched.
df1 = df.drop(columns=['size'])

In [None]:
# Target vector: the column we want to predict
y = df1['price']

# Feature matrix: every remaining column, in its original order
feature_cols = [name for name in df1.columns if name != 'price']
X = df1.loc[:, feature_cols]


In [None]:
# Report the dimensions of the feature matrix and of the target vector
print(
    f"‚úì Feature matrix shape: {X.shape}",
    f"‚úì Target (price) shape: {y.shape}",
    sep="\n",
)

Performing a train-test split - to split our dataset into training and testing values.  
Training values - the values that our model will learn from  
Testing values - held-out values the model has not seen, which it will predict so we can check how well it learned

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Collects the metrics of every regression model, keyed by model name.
# Re-running this cell clears any previously stored results.
regression_results = dict()

In [None]:
# 1. LINEAR REGRESSION
print("\n1. Linear Regression", "-"*40, sep="\n")

# Fit on the training split, then predict the held-out test split
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Error metrics on the test split (RMSE is the square root of MSE)
mse_lr, mae_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse_lr = np.sqrt(mse_lr)

# Store the scores under the model's name for the comparison table
regression_results['Linear Regression'] = dict(
    zip(('MSE', 'RMSE', 'MAE', 'R¬≤ Score'), (mse_lr, rmse_lr, mae_lr, r2_lr))
)

print(
    f"  Mean Squared Error: {mse_lr:.4f}",
    f"  Root Mean Squared Error: {rmse_lr:.4f}",
    f"  Mean Absolute Error: {mae_lr:.4f}",
    f"  R¬≤ Score: {r2_lr:.4f}",
    sep="\n",
)


In [None]:
# 2. DECISION TREE REGRESSOR
print("\n2. Decision Tree Regressor", "-"*40, sep="\n")

# Depth is capped at 10 and the seed is fixed so the tree is reproducible
dtr = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)
y_pred_dtr = dtr.predict(X_test)

# Error metrics on the test split (RMSE is the square root of MSE)
mse_dtr, mae_dtr, r2_dtr = (
    metric(y_test, y_pred_dtr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse_dtr = np.sqrt(mse_dtr)

# Store the scores under the model's name for the comparison table
regression_results['Decision Tree'] = dict(
    zip(('MSE', 'RMSE', 'MAE', 'R¬≤ Score'), (mse_dtr, rmse_dtr, mae_dtr, r2_dtr))
)

print(
    f"  Mean Squared Error: {mse_dtr:.4f}",
    f"  Root Mean Squared Error: {rmse_dtr:.4f}",
    f"  Mean Absolute Error: {mae_dtr:.4f}",
    f"  R¬≤ Score: {r2_dtr:.4f}",
    sep="\n",
)


In [None]:
# 3. RANDOM FOREST REGRESSOR
print("\n3. Random Forest Regressor", "-"*40, sep="\n")

# 100 trees, each capped at depth 10; fixed seed for reproducibility
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)

# Error metrics on the test split (RMSE is the square root of MSE)
mse_rfr, mae_rfr, r2_rfr = (
    metric(y_test, y_pred_rfr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse_rfr = np.sqrt(mse_rfr)

# Store the scores under the model's name for the comparison table
regression_results['Random Forest'] = dict(
    zip(('MSE', 'RMSE', 'MAE', 'R¬≤ Score'), (mse_rfr, rmse_rfr, mae_rfr, r2_rfr))
)

print(
    f"  Mean Squared Error: {mse_rfr:.4f}",
    f"  Root Mean Squared Error: {rmse_rfr:.4f}",
    f"  Mean Absolute Error: {mae_rfr:.4f}",
    f"  R¬≤ Score: {r2_rfr:.4f}",
    sep="\n",
)


In [None]:
# 4. SUPPORT VECTOR REGRESSOR (SVR)
print("\n4. Support Vector Regressor (SVR)")
print("-"*40)

# The RBF kernel is computed from distances between samples, so on raw inputs
# the features with the largest numeric range dominate the model. The pipeline
# standardises the features first: the scaler is fitted on the training split
# only (no test-set leakage) and re-applied automatically inside predict().
# `svr` therefore still accepts the unscaled feature matrix.
svr = make_pipeline(StandardScaler(), SVR(kernel='rbf'))
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)

# Error metrics on the held-out test split (RMSE is the square root of MSE)
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# Store the scores under the model's name for the comparison table
regression_results['SVR'] = {
    'MSE': mse_svr, 'RMSE': rmse_svr, 'MAE': mae_svr, 'R¬≤ Score': r2_svr
}

print(f"  Mean Squared Error: {mse_svr:.4f}")
print(f"  Root Mean Squared Error: {rmse_svr:.4f}")
print(f"  Mean Absolute Error: {mae_svr:.4f}")
print(f"  R¬≤ Score: {r2_svr:.4f}")


In [None]:
# 5. K-NEAREST NEIGHBORS REGRESSOR
print("\n5. K-Nearest Neighbors Regressor")
print("-"*40)

# KNN picks neighbours by distance in feature space, so on raw inputs the
# features with the largest numeric range decide who the neighbours are.
# The pipeline standardises the features first: the scaler is fitted on the
# training split only (no test-set leakage) and re-applied automatically
# inside predict(). `knr` therefore still accepts the unscaled feature matrix.
knr = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=5))
knr.fit(X_train, y_train)
y_pred_knr = knr.predict(X_test)

# Error metrics on the held-out test split (RMSE is the square root of MSE)
mse_knr = mean_squared_error(y_test, y_pred_knr)
rmse_knr = np.sqrt(mse_knr)
mae_knr = mean_absolute_error(y_test, y_pred_knr)
r2_knr = r2_score(y_test, y_pred_knr)

# Store the scores under the model's name for the comparison table
regression_results['KNN'] = {
    'MSE': mse_knr, 'RMSE': rmse_knr, 'MAE': mae_knr, 'R¬≤ Score': r2_knr
}

print(f"  Mean Squared Error: {mse_knr:.4f}")
print(f"  Root Mean Squared Error: {rmse_knr:.4f}")
print(f"  Mean Absolute Error: {mae_knr:.4f}")
print(f"  R¬≤ Score: {r2_knr:.4f}")


In [None]:
# ===================================================
# SECTION 6: MODEL COMPARISON
# ===================================================
section_rule = "=" * 80
print(f"\n{section_rule}")
print("SECTION 6: MODEL COMPARISON & RESULTS")
print(section_rule)

print("\nREGRESSION MODELS COMPARISON")
print("-" * 80)

# The results dict is keyed by model name, so the raw frame has one column per
# model; transposing gives one row per model and one column per metric.
reg_df = pd.DataFrame(regression_results).T.round(4)
print(reg_df.to_string())

print("\nüèÜ BEST REGRESSION MODEL:")
# The best model is the row with the highest (rounded) R-squared score
best_reg_model = reg_df['R¬≤ Score'].idxmax()
best_r2 = reg_df.loc[best_reg_model, 'R¬≤ Score']
print(f"  {best_reg_model} with R¬≤ Score of {best_r2:.4f}")


In [None]:
# ===================================================
# SECTION 7: VISUALIZATIONS
# ===================================================
header_rule = "=" * 80
print(f"\n{header_rule}")
print("SECTION 7: GENERATING COMPARISON VISUALIZATIONS")
print(header_rule)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Regression Models Performance Comparison', fontsize=16, fontweight='bold')

# One horizontal bar chart per metric, filled row by row across the 2x2 grid.
# Each entry: (metric column, bar colour, x-axis label, panel title)
panels = [
    ('R¬≤ Score', 'steelblue', 'R¬≤ Score', 'R¬≤ Score Comparison'),
    ('RMSE', 'coral', 'RMSE (lower is better)', 'RMSE Comparison'),
    ('MAE', 'mediumseagreen', 'MAE (lower is better)', 'MAE Comparison'),
    ('MSE', 'mediumpurple', 'MSE (lower is better)', 'MSE Comparison'),
]
for ax, (metric_col, bar_color, x_label, panel_title) in zip(axes.flat, panels):
    ax.barh(reg_df.index, reg_df[metric_col], color=bar_color)
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("‚úì Visualizations generated successfully!")



In [None]:
# ===================================================
# SECTION 8: KEY INSIGHTS & RECOMMENDATIONS
# ===================================================
insights_rule = "=" * 80
print(f"\n{insights_rule}")
print("SECTION 8: KEY INSIGHTS & RECOMMENDATIONS")
print(insights_rule)

# R-squared of the winning model, looked up once and reused below
top_r2 = reg_df.loc[best_reg_model, 'R¬≤ Score']

print("\nüìä REGRESSION ANALYSIS:")
print(f"  ‚Ä¢ Best Model: {best_reg_model}")
print(f"  ‚Ä¢ R¬≤ Score: {top_r2:.4f}")
# R-squared is shown as a percentage of explained variance
print(f"  ‚Ä¢ This model explains {top_r2*100:.2f}% of variance in house prices")

