### Assignment-4

**Objective:**

Understand and implement model evaluation using cross-validation and improve model performance by hyperparameter tuning.

Step 1: Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
url = "https://raw.githubusercontent.com/springboardmentor943x/ImpactSense-Intern-project/refs/heads/main/Milestone_2/Week_4/Day_18/preprocessed_earthquake_data.csv"
df = pd.read_csv(url)
print(df.columns)

Index(['Latitude', 'Longitude', 'Type', 'Depth', 'Magnitude', 'Magnitude Type',
       'Root Mean Square', 'Source', 'Status', 'Year', 'Day', 'Month_sin',
       'Month_cos', 'Hour_sin', 'Hour_cos', 'Type_Explosion',
       'Type_Nuclear Explosion', 'Type_Rock Burst', 'Magnitude Type_MD',
       'Magnitude Type_MH', 'Magnitude Type_ML', 'Magnitude Type_MS',
       'Magnitude Type_MW', 'Magnitude Type_MWB', 'Magnitude Type_MWC',
       'Magnitude Type_MWR', 'Magnitude Type_MWW', 'Source_ATLAS', 'Source_CI',
       'Source_GCMT', 'Source_ISCGEM', 'Source_ISCGEMSUP', 'Source_NC',
       'Source_NN', 'Source_OFFICIAL', 'Source_PR', 'Source_SE', 'Source_US',
       'Source_UW', 'Status_Reviewed'],
      dtype='object')


Step 2: Load Dataset and Prepare Features and Target

In [2]:
target_column = 'Magnitude'
X = df.drop(columns=[target_column])
y = df[target_column]

print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (23409, 39)
Target shape: (23409,)


Step 3: Implement Cross-Validation

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

df['Magnitude'] = pd.to_numeric(df['Magnitude'], errors='coerce')
df = df.dropna(subset=['Magnitude'])

df_enc = df.copy()
for col in df_enc.columns:
    if df_enc[col].dtype == 'object':
        le = LabelEncoder()
        df_enc[col] = le.fit_transform(df_enc[col].astype(str))

X = df_enc.drop(columns=['Magnitude'])
y = df_enc['Magnitude']

model = RandomForestRegressor(n_estimators=50, random_state=42)

scores = cross_val_score(model, X, y, cv=5, scoring='r2')

print("Cross-validation R^2 scores:", scores)
print("Average R^2 score:", scores.mean())


Cross-validation R^2 scores: [-0.11583774  0.08049321  0.06353796  0.01954318 -1.0587014 ]
Average R^2 score: -0.20219295556246789


Step 4: Hyperparameter Tuning with GridSearchCV

In [4]:
from sklearn.model_selection import GridSearchCV

# Define model
model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

# GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           cv=3, scoring='r2', n_jobs=-1)

grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)
print("Best R² Score from GridSearch:", grid_search.best_score_)


Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Best R² Score from GridSearch: 0.0010518643067475297


Step 5: Evaluate Best Model on Full Dataset

In [5]:
best_model = grid_search.best_estimator_

best_model.fit(X, y)

y_pred = best_model.predict(X)

from sklearn.metrics import r2_score
print("R² Score on Full Dataset:", r2_score(y, y_pred))


R² Score on Full Dataset: 0.2435913008333631
