### Assignment-4

**Objective:**

Understand and implement model evaluation using cross-validation and improve model performance by hyperparameter tuning.

Step 1: Import Libraries and Load Data

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

url = "https://raw.githubusercontent.com/springboardmentor943x/ImpactSense-Intern-project/refs/heads/main/Milestone_2/Week_4/Day_18/preprocessed_earthquake_data.csv"
data = pd.read_csv(url)

print(data.head())
print(data.info())


   Latitude  Longitude        Type     Depth  Magnitude Magnitude Type  \
0  0.583377   0.844368  Earthquake  0.495984   0.277668             MW   
1  0.006109   0.698849  Earthquake  0.075272  -0.195082             MW   
2 -0.739162  -1.701962  Earthquake -0.413928   0.750418             MW   
3 -2.017599  -0.503524  Earthquake -0.454694  -0.195082             MW   
4  0.340688   0.691479  Earthquake -0.454694  -0.195082             MW   

   Root Mean Square  Source     Status      Year  ...  Source_ISCGEM  \
0         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
1         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
2         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
3         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   
4         -0.103839  ISCGEM  Automatic -1.915523  ...            1.0   

   Source_ISCGEMSUP  Source_NC  Source_NN  Source_OFFICIAL  Source_PR  \
0               0.0        0.0        0.0        

Step 2: Load Dataset and Prepare Features and Target

In [None]:
import pandas as pd


url = "https://raw.githubusercontent.com/springboardmentor943x/ImpactSense-Intern-project/refs/heads/main/Milestone_2/Week_4/Day_18/preprocessed_earthquake_data.csv"
data = pd.read_csv(url)


print(data.columns.tolist())


print(data.head(5))


print(data.info())


['Latitude', 'Longitude', 'Type', 'Depth', 'Magnitude', 'Magnitude Type', 'Root Mean Square', 'Source', 'Status', 'Year', 'Day', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'Type_Explosion', 'Type_Nuclear Explosion', 'Type_Rock Burst', 'Magnitude Type_MD', 'Magnitude Type_MH', 'Magnitude Type_ML', 'Magnitude Type_MS', 'Magnitude Type_MW', 'Magnitude Type_MWB', 'Magnitude Type_MWC', 'Magnitude Type_MWR', 'Magnitude Type_MWW', 'Source_ATLAS', 'Source_CI', 'Source_GCMT', 'Source_ISCGEM', 'Source_ISCGEMSUP', 'Source_NC', 'Source_NN', 'Source_OFFICIAL', 'Source_PR', 'Source_SE', 'Source_US', 'Source_UW', 'Status_Reviewed']
   Latitude  Longitude        Type     Depth  Magnitude Magnitude Type  \
0  0.583377   0.844368  Earthquake  0.495984   0.277668             MW   
1  0.006109   0.698849  Earthquake  0.075272  -0.195082             MW   
2 -0.739162  -1.701962  Earthquake -0.413928   0.750418             MW   
3 -2.017599  -0.503524  Earthquake -0.454694  -0.195082             MW  

Step 3: Implement Cross-Validation

In [None]:

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix)


url = "https://raw.githubusercontent.com/springboardmentor943x/ImpactSense-Intern-project/refs/heads/main/Milestone_2/Week_4/Day_18/preprocessed_earthquake_data.csv"
data = pd.read_csv(url)


possible_targets = ['target', 'label', 'class', 'quake_class', 'magnitude_class', 'alert_level']
cols = data.columns.tolist()

target_col = None
for t in possible_targets:
    if t in cols:
        target_col = t
        break
if target_col is None:
    
    target_col = cols[-1]

print("Using target column:", target_col)
print("All columns:", cols)


X = data.drop(columns=[target_col])
y = data[target_col]


if y.dtype != 'object' and len(np.unique(y)) < 20:
    
    pass
else:
    
    if y.dtype == 'object' or y.dtype.name == 'category':
        y = y.astype('category').cat.codes

print("X shape:", X.shape, "y shape:", y.shape)
print("Number of classes:", len(np.unique(y)), "Class distribution:\n", pd.Series(y).value_counts())


numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)


numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
], remainder='drop')  

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
])


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']


cv_results = cross_validate(clf, X, y, cv=cv, scoring=scoring, return_train_score=False, n_jobs=-1)


for metric in scoring:
    key = f'test_{metric}'
    mean = np.mean(cv_results[key])
    std = np.std(cv_results[key])
    print(f"{metric}: {mean:.4f} ± {std:.4f}")


y_pred_cv = cross_val_predict(clf, X, y, cv=cv, n_jobs=-1)
print("\nOverall classification report (cross-validated predictions):\n")
print(classification_report(y, y_pred_cv, digits=4))


cm = confusion_matrix(y, y_pred_cv)
print("Confusion matrix:\n", cm)


Using target column: Status_Reviewed
All columns: ['Latitude', 'Longitude', 'Type', 'Depth', 'Magnitude', 'Magnitude Type', 'Root Mean Square', 'Source', 'Status', 'Year', 'Day', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'Type_Explosion', 'Type_Nuclear Explosion', 'Type_Rock Burst', 'Magnitude Type_MD', 'Magnitude Type_MH', 'Magnitude Type_ML', 'Magnitude Type_MS', 'Magnitude Type_MW', 'Magnitude Type_MWB', 'Magnitude Type_MWC', 'Magnitude Type_MWR', 'Magnitude Type_MWW', 'Source_ATLAS', 'Source_CI', 'Source_GCMT', 'Source_ISCGEM', 'Source_ISCGEMSUP', 'Source_NC', 'Source_NN', 'Source_OFFICIAL', 'Source_PR', 'Source_SE', 'Source_US', 'Source_UW', 'Status_Reviewed']
X shape: (23409, 39) y shape: (23409,)
Number of classes: 2 Class distribution:
 Status_Reviewed
1.0    20770
0.0     2639
Name: count, dtype: int64
Numeric columns: ['Latitude', 'Longitude', 'Depth', 'Magnitude', 'Root Mean Square', 'Year', 'Day', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'Type_Explosion', '

Step 4: Hyperparameter Tuning with GridSearchCV

In [None]:


import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import time


url = "https://raw.githubusercontent.com/springboardmentor943x/ImpactSense-Intern-project/refs/heads/main/Milestone_2/Week_4/Day_18/preprocessed_earthquake_data.csv"
data = pd.read_csv(url)


possible_targets = ['target', 'label', 'class', 'quake_class', 'magnitude_class', 'alert_level']
cols = data.columns.tolist()
target_col = next((t for t in possible_targets if t in cols), None)
if target_col is None:
    target_col = cols[-1]  
print("Using target column:", target_col)

X = data.drop(columns=[target_col])
y = data[target_col]


if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
], remainder='drop')


pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


param_grid = {
    'classifier__n_estimators': [100, 200, 400],          
    'classifier__max_depth': [None, 10, 20, 40],          
    'classifier__min_samples_split': [2, 5, 10],          
    'classifier__min_samples_leaf': [1, 2, 4],           
    'classifier__max_features': ['sqrt', 'log2', 0.5]     
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  
    n_jobs=-1,
    verbose=2,
    refit=True,
    return_train_score=True
)


start = time.time()
grid_search.fit(X_train, y_train)
end = time.time()
print(f"GridSearchCV done in {(end-start)/60:.2f} minutes")

print("Best parameters found:")
print(grid_search.best_params_)
print(f"Best cross-val score (accuracy): {grid_search.best_score_:.4f}")


best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

print("\nTest set results")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification report:\n", classification_report(y_test, y_pred_test, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_test))


joblib.dump(best_model, 'best_rf_pipeline.joblib')
print("Best model saved to 'best_rf_pipeline.joblib'")


"""
from scipy.stats import randint
param_dist = {
    'classifier__n_estimators': randint(100, 500),
    'classifier__max_depth': [None] + list(range(5, 51, 5)),
    'classifier__min_samples_split': randint(2, 11),
    'classifier__min_samples_leaf': randint(1, 5),
    'classifier__max_features': ['sqrt', 'log2', 0.2, 0.5, None]
}
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=40,            # number of parameter settings sampled
    cv=cv,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=2,
    refit=True
)
random_search.fit(X_train, y_train)
print("RandomizedSearch best params:", random_search.best_params_)
"""


Using target column: Status_Reviewed
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
GridSearchCV done in 9.75 minutes
Best parameters found:
{'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best cross-val score (accuracy): 1.0000

Test set results
Accuracy: 1.0

Classification report:
               precision    recall  f1-score   support

         0.0     1.0000    1.0000    1.0000       528
         1.0     1.0000    1.0000    1.0000      4154

    accuracy                         1.0000      4682
   macro avg     1.0000    1.0000    1.0000      4682
weighted avg     1.0000    1.0000    1.0000      4682

Confusion matrix:
 [[ 528    0]
 [   0 4154]]
Best model saved to 'best_rf_pipeline.joblib'


'\nfrom scipy.stats import randint\nparam_dist = {\n    \'classifier__n_estimators\': randint(100, 500),\n    \'classifier__max_depth\': [None] + list(range(5, 51, 5)),\n    \'classifier__min_samples_split\': randint(2, 11),\n    \'classifier__min_samples_leaf\': randint(1, 5),\n    \'classifier__max_features\': [\'sqrt\', \'log2\', 0.2, 0.5, None]\n}\nrandom_search = RandomizedSearchCV(\n    estimator=pipe,\n    param_distributions=param_dist,\n    n_iter=40,            # number of parameter settings sampled\n    cv=cv,\n    scoring=\'accuracy\',\n    random_state=42,\n    n_jobs=-1,\n    verbose=2,\n    refit=True\n)\nrandom_search.fit(X_train, y_train)\nprint("RandomizedSearch best params:", random_search.best_params_)\n'

Step 5: Evaluate Best Model on Full Dataset

In [None]:


import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_auc_score, roc_curve)
import joblib
import os
import time


url = "https://raw.githubusercontent.com/springboardmentor943x/ImpactSense-Intern-project/refs/heads/main/Milestone_2/Week_4/Day_18/preprocessed_earthquake_data.csv"
data = pd.read_csv(url)


possible_targets = ['target', 'label', 'class', 'quake_class', 'magnitude_class', 'alert_level']
cols = data.columns.tolist()
target_col = next((t for t in possible_targets if t in cols), None)
if target_col is None:
    target_col = cols[-1]  
print("Using target column:", target_col)


X = data.drop(columns=[target_col])
y_raw = data[target_col]


label_mapping = None
if y_raw.dtype == 'object' or y_raw.dtype.name == 'category':
    y = y_raw.astype('category').cat.codes
    label_mapping = dict(enumerate(y_raw.astype('category').cat.categories))
else:
    y = y_raw.copy()

print("Feature matrix shape:", X.shape, "Target shape:", y.shape)
if label_mapping is not None:
    print("Label mapping (code -> original):", label_mapping)


best_model = None
saved_model_path = 'best_rf_pipeline.joblib'

if os.path.exists(saved_model_path):
    print(f"Loading saved model from '{saved_model_path}'...")
    best_model = joblib.load(saved_model_path)
else:
    
    try:
        
        best_model = grid_search.best_estimator_
        print("Using grid_search.best_estimator_ from the current session.")
    except Exception:
        raise RuntimeError(
            "No saved model found and 'grid_search' not available in memory. "
            "Run GridSearchCV (Step 4) or place 'best_rf_pipeline.joblib' in the working directory."
        )


print("Refitting the best model on the full dataset...")
start = time.time()
best_model.fit(X, y)
end = time.time()
print(f"Refit completed in {(end-start):.2f}s")


final_model_path = 'final_best_model_full_dataset.joblib'
joblib.dump(best_model, final_model_path)
print(f"Final model saved to '{final_model_path}'")


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("\nGenerating cross-validated predictions for the full dataset (this gives honest estimates)...")
y_pred_cv = cross_val_predict(best_model, X, y, cv=cv, n_jobs=-1, method='predict')


y_proba_cv = None
try:
    y_proba_cv = cross_val_predict(best_model, X, y, cv=cv, n_jobs=-1, method='predict_proba')
except Exception:
    
    y_proba_cv = None


accuracy = accuracy_score(y, y_pred_cv)
precision_macro = precision_score(y, y_pred_cv, average='macro', zero_division=0)
recall_macro = recall_score(y, y_pred_cv, average='macro', zero_division=0)
f1_macro = f1_score(y, y_pred_cv, average='macro', zero_division=0)

precision_weighted = precision_score(y, y_pred_cv, average='weighted', zero_division=0)
recall_weighted = recall_score(y, y_pred_cv, average='weighted', zero_division=0)
f1_weighted = f1_score(y, y_pred_cv, average='weighted', zero_division=0)

print("\n=== Cross-Validated Metrics (full dataset) ===")
print(f"Accuracy:        {accuracy:.4f}")
print(f"Precision (macro): {precision_macro:.4f}")
print(f"Recall (macro):    {recall_macro:.4f}")
print(f"F1 (macro):        {f1_macro:.4f}")
print(f"Precision (weighted): {precision_weighted:.4f}")
print(f"Recall (weighted):    {recall_weighted:.4f}")
print(f"F1 (weighted):        {f1_weighted:.4f}")


print("\nClassification Report (cross-validated predictions):\n")
if label_mapping is not None:
    
    target_names = [label_mapping[i] for i in sorted(label_mapping.keys())]
    print(classification_report(y, y_pred_cv, digits=4, target_names=target_names, zero_division=0))
else:
    print(classification_report(y, y_pred_cv, digits=4, zero_division=0))

cm = confusion_matrix(y, y_pred_cv)
print("Confusion matrix (rows=true, cols=pred):\n", cm)


if y_proba_cv is not None:
    
    try:
        if len(np.unique(y)) == 2:
           
            auc = roc_auc_score(y, y_proba_cv[:, 1])
            print(f"\nROC AUC (binary): {auc:.4f}")
        else:
            auc_macro = roc_auc_score(pd.get_dummies(y), y_proba_cv, average='macro', multi_class='ovr')
            print(f"\nROC AUC (multiclass, macro): {auc_macro:.4f}")
    except Exception as e:
        print("Could not compute ROC AUC:", str(e))
else:
    print("\npredict_proba not available for this classifier/pipeline — skipping ROC AUC.")


from collections import Counter
print("\nClass distribution (full dataset):", Counter(y))

print("\nStep 5 complete — final model refit on full data and cross-validated evaluation finished.")


Using target column: Status_Reviewed
Feature matrix shape: (23409, 39) Target shape: (23409,)
Loading saved model from 'best_rf_pipeline.joblib'...
Refitting the best model on the full dataset...
Refit completed in 0.69s
Final model saved to 'final_best_model_full_dataset.joblib'

Generating cross-validated predictions for the full dataset (this gives honest estimates)...

=== Cross-Validated Metrics (full dataset) ===
Accuracy:        1.0000
Precision (macro): 1.0000
Recall (macro):    1.0000
F1 (macro):        1.0000
Precision (weighted): 1.0000
Recall (weighted):    1.0000
F1 (weighted):        1.0000

Classification Report (cross-validated predictions):

              precision    recall  f1-score   support

         0.0     1.0000    1.0000    1.0000      2639
         1.0     1.0000    1.0000    1.0000     20770

    accuracy                         1.0000     23409
   macro avg     1.0000    1.0000    1.0000     23409
weighted avg     1.0000    1.0000    1.0000     23409

Confus