### Assignment-4

**Objective:**

Understand and implement model evaluation using cross-validation and improve model performance by hyperparameter tuning.

Step 1: Import Libraries

In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


Step 2: Load Dataset and Prepare Features and Target

In [2]:


# Colab-only: open the browser file picker and upload the CSV into the runtime.
from google.colab import files

uploaded = files.upload()

# The upload result is keyed by file name; read the first uploaded file.
csv_name = list(uploaded)[0]
df = pd.read_csv(csv_name)

# Features are every column except the target ("Type") and three other
# columns that are excluded from modelling.
drop_cols = ["Type", "Magnitude Type", "Source", "Status"]
X = df.drop(columns=drop_cols)

# Target: the event type label.
y = df["Type"]

# Quick sanity summary of what was loaded.
for label, value in (
    ("Shape of Features:", X.shape),
    ("Shape of Target:", y.shape),
    ("Target classes:", y.unique()),
):
    print(label, value)


Saving preprocessed_earthquake_data (1).csv to preprocessed_earthquake_data (1).csv
Shape of Features: (23409, 36)
Shape of Target: (23409,)
Target classes: ['Earthquake' 'Nuclear Explosion' 'Explosion' 'Rock Burst']


Step 3: Implement Cross-Validation

In [3]:

# Baseline: an untuned random forest (fixed seed for reproducibility) scored
# with 5-fold cross-validated accuracy on the full feature matrix and target.
model = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=5)

# Per-fold scores followed by their mean.
print("Cross-validation scores:", cv_scores)
print("Average CV Accuracy:", cv_scores.mean())




Cross-validation scores: [1.         1.         1.         0.99978642 1.        ]
Average CV Accuracy: 0.9999572832123025


Step 4: Hyperparameter Tuning with GridSearchCV

In [4]:

# Candidate hyperparameters: 2 * 3 * 2 * 2 = 24 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search over the grid, scoring each candidate by 5-fold
# cross-validated accuracy; n_jobs=-1 runs the fits on all available cores.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# Winning combination and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Score: 0.9999572832123025


Step 5: Evaluate Best Model on a Held-Out Test Set

In [7]:
# Step 5: Evaluate Best Model on Test Data
#
# Splits the data 80/20 (stratified), trains a fresh random forest with the
# hyperparameters chosen by the grid search, and reports accuracy plus a
# per-class classification report on the held-out 20%.
#
# Relies on names created by earlier cells: X, y, grid_search, and the
# scikit-learn imports from the first cell.

# A stratified split needs at least 2 samples per class, so classes with a
# single sample are dropped first.
class_counts = y.value_counts()
valid_classes = class_counts[class_counts > 1].index

# Build the row mask once so features and target are filtered identically.
keep_mask = y.isin(valid_classes)
X_filtered = X[keep_mask]
y_filtered = y[keep_mask]

print("Remaining classes after dropping rare ones:", y_filtered.unique())

# Train-test split (with stratify since now all classes have at least 2 samples)
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered,
    test_size=0.2,
    random_state=42,
    stratify=y_filtered
)

# Train best model.
# Build a *fresh* estimator from the tuned hyperparameters instead of calling
# .fit() on grid_search.best_estimator_ directly: that attribute is the very
# object the search refit on all of X, y, and refitting it in place would
# silently change what grid_search.best_estimator_ / grid_search.predict use.
# The explicit random_state is placed first so a tuned value would override it.
best_params = {"random_state": 42, **grid_search.best_params_}
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)

# Predictions
y_pred = best_model.predict(X_test)

# Evaluation
# NOTE(review): grid_search was fit on the full X, y, which includes the rows
# now in X_test, so hyperparameter selection has already seen the test data
# and the score below is optimistic. For an unbiased estimate, split first and
# run the grid search on X_train / y_train only.
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Remaining classes after dropping rare ones: ['Earthquake' 'Nuclear Explosion' 'Explosion']
Test Accuracy: 1.0

Classification Report:
                    precision    recall  f1-score   support

       Earthquake       1.00      1.00      1.00      4646
        Explosion       1.00      1.00      1.00         1
Nuclear Explosion       1.00      1.00      1.00        35

         accuracy                           1.00      4682
        macro avg       1.00      1.00      1.00      4682
     weighted avg       1.00      1.00      1.00      4682

