In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

In [None]:
# Load the preprocessed table and split it into a feature frame and a label.
data = pd.read_csv('preprocessed_earthquake_data.csv')

target = 'Status_Reviewed'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# The label is taken straight from the frame; the features are everything
# except the label and the columns listed in `categorical_cols`.
# NOTE(review): 'Status' is removed here, but any *other* column derived from
# it would still leak the target into X -- confirm none remain, since the
# scores reported further down are close to 1.0.
y = data[target]
X = data.drop(columns=[target, *categorical_cols])

In [None]:
# Baseline classifiers to compare, keyed by the label printed in the reports.
# Both are seeded with random_state=42 so reruns are repeatable.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    # max_iter=500 mirrors the LogisticRegression used in the grid-search
    # cell, so the baseline and the tuned model share one iteration budget
    # instead of the baseline silently using the library default.
    'LogisticRegression': LogisticRegression(max_iter=500, random_state=42),
}

In [None]:
# 10-fold cross-validated accuracy for every entry in `models`.
# The header is printed before the (potentially slow) fit so progress is
# visible while the folds are running.
for name, model in models.items():
    print(f"\nCross-validation for {name}:")
    scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=10)
    # mean / var are taken over the per-fold scores (population variance).
    print(
        f"Accuracy Scores for each fold: {scores}",
        f"Mean Accuracy: {scores.mean():.4f}",
        f"Accuracy Variance: {scores.var():.6f}",
        sep="\n",
    )


Cross-validation for RandomForest:
Accuracy Scores for each fold: [0.9987185  1.         1.         1.         1.         1.
 1.         1.         1.         0.97350427]
Mean Accuracy: 0.9972
Accuracy Variance: 0.000063

Cross-validation for LogisticRegression:
Accuracy Scores for each fold: [0.99700982 1.         1.         1.         1.         1.
 1.         1.         1.         0.97521368]
Mean Accuracy: 0.9972
Accuracy Variance: 0.000055


In [None]:
# Exhaustive hyper-parameter search for the two classifiers.
# GridSearchCV is already imported in the notebook's import cell, so it is
# not re-imported here.

# Candidate settings for the random forest.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
# Candidate settings for logistic regression; 'liblinear' is the only solver
# listed and is paired with both the 'l1' and 'l2' penalties.
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# One 5-fold, accuracy-scored search per model, keyed by display name.
grid_searches = {
    'RandomForest': GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid_rf, cv=5, scoring='accuracy',
    ),
    'LogisticRegression': GridSearchCV(
        LogisticRegression(max_iter=500, random_state=42),
        param_grid_lr, cv=5, scoring='accuracy',
    ),
}

# Fit each search on the full X / y and report the winning configuration
# together with its mean cross-validated accuracy.
for name, gs in grid_searches.items():
    gs.fit(X, y)
    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Best cross-validation accuracy for {name}: {gs.best_score_:.4f}")


Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy for RandomForest: 0.9974

Best parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy for LogisticRegression: 0.9976


In [None]:
# Grid search over two regressors, scored with R^2 under 5-fold CV.
# NOTE(review): `param_grid_lr` and `grid_searches` are rebound here, which
# replaces the classifier objects created under the same names earlier.
# NOTE(review): y is the 'Status_Reviewed' column, which looks like a binary
# indicator -- confirm that regression with R^2 is really what is intended.
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)
param_grid_lr = dict(fit_intercept=[True, False])

# (display name, unfitted estimator, parameter grid) for each candidate.
regression_candidates = [
    ('XGBRegressor', XGBRegressor(random_state=42, verbosity=0), param_grid_xgb),
    ('LinearRegression', LinearRegression(), param_grid_lr),
]
grid_searches = {
    label: GridSearchCV(estimator, grid, cv=5, scoring='r2')
    for label, estimator, grid in regression_candidates
}

# Fit every search on the full X / y and print its best setting and score.
for name, gs in grid_searches.items():
    gs.fit(X, y)
    print(
        f"\nBest parameters for {name}: {gs.best_params_}",
        f"Best cross-validation R² for {name}: {gs.best_score_:.4f}",
        sep="\n",
    )


Best parameters for XGBRegressor: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Best cross-validation R² for XGBRegressor: 0.9618

Best parameters for LinearRegression: {'fit_intercept': True}
Best cross-validation R² for LinearRegression: -49.0677
