In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [8]:
data = pd.read_csv('preprocessed_earthquake_data.csv')
data.head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,Root Mean Square,Source,Status,Year,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
0,0.583377,0.844368,Earthquake,0.495984,0.277668,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.006109,0.698849,Earthquake,0.075272,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.739162,-1.701962,Earthquake,-0.413928,0.750418,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.017599,-0.503524,Earthquake,-0.454694,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.340688,0.691479,Earthquake,-0.454694,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Define the target and features
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

X = data.drop(columns=[target] + categorical_cols)
y = data[target]

In [10]:
# Train-test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,Latitude,Longitude,Depth,Root Mean Square,Year,Day,Month_sin,Month_cos,Hour_sin,Hour_cos,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
96,-1.117575,-0.886094,-0.179112,-0.103839,-1.915523,0.72881,1.406827,-0.00635,-0.713894,1.234637,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1616,2.047086,0.848599,-0.495461,-0.103839,-1.568919,-1.231834,0.00141,-1.429558,0.696725,-1.221272,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2584,-1.862581,-0.273151,-0.307934,-0.103839,-1.360956,-0.42451,0.704119,1.226184,0.988874,1.009303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1407,1.686838,1.065842,-0.353593,-0.103839,-1.63824,0.267482,-1.215716,0.705254,1.213048,-0.702278,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2619,-1.059759,-1.730955,-0.177481,-0.103839,-1.360956,0.267482,1.218537,0.705254,-1.419204,0.006682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [11]:
gbm = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

In [14]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
gbm.fit(X_train_imputed, y_train)

In [15]:
y_pred = gbm.predict(X_test)



In [16]:
mse =  mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")


Mean Squared Error: 1.0311253083004073
Mean Absolute Error: 0.7484146515907834


In [17]:
cv_scores = cross_val_score(gbm, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-Validation Scores: {-np.mean(cv_scores):.4f}")

Cross-Validation Scores: nan


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_response.py", line 242, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/ensemble/_gb.py", line 2144, in predict
    X = validate_data(
        ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 2944, in validate_data
    out = check_array(X, i

- **Task:** Understand and apply Gradient Boosting Machine to predict a categorical target variable, evaluate performance, and interpret results and notedown your observations.

In [22]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
gbm_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gbm_classifier.fit(X_train_imputed, y_train_cat)

y_pred_classifier = gbm_classifier.predict(X_test_imputed)

accuracy = accuracy_score(y_test_cat, y_pred_classifier)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_cat, y_pred_classifier))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_cat, y_pred_classifier))

cv_scores_classifier = cross_val_score(gbm_classifier, X_imputed, y_categorical, cv=5, scoring='accuracy')
print(f"\nCross-Validation Accuracy Scores: {cv_scores_classifier}")
print(f"Mean Cross-Validation Accuracy: {np.mean(cv_scores_classifier):.4f}")

Accuracy: 0.7467

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.97      0.86       462
           1       0.15      0.02      0.03       102
           2       0.00      0.00      0.00        31
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00         1

    accuracy                           0.75       604
   macro avg       0.18      0.20      0.18       604
weighted avg       0.61      0.75      0.66       604


Confusion Matrix:
[[449   9   2   2   0]
 [ 99   2   0   1   0]
 [ 29   2   0   0   0]
 [  8   0   0   0   0]
 [  1   0   0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Cross-Validation Accuracy Scores: [0.15066225 0.46523179 0.37645108 0.41127695 0.74295191]
Mean Cross-Validation Accuracy: 0.4293
