In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression

In [2]:
# Read the csv file into a Pandas DataFrame
file_path='earthquake_southeast_asia.csv'
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,year,month,day,hour,minute,second,latitude,longitude,Depth,Magnitude
0,1900,1,5,19,0,0,-3.0,102.0,0.0,6.9
1,1900,4,24,23,16,0,27.0,126.5,35.0,7.0
2,1901,2,15,0,0,0,26.0,100.1,0.0,6.5
3,1901,4,5,21,53,0,2.0,130.0,0.0,6.8
4,1901,6,24,7,2,0,28.0,130.0,35.0,7.5


In [4]:
df_=df.copy()

In [5]:
df_.drop(['year', 'month', 'day', 'hour', 'minute', 'second'], axis=1, inplace=True)

In [6]:
df_.head()

Unnamed: 0,latitude,longitude,Depth,Magnitude
0,-3.0,102.0,0.0,6.9
1,27.0,126.5,35.0,7.0
2,26.0,100.1,0.0,6.5
3,2.0,130.0,0.0,6.8
4,28.0,130.0,35.0,7.5


In [7]:
len(df_)

208824

In [8]:
#values in column 'Magnitude'
unique_values = df_['Magnitude'].unique()
unique_values

array([6.9, 7. , 6.5, 6.8, 7.5, 6.6, 7.1, 7.6, 7.3, 7.8, 6.7, 7.2, 7.4,
       8.1, 8.2, 7.7, 6.4, 8.3, 6.3, 6.1, 8. , 5.8, 5.5, 6. , 6.2, 5.9,
       5.7, 8.5, 7.9, 8.6, 5.6, 4.9, 5.2, 5.1, 4.5, 4.8, 4. , 5.4, 5. ,
       4.2, 4.6, 4.3, 4.7, 4.4, 5.3, 4.1, 3.8, 3.7, 3.6, 3.3, 3.9, 3.4,
       3.2, 3.5, 3.1, 3. , 9.1, 8.4, 0. ])

In [9]:
median_value = df_['Magnitude'].median()
median_value

4.2

### 1. binary random forest classifier

In [10]:
mask=df_['Magnitude']> 4.2

In [11]:
df_['magnitude_0_1']=mask
df_['magnitude_0_1'] = df_['magnitude_0_1'].astype(int)

In [12]:
df_.head()

Unnamed: 0,latitude,longitude,Depth,Magnitude,magnitude_0_1
0,-3.0,102.0,0.0,6.9,1
1,27.0,126.5,35.0,7.0,1
2,26.0,100.1,0.0,6.5,1
3,2.0,130.0,0.0,6.8,1
4,28.0,130.0,35.0,7.5,1


In [13]:
# Extract features (X) and target (y)
X = df_.drop('magnitude_0_1', axis=1)  # Drop the target column to get the feature matrix X
y = df_['magnitude_0_1']  # Select the target column as the target vector y

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [16]:
#Make predictions and evaluate the model
y_pred = rf_classifier.predict(X_test)

In [17]:
# Evaluate the model using appropriate evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mat)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23735
           1       1.00      1.00      1.00     18030

    accuracy                           1.00     41765
   macro avg       1.00      1.00      1.00     41765
weighted avg       1.00      1.00      1.00     41765

Confusion Matrix:
 [[23735     0]
 [    0 18030]]


### 2. random forest regressor

In [18]:
df_regr=df.copy()

In [19]:
df_regr.head()

Unnamed: 0,year,month,day,hour,minute,second,latitude,longitude,Depth,Magnitude
0,1900,1,5,19,0,0,-3.0,102.0,0.0,6.9
1,1900,4,24,23,16,0,27.0,126.5,35.0,7.0
2,1901,2,15,0,0,0,26.0,100.1,0.0,6.5
3,1901,4,5,21,53,0,2.0,130.0,0.0,6.8
4,1901,6,24,7,2,0,28.0,130.0,35.0,7.5


In [20]:
df_regr.drop(['year', 'month', 'day', 'hour', 'minute', 'second'], axis=1, inplace=True)

In [21]:
# Extract features (X) and target (y)
X = df_regr.drop('Magnitude', axis=1)  # Drop the target column to get the feature matrix X
y = df_regr['Magnitude']  # Select the target column as the target vector y

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [23]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=3)
rf_regressor.fit(X_train, y_train)

RandomForestRegressor(random_state=3)

In [24]:
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 0.24109084542619913
R-squared: 0.2634225998064771


### Compare random forest with regression model

In [25]:
# Create and train the Linear Regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

LinearRegression()

In [26]:
# Make predictions using the Linear Regression model
linear_predictions = linear_regressor.predict(X_test)

# Evaluate the Linear Regression model
linear_mse = mean_squared_error(y_test, linear_predictions)
linear_r2 = r2_score(y_test, linear_predictions)

print("Linear Regression - Mean Squared Error:", linear_mse)
print("Linear Regression - R-squared:", linear_r2)

Linear Regression - Mean Squared Error: 0.2922862334908557
Linear Regression - R-squared: 0.10701116172013736


Linear Regression:

* Mean Squared Error (MSE): 0.2923
 R-squared (R2): 0.1070
* Random Forest Regressor:
Mean Squared Error (MSE): 0.2411
R-squared (R2): 0.2634
* In this comparison:

The MSE for the Random Forest Regressor (0.2411) is lower than that for the Linear Regression model (0.2923). A lower MSE suggests that the Random Forest model provides more accurate predictions on average.

The R-squared (R2) for the Random Forest Regressor (0.2634) is higher than that for the Linear Regression model (0.1070). A higher R2 indicates that the Random Forest model explains more of the variance in the target variable and provides a better fit to the data.

* Based on these metrics:

The Random Forest Regressor outperforms the Linear Regression model in terms of both MSE and R2.
The Random Forest model appears to be a better choice for this regression task as it provides more accurate predictions and captures more of the variance in the earthquake magnitude.

Keep in mind that these results are specific to this dataset and the models used. When comparing models, it's important to consider other factors like model complexity, interpretability, and computational efficiency, as well as domain-specific requirements.

### 4. Random forest multiclass classifier

We will classify the magninute of earthquakes to 3 different classes: 
* Magnitude below 4.2: below median of earthquakes
* Magnitude between 4.2 and 6: low earthquake
* Magnitude above 6: strong earthquake

In [62]:
df_mult=df.copy()

In [63]:
df_mult.head()

Unnamed: 0,year,month,day,hour,minute,second,latitude,longitude,Depth,Magnitude
0,1900,1,5,19,0,0,-3.0,102.0,0.0,6.9
1,1900,4,24,23,16,0,27.0,126.5,35.0,7.0
2,1901,2,15,0,0,0,26.0,100.1,0.0,6.5
3,1901,4,5,21,53,0,2.0,130.0,0.0,6.8
4,1901,6,24,7,2,0,28.0,130.0,35.0,7.5


In [29]:
df_mult.drop(['year', 'month', 'day', 'hour', 'minute', 'second'], axis=1, inplace=True)

In [64]:
# Define a function to categorize the 3 classes
def categorize_magnitude(magnitude):
    if magnitude < 4.2:
        return 0
    elif 4.2 <= magnitude < 6:
        return 1
    else:
        return 2


In [65]:
# Apply the function to create a new column
df_mult['Magnitude_Category'] = df_mult['Magnitude'].apply(categorize_magnitude)

In [66]:
# Extract features (X) and target (y)
X = df_mult.drop(['Magnitude','Magnitude_Category'], axis=1)  # Drop the target column to get the feature matrix X
y = df_mult['Magnitude_Category']  # Select the target column as the target vector y

In [69]:
lst=(y==2)
lst.sum()

2279

In [73]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [76]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=5)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=5)

In [77]:
#Make predictions and evaluate the model
y_pred = rf_classifier.predict(X_test)

In [78]:
set(y_test)

{0, 1, 2}

In [79]:
set(y_pred)

{0, 1, 2}

In [80]:
confusion_mat = confusion_matrix(y_test, y_pred)

In [81]:
confusion_mat

array([[15251,  5384,     0],
       [ 4384, 16263,    30],
       [   35,   245,   173]], dtype=int64)

In [82]:
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mat)

Accuracy: 0.7586974739614509
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.74      0.76     20635
           1       0.74      0.79      0.76     20677
           2       0.85      0.38      0.53       453

    accuracy                           0.76     41765
   macro avg       0.79      0.64      0.68     41765
weighted avg       0.76      0.76      0.76     41765

Confusion Matrix:
 [[15251  5384     0]
 [ 4384 16263    30]
 [   35   245   173]]
