In [None]:
# pip install scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

# pip install tensorflow
import tensorflow as tf
import keras
from keras import layers

In [None]:
# Read the raw CSV into the working frame and preview the first rows.
csv_path = 'database.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Count the missing values in each column.
df.isna().sum()

# I will drop the columns that have more than 30% of missing values

In [None]:
# Columns removed from the frame before modelling.
sparse_columns = [
    'Depth Error',
    'Depth Seismic Stations',
    'Magnitude Error',
    'Magnitude Seismic Stations',
    'Azimuthal Gap',
    'Horizontal Distance',
    'Horizontal Error',
    'Root Mean Square',
]
df = df.drop(columns=sparse_columns)

In [None]:
# Re-check the per-column missing-value counts after dropping columns.
df.isna().sum()

In [None]:
# Remove the rows with a missing value in the 'Magnitude Type' column,
# then show the remaining per-column missing-value counts.
required_columns = ['Magnitude Type']
df = df.dropna(subset=required_columns)

df.isna().sum()

In [None]:
# Preview the first rows after the missing-value cleanup.
df.head()

In [None]:
# Plot a histogram of the Latitude column.
df['Latitude'].hist()

# NOTE(review): the histogram is read here as a normal distribution - confirm against the rendered plot

In [None]:
# Inspect the distinct 'Source' values and how often each occurs.
values = df['Source'].unique()
value_counts = df['Source'].value_counts()

# Encode the two mapped sources as 1/0. Any other value that is not
# numeric becomes NaN through the coercion and its row is dropped.
source_codes = {'US': 1, 'ISCGEM': 0}
df['Source'] = pd.to_numeric(df['Source'].replace(source_codes), errors='coerce')
df = df.dropna(subset=['Source'])

In [None]:
# Inspect the distinct 'Location Source' values and how often each occurs.
values = df['Location Source'].unique()
value_counts = df['Location Source'].value_counts()

# Encode the two mapped sources as 1/0. Any other value that is not
# numeric becomes NaN through the coercion and its row is dropped.
location_source_codes = {'US': 1, 'ISCGEM': 0}
df['Location Source'] = pd.to_numeric(
    df['Location Source'].replace(location_source_codes),
    errors='coerce',
)
df = df.dropna(subset=['Location Source'])

In [None]:
# Inspect the distinct 'Status' values and how often each occurs.
values = df['Status'].unique()
value_counts = df['Status'].value_counts()

# Encode the two mapped statuses as 1/0.
df['Status'] = df['Status'].replace({'Automatic': 1, 'Reviewed': 0})

# Convert explicitly to a numeric dtype, the same way the other encoded
# columns in this notebook are handled, instead of relying on replace() to
# downcast the object column. Values outside the mapping become NaN and
# their rows are dropped.
df['Status'] = pd.to_numeric(df['Status'], errors='coerce')

df = df.dropna(subset=['Status'])

In [None]:
# No need to keep the ID column: collect its distinct values for
# inspection, then remove it from the frame.
values_source = df['ID'].unique()
df = df.drop(columns=['ID'])

In [None]:
# Keep the distinct magnitude types for reference, then remove both
# magnitude metadata columns from the frame.
values = df['Magnitude Type'].unique()
df = df.drop(columns=['Magnitude Type'])
df = df.drop(columns=['Magnitude Source'])

In [None]:
# Preview the first rows after the column drops.
df.head()

In [None]:
# Inspect the distinct 'Type' values and how often each occurs.
values = df['Type'].unique()
value_counts = df['Type'].value_counts()

# Encode the two mapped event types as 1/0. Any other value that is not
# numeric becomes NaN through the coercion and its row is dropped.
type_codes = {'Earthquake': 1, 'Nuclear Explosion': 0}
df['Type'] = pd.to_numeric(df['Type'].replace(type_codes), errors='coerce')
df = df.dropna(subset=['Type'])

In [None]:
# Remove the 'Date' and 'Time' columns from the frame.
df = df.drop(columns=['Date', 'Time'])

In [None]:
# Coerce every column to numeric; entries that cannot be parsed turn into NaN.
numeric_data = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Per column: True when at least one entry is NaN after the coercion.
non_numeric_values = numeric_data.isna().any()
non_numeric_values

In [None]:
# Pairwise correlation matrix of the columns, rounded to two decimals.
df.corr().round(2)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Features: every column except the target.
X = df.drop(columns=['Latitude'])

# Target (dependent variable): the Latitude column on its own.
y_temp = df['Latitude']

# The model is trained as a classifier, so each distinct target value is
# first mapped to an integer class index by the encoder...
le = LabelEncoder()
le.fit(y_temp)
y_enc = le.transform(y_temp)

# ...and the indices are then expanded into the one-hot matrix form that
# TensorFlow/Keras uses for categorical targets.
y = tf.keras.utils.to_categorical(y_enc)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# chi2 works on non-negative features: truncate all continuous
# variables to integers and clip negative numbers to 0.
X_cat = X.astype(int).clip(lower=0)

# The chi2 test scores how strongly each feature is related to the target
# classes (as opposed to being independent of them). k equals the number
# of columns, so every feature is kept and gets a score.
chi_2_features = SelectKBest(chi2, k=len(X_cat.columns))

# chi2 takes the class labels as a 1-D vector, so pass the label-encoded
# target (y_enc) directly instead of casting the whole dense one-hot
# matrix to int.
best_features = chi_2_features.fit(X_cat, y_enc)

# use decimal format in table print later
pd.options.display.float_format = '{:.2f}'.format

# Wrap it up and show the results: the higher the score, the more effect
# that column has on the target (Latitude).
df_features = pd.DataFrame(best_features.scores_)
df_columns = pd.DataFrame(X_cat.columns)
f_scores = pd.concat([df_columns, df_features], axis=1)
f_scores.columns = ['Features', 'Score']
f_scores.sort_values(by='Score', ascending=False)

In [None]:
# 70 % train, then the remaining 30 % is halved into validation and test.
# A fixed random_state makes both splits reproducible, so a
# Restart & Run All gives the same train/validation/test rows every time.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# save the categories into a list
categories = list(np.unique(df['Latitude']))
print(categories)

In [None]:
# Network input width (one value per feature column) and output width
# (one softmax unit per category).
n_features = len(X.columns)
n_classes = len(categories)

model = keras.Sequential(
    [
        # Declare the input with keras.Input rather than passing
        # input_shape= to the first layer.
        keras.Input(shape=(n_features,)),
        layers.BatchNormalization(),
        # The regularization factor is passed positionally: the legacy
        # `l=` keyword is not part of the current L1(l1=...) signature.
        layers.Dense(16, activation="relu", kernel_regularizer=keras.regularizers.l1(0.1)),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(8, activation="relu"),
        layers.Dense(n_classes, activation="softmax")
    ]
)


# compile the model: categorical crossentropy is the loss function,
# and accuracy is tracked as an additional metric
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 500 epochs, scoring the validation split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    epochs=500,
    validation_data=(X_val, y_val),
)

In [None]:
# Per-epoch training history as a frame; plot training vs. validation loss.
loss_df = pd.DataFrame(model.history.history)
loss_df.plot(y=['loss', 'val_loss'])

In [None]:
# Plot training vs. validation accuracy per epoch.
loss_df.plot(y=['accuracy', 'val_accuracy'])

In [None]:
# Evaluate on the test split and on the training split, printing the
# values returned by model.evaluate for each.
test_scores = model.evaluate(X_test, y_test, verbose=0)
print("Test data evaluation:")
print(test_scores)

train_scores = model.evaluate(X_train, y_train, verbose=0)
print("\nTrain data evaluation:")
print(train_scores)

In [None]:
# Predicted class index per test row: argmax over the softmax outputs.
test_predictions = np.argmax(model.predict(X_test), axis=1)

# y_test starts out as a one-hot matrix; collapse it to class indices too.
# The guard makes this cell safe to re-run: once y_test is 1-D, a second
# argmax over axis=1 would fail.
# NOTE: after this cell y_test holds class indices, so the
# model.evaluate(X_test, y_test) cell must not be re-run without
# re-creating the split first.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis=1)

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the label set to every encoded class index. Without `labels`,
# confusion_matrix only covers the classes that occur in y_test or
# test_predictions, so the matrix could be smaller than `categories` and
# the tick labels would no longer line up with its rows/columns.
class_indices = np.arange(len(categories))
conf_matrix = confusion_matrix(y_test, test_predictions, labels=class_indices)

# use the actual category values as tick labels
sns.heatmap(conf_matrix, xticklabels=categories, yticklabels=categories, annot=True, fmt='g')

In [None]:
# classification_report / accuracy_score are never imported by name in this
# notebook, so they are reached through the `metrics` module imported at
# the top.
# target_names must be strings (the categories are numeric), and `labels`
# pins the report to every encoded class so the names always match the
# label set even when some classes are absent from the test split.
class_indices = list(range(len(categories)))
class_names = [str(category) for category in categories]
print(metrics.classification_report(
    y_test,
    test_predictions,
    labels=class_indices,
    target_names=class_names,
))

# get overall accuracy of the model and print it
acc = metrics.accuracy_score(y_test, test_predictions)
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))

In [None]:
# roc_auc_score is never imported by name in this notebook, so it is
# reached through the `metrics` module imported at the top.
# NOTE(review): this scores the full X / y, which includes the rows the
# model was trained on, so it is not a held-out estimate.
metrics.roc_auc_score(y, model.predict(X), multi_class="ovr")

In [None]:
# List the columns that remain in the frame (reference for the tester row below).
df.columns

In [None]:
# Hand-made sample, one value per feature, wrapped into a single-row
# DataFrame (pandas format).
tester_values = dict([
    ('Longitude', 145),
    ('Type', 1),
    ('Depth', 130),
    ('Magnitude', 6),
    ('Source', 0),
    ('Location Source', 0),
    ('Status', 1),
])

tester_row = pd.DataFrame([tester_values])

In [None]:
# The model ends in a softmax layer, so predict() returns one probability
# per latitude class for the row, not a latitude. Take the most probable
# class index and decode it back to the original Latitude value with the
# label encoder that produced the class indices.
class_probabilities = model.predict(tester_row)[0]
predicted_class = int(np.argmax(class_probabilities))
result = le.inverse_transform([predicted_class])[0]

print()
print("Estimated Latitude:")
print(f"{round(float(result), 2)}")
print("----------------")