In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report,confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, StackingClassifier

from sklearn.model_selection import GridSearchCV

## Preprocessing

* log transformation of twi-feature
* cos/sin transformation of aspect feature
* selecting relevant columns
* standard scaling

In [None]:
# Load the training and test tables from CSV files in the local data/ directory.
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')

In [None]:
# log-transform of columns
def log_transform(df, column, floor=1.0e-5, suffix='_log'):
    """
    Add a natural-log version of every column whose name contains `column`.

    For each matching column `c` a new column `c + suffix` is written into
    `df` in place. Values that are not greater than `floor` (this includes
    NaN, for which the comparison is False) are replaced by `floor` before
    the logarithm is taken, so the result is always finite.

    Columns that already end in `suffix`, i.e. outputs of an earlier call,
    are skipped. Without this guard, re-running the cell would create
    '<col>_log_log' columns, and a substring-based feature selection on
    'twi_log' would silently pick them up.

    Input:
     df: DataFrame that is modified in place
     column: substring identifying the source columns
     floor: smallest value passed to the logarithm
     suffix: text appended to the source column name
    Returns None.
    """
    # Snapshot the source names first: new columns are added inside the loop.
    source_cols = [col for col in df.columns
                   if column in col and not col.endswith(suffix)]
    for col in source_cols:
        floored = np.where(df[col] > floor, df[col], floor)
        df[col + suffix] = np.log(floored)

In [None]:
# Add a '<col>_log' column for every twi column, in both the train and the test table
for frame in (train, test):
    log_transform(frame, 'twi')

In [None]:
# sine/cosine of angle-columns
def angle_transform(df, column):
    """
    Add sine and cosine versions of every column whose name contains `column`.

    The source values are converted from degrees to radians, then for each
    matching column `c` the columns `c + '_sin'` and `c + '_cos'` are written
    into `df` in place.

    Columns that already end in '_sin' or '_cos', i.e. outputs of an earlier
    call, are skipped. Without this guard, re-running the cell would create
    '<col>_sin_sin', '<col>_sin_cos', ... columns, and a substring-based
    feature selection on 'aspect_sin' / 'aspect_cos' would pick them up.

    Input:
     df: DataFrame that is modified in place
     column: substring identifying the angle columns (values in degrees)
    Returns None.
    """
    derived_suffixes = ('_sin', '_cos')
    # Snapshot the source names first: new columns are added inside the loop.
    source_cols = [col for col in df.columns
                   if column in col and not col.endswith(derived_suffixes)]
    for col in source_cols:
        radians = np.deg2rad(df[col])
        df[col + '_sin'] = np.sin(radians)
        df[col + '_cos'] = np.cos(radians)

In [None]:
# Add '<col>_sin' / '<col>_cos' columns for every aspect column, in both tables
for frame in (train, test):
    angle_transform(frame, 'aspect')

In [None]:
# Feature names to keep; a column is selected when its name contains one of them
selected_features = ['elevation', 'slope', 'aspect_sin', 'aspect_cos',
                     'placurv', 'procurv', 'lsfactor', 'twi_log', 'geology', 'sdoif']
# One entry per (column, matching feature) pair, in the column order of `train`
selected_cols = [col
                 for col in train.columns
                 for feature in selected_features
                 if feature in col]

In [None]:
# Target vector and feature matrix
y = train['Label']
X = train[selected_cols]

# Stratified 70/30 split so that both parts keep a similar label ratio
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=2022,
)

In [None]:
# Standardise the features; the scaler statistics come from the training split only.
# X_train / X_val are overwritten, so re-running this cell alone rescales
# already-scaled data -- re-run from the split cell instead.
sc = StandardScaler().fit(X_train)
X_train, X_val = sc.transform(X_train), sc.transform(X_val)

In [None]:
# Same columns as for training, scaled with the scaler fitted on the training split
X_test = sc.transform(test[selected_cols])

## Confusion matrix

In [None]:
def plot_confusion_matrix(true, predicted, classifier):
    """
    Draw the confusion matrix of a fitted classifier in a small figure.

    Input:
     true: actual labels
     predicted: labels predicted by the classifier
     classifier: fitted classifier; its `classes_` attribute fixes the
                 order and the tick labels of the matrix
    """
    class_labels = classifier.classes_
    matrix = confusion_matrix(true, predicted, labels=class_labels)

    # Small dedicated axes so the plot does not take over the notebook output
    _, axis = plt.subplots(figsize=(3,2))
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_labels).plot(ax=axis)
    plt.show()

In [None]:
def fit_clf_and_results(x_train, x_val, y_train, y_val, classifier):
    """
    Train `classifier`, report how it performs on the validation data and
    hand back the validation predictions.

    Output produced: the classifier repr with its F1 score, the
    classification report and the confusion matrix plot.

    Input:
     x_train: features used for fitting
     x_val: features used for validation
     y_train: labels belonging to x_train
     y_val: labels belonging to x_val
     classifier: a classifier from sklearn (fitted in place)
    Returns:
     y_pred: predictions of the fitted classifier for x_val
    """
    # Fit on the training data, then predict the validation data
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_val)

    # Headline metric: F1 score of the validation predictions
    print(f'{classifier} \nF1 score on the X_val is: {f1_score(y_val, y_pred)}\n')

    # Per-class precision / recall / F1 table
    report = classification_report(y_val, y_pred)
    print(report)

    # Visual summary of correct and wrong predictions
    plot_confusion_matrix(y_val, y_pred, classifier)

    return y_pred

In [None]:
# Try the helper with a LogisticRegression that uses only default settings
# (presumably a quick check of the evaluation helper -- see the tuned baseline below).
log_test = LogisticRegression()
y_pred_log_test = fit_clf_and_results(X_train, X_val, y_train, y_val, log_test)

In [None]:
# Display the validation predictions returned by the helper
y_pred_log_test

In [None]:
# Predict test-set labels with the default logistic regression.
# Note: the name y_test_logreg is reassigned further down from the class-weighted model.
y_test_logreg = log_test.predict(X_test)
y_test_logreg

## Logistic regression (Baseline Model)

In [None]:
# Baseline: class-weighted logistic regression, fitted and evaluated with the shared helper
logreg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=2022,
)
y_pred_logreg = fit_clf_and_results(X_train, X_val, y_train, y_val, logreg)

In [None]:
# Predict test-set labels with the class-weighted baseline and display them
y_test_logreg = logreg.predict(X_test)
y_test_logreg

## K-Nearest-Neighbors

In [None]:
# Train the model and report validation metrics through the shared helper
# (F1 score, classification report and confusion matrix, all on X_val).
# The hand-written copy of this logic labelled the score as "on the X_test"
# and its comment called it an AUC score; both were wrong.
knn = KNeighborsClassifier(n_neighbors=15, weights='distance', p=1)
y_pred_knn = fit_clf_and_results(X_train, X_val, y_train, y_val, knn)

In [None]:
# Predict test-set labels with the fitted KNN model and display them
y_test_knn = knn.predict(X_test)
y_test_knn

### Hyperparameter Tuning using Grid Search

In [None]:
# Candidate KNN settings: neighbourhood size, Minkowski power (1 or 2) and vote weighting
parameters = {'n_neighbors':[4,6,8,12,15,20], 'p':[1, 2], 'weights':('uniform','distance')}
knn_clf = KNeighborsClassifier()
# Rank candidates by F1, the metric reported everywhere else in this notebook;
# without `scoring`, GridSearchCV falls back to the estimator's accuracy score.
clf = GridSearchCV(knn_clf, parameters, scoring='f1')
clf.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the KNN grid search
clf.best_params_

In [None]:
# Show the cross-validation results as a table, best-ranked candidates first,
# instead of dumping the raw dict of arrays
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

## Support Vector Machine

In [None]:
# Train the model and report validation metrics through the shared helper
# (F1 score, classification report and confusion matrix, all on X_val).
# The hand-written copy of this logic labelled the score as "on the X_test"
# and its comment called it an AUC score; both were wrong.
svm = SVC(class_weight='balanced', random_state=2022)
y_pred_svm = fit_clf_and_results(X_train, X_val, y_train, y_val, svm)

In [None]:
# Predict test-set labels with the fitted SVM and display them
y_test_svm = svm.predict(X_test)
y_test_svm

### Hyperparameter Tuning using Grid Search

In [None]:
# Candidate SVC settings: kernel, regularisation strength C and class weighting
parameters = {'kernel':('poly', 'rbf', 'sigmoid'), 'C':[0.1, 0.5, 1, 2], 'class_weight':(None,'balanced')}
SVC_clf = SVC()
# Rank candidates by F1, the metric reported everywhere else in this notebook;
# without `scoring`, GridSearchCV falls back to the estimator's accuracy score,
# which cannot compare the weighted and unweighted variants fairly on skewed labels.
clf = GridSearchCV(SVC_clf, parameters, scoring='f1')
clf.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the SVC grid search
clf.best_params_

### SVM with tuned Threshold by saheedniyi

In [None]:
# SVM with probability estimates enabled, so a custom decision threshold can be applied
svm_proba = SVC(probability=True, class_weight='balanced', random_state=2022)
svm_proba.fit(X_train, y_train)

# Keep only the second probability column (the one for svm_proba.classes_[1])
val_proba = svm_proba.predict_proba(X_val)
y_pred_proba_svm = pd.Series(val_proba[:,1])

In [None]:
def tune_threshold(proba,threshold):
    """
    Turn a probability into a hard 0/1 label.

    Input:
     proba: predicted probability
     threshold: cut-off; only values strictly greater than it map to 1
    Returns 1 if proba > threshold, otherwise 0.
    """
    return 1 if proba > threshold else 0

In [None]:
# Coarse scan: validation F1 for every threshold 0.00, 0.01, ..., 0.99.
# Predictions are 1 where the probability is strictly above the threshold.
# f1_score is called as (y_true, y_pred), matching its signature and the
# rest of the notebook.
threshold_range=np.arange(0,1,0.01)
f1_scores = [f1_score(y_val, (y_pred_proba_svm > t).astype(int))
             for t in threshold_range]

In [None]:
# F1 as a function of the decision threshold, labelled so the figure stands alone
fig, ax = plt.subplots()
ax.plot(threshold_range, f1_scores)
ax.set(xlabel='decision threshold', ylabel='F1 score (validation)',
       title='SVM: F1 score vs. decision threshold')
plt.show()

In [None]:
# Fine scan: validation F1 for thresholds from 0.35 up to (not including) 0.45
# in steps of 0.001.
# Predictions are 1 where the probability is strictly above the threshold.
# f1_score is called as (y_true, y_pred), matching its signature and the
# rest of the notebook.
threshold_range=np.arange(0.35,0.45,0.001)
f1_scores = [f1_score(y_val, (y_pred_proba_svm > t).astype(int))
             for t in threshold_range]

In [None]:
# F1 as a function of the decision threshold, labelled so the figure stands alone
fig, ax = plt.subplots()
ax.plot(threshold_range, f1_scores)
ax.set(xlabel='decision threshold', ylabel='F1 score (validation)',
       title='SVM: F1 score vs. decision threshold')
plt.show()

In [None]:
# Print every scanned threshold that reaches the highest F1 score (ties included)
max_f1 = max(f1_scores)
for idx in np.flatnonzero(np.asarray(f1_scores) == max_f1):
    print(threshold_range[idx], f1_scores[idx])

In [None]:
# Decision threshold used for the tuned SVM predictions below.
# NOTE(review): hard-coded rather than derived from f1_scores -- presumably
# hand-picked from the scan output above; re-check it if the data or model change.
best_threshold = 0.385

In [None]:
# Confusion matrix of the validation predictions at the chosen threshold
y_pred_svm_tuned = y_pred_proba_svm.apply(tune_threshold, args=(best_threshold,))
plot_confusion_matrix(y_val, y_pred_svm_tuned, svm_proba)

In [None]:
# Score the test set, binarise with the chosen threshold and write the submission file
test_proba = svm_proba.predict_proba(X_test)[:,1]
sub = pd.DataFrame({"Sample_ID": test['Sample_ID'], "Label": test_proba}).set_index("Sample_ID")
sub["Label"] = sub["Label"].apply(tune_threshold, args=(best_threshold,))

sub.to_csv("data/submission_svm_tunedThreshold.csv")

## Ensemble Methods

### Random Forest

In [None]:
# Train the model and report validation metrics through the shared helper
# (F1 score, classification report and confusion matrix, all on X_val).
# random_state is fixed so the forest, and therefore the reported scores, are
# reproducible; the value matches the seed used for the forest in the stack.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2022)
y_pred_rf = fit_clf_and_results(X_train, X_val, y_train, y_val, rf)

### Stacked Estimators

In [None]:
# Base learners of the stack; the commented-out entries are the unweighted
# variants that are currently not part of the ensemble.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2022)),
    ('lr_bal', LogisticRegression(class_weight='balanced', random_state = 2022, max_iter=1000)),
    #('lr_unbal', LogisticRegression(random_state = 42, max_iter=1000)),
    ('knn', KNeighborsClassifier(n_neighbors=15, weights='distance', p=1)),
    ('svm_bal', SVC(class_weight='balanced', random_state=2022)),
    #('svm_unbal', SVC(random_state=42)),
    ]

# Meta-learner that combines the base learners' outputs
final_est = LogisticRegression(class_weight='balanced', random_state = 42)

stack_clf = StackingClassifier(estimators=estimators, final_estimator=final_est)

# Fit the stack and report validation metrics through the shared helper
# (F1 score, classification report and confusion matrix, all on X_val).
# The hand-written copy of this logic labelled the score as "on the X_test"
# and its comment called it an AUC score; both were wrong.
y_pred_stack_clf = fit_clf_and_results(X_train, X_val, y_train, y_val, stack_clf)

In [None]:
# Predict test-set labels with the fitted stacking classifier and display them
y_test_stack = stack_clf.predict(X_test)
y_test_stack

## Submission file

In [None]:
# Pair each Sample_ID with its predicted label by position.
# y_test_stack is a plain array in the row order of `test`; building the frame
# from raw values avoids the index alignment of pd.concat, which would produce
# NaN rows if `test` ever carried a non-default index.
y_submission = pd.DataFrame({
    'Sample_ID': test['Sample_ID'].to_numpy(),
    'Label': y_test_stack,
})
y_submission.to_csv('data/submission_stack_rf-knn-lr-bal-svm-bal.csv',index=False)

## Analysing the Results of SVM

In [None]:
# SVM validation predictions as a one-column frame that shares y_val's index
results_svm = pd.DataFrame({'Label_pred': y_pred_svm}, index=y_val.index)

In [None]:
# Scaled validation features (a plain array after scaling) as a labelled frame on y_val's index
X_val_df = pd.DataFrame(X_val, index=y_val.index, columns=selected_cols)

In [None]:
# Combine scaled validation features, true labels and SVM predictions column-wise
results_svm_full = pd.concat([X_val_df, y_val, results_svm], axis=1)

In [None]:
# One boolean mask per outcome, in the same order as `choices` below
conditions = [
    results_svm_full['Label'].eq(0) & results_svm_full['Label_pred'].eq(0),
    results_svm_full['Label'].eq(1) & results_svm_full['Label_pred'].eq(0),
    results_svm_full['Label'].eq(0) & results_svm_full['Label_pred'].eq(1),
    results_svm_full['Label'].eq(1) & results_svm_full['Label_pred'].eq(1)
]

choices = ['TN','FN','FP','TP']

# The default must be a string like the choices: recent NumPy versions refuse to
# mix string choices with an integer default and raise a TypeError. '0' is what
# older versions produced from default=0, so rows matching no condition (none
# are expected for 0/1 labels) get the same value as before.
results_svm_full['Result'] = np.select(conditions, choices, default='0')

In [None]:
import seaborn as sns

In [None]:
# Histogram of every '1_' feature, split by prediction outcome, one panel per feature.
# The earlier layout had ten plots for nine axes, so aspect_sin and aspect_cos
# were drawn on top of each other; a 4x3 grid gives each feature its own panel.
hist_features = ["1_elevation", "1_slope", "1_aspect_sin", "1_aspect_cos",
                 "1_placurv", "1_procurv", "1_lsfactor", "1_twi_log",
                 "1_geology", "1_sdoif"]

fig, axs = plt.subplots(4, 3, figsize = (13, 13))
present_df = results_svm_full

for panel_no, (ax, feature) in enumerate(zip(axs.flat, hist_features)):
    # geology bars are dodged per outcome, as in the original plot
    extra = {"multiple": "dodge"} if feature == "1_geology" else {}
    sns.histplot(present_df, x=feature, hue="Result", ax=ax, element="step",
                 fill=False, legend=(panel_no == 0), **extra)

# Hide the panels of the grid that are not needed
for ax in axs.flat[len(hist_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Marker per outcome for the scatter plots: dots for correct predictions (TN/TP),
# left/right triangles for the two error types (FP/FN)
markers = {'TN': '.', 'TP': '.', 'FP': '<', 'FN': '>'}

In [None]:
# Elevation vs. slope ('13_' columns), coloured and shaped by prediction outcome
sns.scatterplot(data=results_svm_full, x='13_elevation', y='13_slope',
                hue='Result', style='Result', markers=markers);

In [None]:
# Log-TWI vs. LS factor ('13_' columns), coloured and shaped by prediction outcome
sns.scatterplot(data=results_svm_full, x='13_twi_log', y='13_lsfactor',
                hue='Result', style='Result', markers=markers);

In [None]:
# placurv vs. sdoif ('13_' columns), coloured and shaped by prediction outcome
sns.scatterplot(data=results_svm_full, x='13_placurv', y='13_sdoif',
                hue='Result', style='Result', markers=markers);