# Models for the Zindi-Landslide-Competition

A description of the data can be found in the notebook EDA.

The metric for this competition is the f1-score.

My final score on the public leaderboard is 0.76586, which ranked 56th out of 149.
The top score is 0.79351.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, recall_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.ensemble import StackingClassifier, HistGradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

## Preprocessing

As seen in the EDA, the twi-feature is highly skewed and the aspect-feature is circular. The columns of these features have to be transformed before they can be used in machine-learning algorithms.

The preprocessing includes:
* log transformation of twi-feature
* cos/sin transformation of aspect-feature
* selecting relevant columns
* standard scaling

In [None]:
# load data
# read the competition training and test tables from the local data folder
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')

In [None]:
# log-transform of columns
def log_transform(df, column, floor=1.0e-5):
    """
    add a natural-log transformed copy of every column whose name contains `column`
    the new columns are named '<original name>_log' and are added to df in place
    nothing is returned
    Input:
     df: DataFrame that is modified in place
     column: substring used to select the columns that are transformed
     floor: smallest value handed to the logarithm; every value that is not
            greater than floor is replaced by floor before the log is taken
    """
    # snapshot the matching names first, so the columns added below are never visited
    matching_cols = [col for col in df.columns if column in col]
    for col in matching_cols:
        # values failing the comparison (this includes NaN) are replaced by floor
        floored = np.where(df[col] > floor, df[col], floor)
        df[col + '_log'] = np.log(floored)

In [None]:
# add the log-transformed twi columns to both the train and the test set
for frame in (train, test):
    log_transform(frame, 'twi')

In [None]:
# sine/cosine of circular-columns
def angle_transform(df, column):
    """
    add the sine and cosine of every column whose name contains `column`
    the column values are treated as angles in degrees
    the new columns '<original name>_sin' and '<original name>_cos' are added
    to df in place, nothing is returned
    Input:
     df: DataFrame that is modified in place
     column: substring used to select the columns that are transformed
    """
    angle_cols = [name for name in df.columns if column in name]
    for name in angle_cols:
        # convert degrees to radians once and reuse for both components
        radians = np.deg2rad(df[name])
        df[name + '_sin'] = np.sin(radians)
        df[name + '_cos'] = np.cos(radians)

In [None]:
# add the sine/cosine aspect columns to both the train and the test set
for frame in (train, test):
    angle_transform(frame, 'aspect')

In [None]:
# select all columns of specific features including the transformed columns of twi and aspect
selected_features = ['elevation', 'slope', 'aspect_sin', 'aspect_cos',
                     'placurv', 'procurv', 'lsfactor', 'twi_log', 'geology', 'sdoif']

# a column is kept when its name contains one of the feature names;
# any() guarantees that a column is listed only once, even if its name
# happens to match more than one feature
selected_cols = [col for col in train.columns
                 if any(feature in col for feature in selected_features)]

In [None]:
# target (y) and features (X) used in modelling
y = train['Label']
X = train[selected_cols]

# hold out 30% of the rows for validation; stratifying on y keeps the
# label ratio similar in the train and the validation part
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2022,
)

In [None]:
# Standardise the features: the scaler is fitted on the training part only
# and the same fitted scaler is then applied to both parts
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_val = sc.transform(X_val)

In [None]:
# Pick the same columns from the test set and scale them with the scaler
# that was fitted on the training data
X_test = sc.transform(test[selected_cols])

## Model results and Confusion matrix

Define functions used for presenting the results of different algorithms. 

In [None]:
def plot_confusion_matrix(true, predicted, classifier):
    """
    draw the confusion matrix of a fitted classifier as a small figure
    Input:
     true: actual values
     predicted: values predicted by the classifier
     classifier: fitted classifier; its classes_ attribute defines the
                 order and the tick labels of the matrix
    """
    class_labels = classifier.classes_
    matrix = confusion_matrix(true, predicted, labels=class_labels)

    # only the axes object is needed, the figure handle is not used
    _, axis = plt.subplots(figsize=(3, 2))
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_labels).plot(ax=axis)
    plt.show()

In [None]:
def fit_clf_and_results(x_train, x_val, y_train, y_val, classifier):
    """
    train a classifier and report how it performs on the validation data
    prints the f1-score and the classification report and plots the confusion matrix
    returns y_pred, f1-score, recall-score
    Input:
     x_train: data used for training the classifier
     x_val: data used for validating the classifier
     y_train: label of training data
     y_val: label of validation data
     classifier: a classifier from sklearn (it is fitted in place)
    """
    # fit on the training data and predict the validation data
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_val)

    # f1-score and recall-score of the validation predictions
    f1, recall = (metric(y_val, y_pred) for metric in (f1_score, recall_score))

    # report: headline f1-score, per-class report, confusion matrix
    print(f'{classifier} \nF1 score on the X_val is: {f1}\n')
    print(classification_report(y_val, y_pred))
    plot_confusion_matrix(y_val, y_pred, classifier)

    return y_pred, f1, recall

## Logistic regression (Baseline Model)

In [None]:
# Baseline: logistic regression with balanced class weights
logreg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=2022,
)
y_pred_logreg, f1_logreg, recall_logreg = fit_clf_and_results(
    X_train, X_val, y_train, y_val, logreg
)

In [None]:
# predict the scaled test set with the fitted logistic regression and show the predictions
y_test_logreg = logreg.predict(X_test)
y_test_logreg

## K-Nearest-Neighbors

### Hyperparameter Tuning using Grid Search

Finding best parameters for the K-nearest-neighbors algorithm

In [None]:
# parameters = {'n_neighbors':[4,6,8,12,15,20], 'p':[1, 2], 'weights':('uniform','distance')}
# knn_clf = KNeighborsClassifier()
# clf = GridSearchCV(knn_clf, parameters, scoring='f1')
# clf.fit(X_train, y_train)

In [None]:
# clf.best_params_

### Applying the best parameters

In [None]:
# K-nearest-neighbors with fixed hyperparameters
knn = KNeighborsClassifier(
    n_neighbors=12,
    p=1,
    weights='distance',
)
y_pred_knn, f1_knn, recall_knn = fit_clf_and_results(
    X_train, X_val, y_train, y_val, knn
)

In [None]:
# predict the scaled test set with the fitted KNN classifier and show the predictions
y_test_knn = knn.predict(X_test)
y_test_knn

## Support Vector Machine

### Hyperparameter Tuning using Grid Search

In [None]:
# parameters = {'kernel':('poly', 'rbf', 'sigmoid'), 'C':[0.1, 0.5, 1, 2], 'class_weight':(None,'balanced')}
# SVC_clf = SVC()
# clf = GridSearchCV(SVC_clf, parameters, scoring='f1')
# clf.fit(X_train, y_train)

In [None]:
# clf.best_params_

### Applying the best parameters

In [None]:
# Support vector classifier with C=2 and balanced class weights
svm = SVC(
    C=2,
    class_weight='balanced',
    random_state=2022,
)
y_pred_svm, f1_svm, recall_svm = fit_clf_and_results(
    X_train, X_val, y_train, y_val, svm
)

In [None]:
# predict the scaled test set with the fitted SVM and show the predictions
y_test_svm = svm.predict(X_test)
y_test_svm

## Ensemble Methods

Combining several models to achieve better results than by using just a single model. However, this makes the model more complex, more time-consuming to train and less understandable.

### Histogram Gradient Boosting

The histogram gradient boosting turned out to perform better than Random Forest and normal gradient boosting.

In [None]:
# parameters = {'max_depth': (10,20,30), 'max_leaf_nodes':(10,31,50)}
# gb_clf = HistGradientBoostingClassifier()
# clf = GridSearchCV(gb_clf, parameters, scoring='f1')
# clf.fit(X_train, y_train)

In [None]:
# clf.best_params_

In [None]:
# Histogram gradient boosting with fixed hyperparameters
# NOTE(review): per the scikit-learn docs `scoring` is only used for early
# stopping - confirm that it has an effect with this training set size
gb = HistGradientBoostingClassifier(
    max_depth=20,
    max_iter=1000,
    max_leaf_nodes=50,
    random_state=2022,
    scoring='f1',
)
y_pred_gb, f1_gb, recall_gb = fit_clf_and_results(
    X_train, X_val, y_train, y_val, gb
)

### Stacked Estimators

The hyperparameters used here turned out to perform better than the ones found for each classifier on its own.

In [None]:
# base estimators of the stack, each paired with the name it is addressed by
estimators = [
    (
        'lr_bal',
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=2022),
    ),
    (
        'knn',
        KNeighborsClassifier(n_neighbors=13, p=1, weights='distance'),
    ),
    (
        'svm_bal',
        SVC(C=1, class_weight='balanced', random_state=2022),
    ),
    (
        'gb',
        HistGradientBoostingClassifier(
            max_depth=20, max_iter=500, max_leaf_nodes=60, random_state=2022
        ),
    ),
]

# final estimator of the stack
final_est = LogisticRegression(class_weight='balanced', random_state=42)

stack_clf = StackingClassifier(estimators=estimators, final_estimator=final_est)
y_pred_stack_clf, f1_stack_clf, recall_stack_clf = fit_clf_and_results(
    X_train, X_val, y_train, y_val, stack_clf
)

StackingClassifier(estimators=[('lr_bal',
                                LogisticRegression(class_weight='balanced',
                                                   max_iter=1000,
                                                   random_state=2022)),
                               ('knn',
                                KNeighborsClassifier(n_neighbors=13, p=1,
                                                     weights='distance')),
                               ('svm_bal',
                                SVC(C=1, class_weight='balanced',
                                    random_state=2022)),
                               ('gb',
                                HistGradientBoostingClassifier(max_depth=20,
                                                               max_iter=500,
                                                               max_leaf_nodes=60,
                                                               random_state=2022))],
                   final_estimator=LogisticRegression(class_weight='balanced',
                                                      random_state=42)) 
F1 score on the X_val is: 0.7147505422993492

In [None]:
# Estimator list kept for reference only (not executed); it differs from the
# list used for stack_clf in the SVC setting (C=2 here):
# ('lr_bal',
#                                 LogisticRegression(class_weight='balanced',
#                                                    max_iter=1000,
#                                                    random_state=2022)),
#                                ('knn',
#                                 KNeighborsClassifier(n_neighbors=13, p=1,
#                                                      weights='distance')),
#                                ('svm_bal',
#                                 SVC(C=2, class_weight='balanced',
#                                     random_state=2022)),
#                                ('gb',
#                                 HistGradientBoostingClassifier(max_depth=20,
#                                                                max_iter=500,
#                                                                max_leaf_nodes=60,
#                                                                random_state=2022))],

# show the validation f1-score of the stacked classifier
f1_stack_clf

In [None]:
# Predict on test set with the fitted stacked classifier and show the predictions;
# y_test_stack is the array written to the submission file
y_test_stack = stack_clf.predict(X_test)
y_test_stack

In [None]:
# base estimators for the grid search over the stack
# (this rebinds `estimators` and `final_est` from the earlier stacking cell)
estimators = [
    (
        'lr_bal',
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=2022),
    ),
    (
        'knn',
        KNeighborsClassifier(n_neighbors=15, p=1, weights='distance'),
    ),
    (
        'svm_bal',
        SVC(class_weight='balanced', random_state=2022),
    ),
    (
        'gb',
        HistGradientBoostingClassifier(
            max_depth=20, max_iter=500, max_leaf_nodes=50, random_state=2022
        ),
    ),
]

final_est = LogisticRegression(class_weight='balanced', random_state=42)

sclf = StackingClassifier(estimators=estimators, final_estimator=final_est)

# '<estimator name>__<parameter>' addresses a parameter of a base estimator
params = {
    'knn__n_neighbors': [10, 12, 15],
    'svm_bal__C': [1, 2],
}

# 5-fold cross-validated search on the training part, scored by f1
grid = GridSearchCV(estimator=sclf, param_grid=params, scoring='f1', cv=5)
grid.fit(X_train, y_train)

In [None]:
# best mean cross-validated f1-score found by the grid search
grid.best_score_

In [None]:
# stacked classifier with the best parameter combination of the grid search
grid.best_estimator_

In [None]:
# collect the full cross-validation results of the grid search in a table
grid_results = pd.DataFrame(grid.cv_results_)
grid_results

In [None]:
# do not truncate cell contents, so the params dictionaries are shown completely
pd.set_option('display.max_colwidth', None)
# parameter combinations ordered from best to worst rank
grid_results[['params', 'rank_test_score']].sort_values(by='rank_test_score')

## Submission file

Creating and saving the submission file for the challenge.

In [None]:
# predictions of the stacked classifier as a one-column table
df_y_test = pd.DataFrame({'Label': y_test_stack})

# put the sample ids next to the predicted labels and write the submission file
submission_parts = [test['Sample_ID'], df_y_test]
y_submission = pd.concat(submission_parts, axis=1)
y_submission.to_csv('data/submission_stack_maxleafnodes59.csv', index=False)

## Analysing the Results

In [None]:
# validation scores of all fitted classifiers; the three lists share the same order
classifiers = ['logreg', 'knn', 'svm', 'gb', 'stacked']
f1_scores = [f1_logreg, f1_knn, f1_svm, f1_gb, f1_stack_clf]
recall_scores = [recall_logreg, recall_knn, recall_svm, recall_gb, recall_stack_clf]

In [None]:
# bar charts of the f1-score (left) and the recall-score (right) per classifier
fig, axs = plt.subplots(1, 2, figsize=(6, 4))
panels = zip(axs, (f1_scores, recall_scores), ('f1 score', 'recall score'))
for ax, scores, title in panels:
    ax.bar(classifiers, scores)
    ax.set(title=title, ylim=[0, 1])

### SVM Classifier

In [None]:
def add_result_column(val_df, y_pred, classifier):
    """
    add a column with the confusion-matrix category of every prediction
    the column 'Result_<classifier>' is added to val_df in place and the
    same DataFrame is returned
    categories: 'TN', 'FN', 'FP', 'TP'; rows where the label or the
    prediction is neither 0 nor 1 get the string '0'
    Input:
     val_df: DataFrame with the actual values in the column 'Label'
     y_pred: predicted values, in the same row order as val_df
     classifier: name of the classifier, used as suffix of the new column
    """
    actual = val_df['Label']
    # align the predictions with val_df by position
    predicted = pd.Series(np.asarray(y_pred), index=val_df.index)

    conditions = [
        actual.eq(0) & predicted.eq(0),
        actual.eq(1) & predicted.eq(0),
        actual.eq(0) & predicted.eq(1),
        actual.eq(1) & predicted.eq(1),
    ]
    choices = ['TN', 'FN', 'FP', 'TP']

    # the default has to be a string like the choices: recent NumPy versions
    # refuse to combine string choices with an integer default
    val_df['Result_' + classifier] = np.select(conditions, choices, default='0')
    return val_df

In [None]:
# validation features (already scaled) as a table that shares the row index
# of y_val, with the actual labels appended as last column
X_val_df = pd.DataFrame(X_val, columns=selected_cols, index=y_val.index)
val_df = pd.concat([X_val_df, y_val], axis=1)

In [None]:
# add the column 'Result_SVM' with the category of every SVM validation prediction
results_df = add_result_column(val_df, y_pred_svm, 'SVM')

In [None]:
# add the column 'Result_Logreg' with the category of every logistic regression validation prediction
results_df = add_result_column(results_df, y_pred_logreg, 'Logreg')

In [None]:
# show the validation table including the result columns
results_df

In [None]:
# Histograms of the '13_*' feature columns, split by the result category of
# the classifier. Every feature gets its own panel: sin and cos of the aspect
# are no longer drawn on top of each other in one axis, where they could not
# be told apart.
present_df = results_df
classifier = 'SVM'
hue_col = "Result_" + classifier

hist_features = ["13_elevation", "13_slope", "13_aspect_sin", "13_aspect_cos",
                 "13_placurv", "13_procurv", "13_lsfactor", "13_twi_log",
                 "13_geology", "13_sdoif"]
# geology is drawn with dodged bars, all other features use the default layout
extra_kwargs = {"13_geology": {"multiple": "dodge"}}

# 4 x 3 grid: ten panels are used, the two spare panels are hidden
fig, axs = plt.subplots(4, 3, figsize=(13, 13))
panels = axs.ravel()
for idx, (ax, feature) in enumerate(zip(panels, hist_features)):
    sns.histplot(present_df, x=feature, hue=hue_col, ax=ax,
                 element="step", fill=False,
                 legend=(idx == 0),  # show the legend only once, on the first panel
                 **extra_kwargs.get(feature, {}))
for ax in panels[len(hist_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Histograms of the '13_*' feature columns, split by the result category of
# the classifier. Every feature gets its own panel: sin and cos of the aspect
# are no longer drawn on top of each other in one axis, where they could not
# be told apart.
present_df = results_df
classifier = 'Logreg'
hue_col = "Result_" + classifier

hist_features = ["13_elevation", "13_slope", "13_aspect_sin", "13_aspect_cos",
                 "13_placurv", "13_procurv", "13_lsfactor", "13_twi_log",
                 "13_geology", "13_sdoif"]
# geology is drawn with dodged bars, all other features use the default layout
extra_kwargs = {"13_geology": {"multiple": "dodge"}}

# 4 x 3 grid: ten panels are used, the two spare panels are hidden
fig, axs = plt.subplots(4, 3, figsize=(13, 13))
panels = axs.ravel()
for idx, (ax, feature) in enumerate(zip(panels, hist_features)):
    sns.histplot(present_df, x=feature, hue=hue_col, ax=ax,
                 element="step", fill=False,
                 legend=(idx == 0),  # show the legend only once, on the first panel
                 **extra_kwargs.get(feature, {}))
for ax in panels[len(hist_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Box plots of the '13_*' feature columns per result category:
# left column of panels = SVM, right column = logistic regression
plot_order = ["TN", "TP", "FP", "FN"]
box_features = ["13_elevation", "13_slope", "13_aspect_sin", "13_aspect_cos",
                "13_placurv", "13_procurv", "13_lsfactor", "13_twi_log",
                "13_sdoif"]
result_cols = ["Result_SVM", "Result_Logreg"]

fig, axs = plt.subplots(len(box_features), len(result_cols), figsize=(10, 15))
for col_idx, result_col in enumerate(result_cols):
    for row_idx, feature in enumerate(box_features):
        sns.boxplot(data=present_df, x=feature, y=result_col,
                    order=plot_order, ax=axs[row_idx, col_idx])

fig.tight_layout()
plt.show()

In [None]:
# marker per result category for the scatter plots: correct predictions (TN, TP)
# share the point marker, FP and FN get triangles pointing in opposite directions
markers = {'TN': '.', 'TP': '.', 'FP': '<', 'FN': '>'}

In [None]:
# elevation vs. slope ('13_*' columns), coloured and shaped by the SVM result category
# results_df and its column 'Result_SVM' are used here: the frame 'results_svm_full'
# that was referenced before is not defined in the visible part of this notebook
sns.scatterplot(x='13_elevation', y='13_slope', data=results_df, hue='Result_SVM',
                style='Result_SVM', markers=markers);

In [None]:
# log(twi) vs. lsfactor ('13_*' columns), coloured and shaped by the SVM result category
# results_df and its column 'Result_SVM' are used here: the frame 'results_svm_full'
# that was referenced before is not defined in the visible part of this notebook
sns.scatterplot(x='13_twi_log', y='13_lsfactor', data=results_df, hue='Result_SVM',
                style='Result_SVM', markers=markers);

In [None]:
# placurv vs. sdoif ('13_*' columns), coloured and shaped by the SVM result category
# results_df and its column 'Result_SVM' are used here: the frame 'results_svm_full'
# that was referenced before is not defined in the visible part of this notebook
sns.scatterplot(x='13_placurv', y='13_sdoif', data=results_df, hue='Result_SVM',
                style='Result_SVM', markers=markers);