In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report,confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Preprocessing

* log transformation of the `twi` feature columns
* selection of the relevant feature columns
* standard scaling (fitted on the training split only)

In [None]:
# Read the training and test CSV files from the local data/ folder.
train, test = (pd.read_csv(path) for path in ('data/Train.csv', 'data/Test.csv'))

In [None]:
# log-transform of twi columns
def log_transform_twi(df):
    """Add a natural-log copy of every 'twi' column to ``df`` in place.

    For each column whose name contains ``'twi'`` a new column named
    ``<name>_log`` holding ``np.log`` of the original values is written
    to ``df``. Columns that already end in ``'_log'`` are skipped, so
    calling the function again (e.g. re-running the notebook cell) only
    refreshes the existing ``*_log`` columns instead of stacking
    ``*_log_log`` columns on top of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.

    Returns
    -------
    None
    """
    # Snapshot the source columns first: new columns are added while looping.
    twi_cols = [
        col for col in df.columns
        if isinstance(col, str) and 'twi' in col and not col.endswith('_log')
    ]
    for col in twi_cols:
        df[col + '_log'] = np.log(df[col])

In [None]:
# Add the '<col>_log' columns to both frames; log_transform_twi modifies its argument in place.
for frame in (train, test):
    log_transform_twi(frame)

In [None]:
# Keep every training column whose name contains one of the feature keywords.
# 'twi_log' picks up only the log-transformed twi columns, not the raw ones.
selected_features = ['elevation', 'slope', 'aspect', 'placurv', 'procurv', 'twi_log', 'geology', 'sdoif']

# `any` lists a column once even if its name matches several keywords,
# so the feature matrix never ends up with duplicated columns.
selected_cols = [
    col for col in train.columns
    if any(feature in col for feature in selected_features)
]

In [None]:
# Feature matrix (selected columns) and target (the 'Label' column)
X, y = train[selected_cols], train['Label']

# Hold out 30% of the labelled data as a validation split, stratified on the label
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=2022, stratify=y
)

In [None]:
# Learn mean/std on the training split only, then apply the same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_val = sc.transform(X_train), sc.transform(X_val)

In [None]:
# Same columns and the already-fitted scaler as for the training data.
X_test = sc.transform(test[selected_cols])

## Logistic regression

In [None]:
# Fit a logistic regression on the scaled training split
logreg = LogisticRegression(random_state=2022, max_iter=1000)
logreg.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_logreg = logreg.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'Logistic Regression F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_logreg)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_logreg))

In [None]:
# Confusion matrix of the logistic regression on the validation split
cm_logreg = confusion_matrix(y_val, y_pred_logreg, labels=logreg.classes_)
disp_logreg = ConfusionMatrixDisplay(confusion_matrix=cm_logreg, display_labels=logreg.classes_)
fig, ax = plt.subplots(figsize=(3, 2))
disp_logreg.plot(ax=ax)
# Title the figure so it can be told apart from the other models' matrices
ax.set_title('Logistic Regression')
plt.show()

In [None]:
# ROC curve of the logistic regression on the validation split.
y_pred_proba_logreg = logreg.predict_proba(X_val)
y_val_array = y_val.to_numpy()

# Column 1 holds the probability of logreg.classes_[1]
# (assumed to be the positive class — confirm against the label encoding).
pos_scores = y_pred_proba_logreg[:, 1]
fpr, tpr, _ = roc_curve(y_val_array, pos_scores)
auc = roc_auc_score(y_val_array, pos_scores)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'Logistic Regression (AUC = {auc:.3f})')
# Diagonal = performance of a random classifier, for reference
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
ax.set(
    xlabel='False positive rate',
    ylabel='True positive rate',
    title='ROC curve on the validation set',
)
ax.legend(loc=4)
plt.show()

In [None]:
# Predict the test set with the logistic regression
y_test_logreg = logreg.predict(X_test)

# Show how many samples fall into each predicted class instead of dumping the whole array
pd.Series(y_test_logreg, name='Label').value_counts()

## K-Nearest-Neighbours

In [None]:
# Fit a distance-weighted 15-NN classifier (p=1, i.e. Manhattan distance) on the scaled training split
knn = KNeighborsClassifier(n_neighbors=15, weights='distance', p=1)
knn.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_knn = knn.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'K-Nearest-Neighbor F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_knn)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_knn))

In [None]:
# Confusion matrix of the KNN classifier on the validation split
cm_knn = confusion_matrix(y_val, y_pred_knn, labels=knn.classes_)
disp_knn = ConfusionMatrixDisplay(confusion_matrix=cm_knn, display_labels=knn.classes_)
fig, ax = plt.subplots(figsize=(3, 2))
disp_knn.plot(ax=ax)
# Title the figure so it can be told apart from the other models' matrices
ax.set_title('K-Nearest-Neighbours')
plt.show()

In [None]:
# Predict the test set with the KNN classifier
y_test_knn = knn.predict(X_test)

# Show how many samples fall into each predicted class instead of dumping the whole array
pd.Series(y_test_knn, name='Label').value_counts()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive cross-validated search over the KNN hyper-parameters.
parameters = {'n_neighbors': [4, 6, 8, 12, 15, 20], 'p': [1, 2], 'weights': ('uniform', 'distance')}
knn_clf = KNeighborsClassifier()

# scoring='f1' makes the search optimise the metric the models are compared on
# in this notebook; without it GridSearchCV falls back to the estimator's accuracy.
clf = GridSearchCV(knn_clf, parameters, scoring='f1')
clf.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the KNN grid search
# (`clf` is rebound to the SVC search further down the notebook).
clf.best_params_

In [None]:
# Show the search results as a table, best-ranked parameter sets first,
# instead of dumping the raw dict of arrays.
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score').head(10)

## Support Vector Machine

In [None]:
# Fit a support vector classifier with balanced class weights on the scaled training split
svm = SVC(class_weight='balanced', random_state=2022)
svm.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_svm = svm.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'Support Vector Machine F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_svm)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_svm))

In [None]:
# Confusion matrix of the support vector machine on the validation split
cm_svm = confusion_matrix(y_val, y_pred_svm, labels=svm.classes_)
disp_svm = ConfusionMatrixDisplay(confusion_matrix=cm_svm, display_labels=svm.classes_)
fig, ax = plt.subplots(figsize=(3, 2))
disp_svm.plot(ax=ax)
# Title the figure so it can be told apart from the other models' matrices
ax.set_title('Support Vector Machine')
plt.show()
# NOTE(review): the two numbers below are unexplained leftovers from the original cell —
# confirm what they refer to or remove them.
#155
#453

In [None]:
# Exhaustive cross-validated search over kernel, regularisation strength and class weighting.
parameters = {'kernel': ('poly', 'rbf', 'sigmoid'), 'C': [0.1, 0.5, 1, 2], 'class_weight': (None, 'balanced')}
SVC_clf = SVC()

# scoring='f1' makes the search optimise the metric the models are compared on
# in this notebook; without it GridSearchCV falls back to the estimator's accuracy.
clf = GridSearchCV(SVC_clf, parameters, scoring='f1')
clf.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the SVC grid search
# (`clf` was rebound from the earlier KNN search to this one).
clf.best_params_

## Decision Tree

In [None]:
# Fit an unconstrained decision tree on the scaled training split.
# random_state is fixed (as for the other models) so re-running the cell gives the same tree.
dtree = DecisionTreeClassifier(random_state=2022)
dtree.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_dtree = dtree.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'Decision Tree F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_dtree)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_dtree))

In [None]:
# Confusion matrix of the decision tree on the validation split
cm_dtree = confusion_matrix(y_val, y_pred_dtree, labels=dtree.classes_)
disp_dtree = ConfusionMatrixDisplay(confusion_matrix=cm_dtree, display_labels=dtree.classes_)
fig, ax = plt.subplots(figsize=(3, 2))
disp_dtree.plot(ax=ax)
# Title the figure so it can be told apart from the other models' matrices
ax.set_title('Decision Tree')
plt.show()

## Ensembles

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier

In [None]:
# Boost 100 depth-5 decision trees with AdaBoost on the scaled training split.
# Both the base tree and the booster are seeded (as for the other models)
# so re-running the cell reproduces the same ensemble.
tree_clf = DecisionTreeClassifier(max_depth=5, random_state=2022)
ada = AdaBoostClassifier(tree_clf, n_estimators=100, random_state=2022)
ada.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_ada = ada.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'AdaBoost F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_ada)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_ada))

In [None]:
# Fit a random forest (200 trees, depth <= 10) on the scaled training split.
# random_state matches the seeded forest used in the ensembles below,
# so the reported score is reproducible across runs.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2022)
rf.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_rf = rf.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'Random Forest F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_rf)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_rf))

In [None]:
# Confusion matrix of the random forest on the validation split
cm_rf = confusion_matrix(y_val, y_pred_rf, labels=rf.classes_)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=rf.classes_)
fig, ax = plt.subplots(figsize=(3, 2))
disp_rf.plot(ax=ax)
# Title the figure so it can be told apart from the other models' matrices
ax.set_title('Random Forest')
plt.show()

In [None]:
# Base learners shared by the stacking and voting ensembles,
# configured like the stand-alone models trained earlier in the notebook.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2022)),
    ('lr', LogisticRegression(random_state=2022, max_iter=1000)),
    ('knn', KNeighborsClassifier(n_neighbors=15, weights='distance', p=1)),
    ('svm', SVC(class_weight='balanced', random_state=2022)),
]

# Stack the base learners; a logistic regression combines their outputs
stack_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stack_clf.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_stack_clf = stack_clf.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'Stacking F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_stack_clf)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_stack_clf))

In [None]:
# Confusion matrix of the stacking ensemble on the validation split
cm_stack_clf = confusion_matrix(y_val, y_pred_stack_clf, labels=stack_clf.classes_)
disp_stack_clf = ConfusionMatrixDisplay(confusion_matrix=cm_stack_clf, display_labels=stack_clf.classes_)
fig, ax = plt.subplots(figsize=(3, 2))
disp_stack_clf.plot(ax=ax)
# Title the figure so it can be told apart from the other models' matrices
ax.set_title('Stacking')
plt.show()

In [None]:
# Combine the base learners in `estimators` with VotingClassifier's default voting scheme
voting_clf = VotingClassifier(estimators=estimators)
voting_clf.fit(X_train, y_train)

# Predict the held-out validation split
y_pred_voting_clf = voting_clf.predict(X_val)

# F1 score on the validation split (this is X_val, not the test set)
print(f'Voting F1 score on the validation set (X_val) is: {f1_score(y_val, y_pred_voting_clf)}\n')

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred_voting_clf))

In [None]:
# Confusion matrix of the voting ensemble on the validation split
cm_voting_clf = confusion_matrix(y_val, y_pred_voting_clf, labels=voting_clf.classes_)
disp_voting_clf = ConfusionMatrixDisplay(confusion_matrix=cm_voting_clf, display_labels=voting_clf.classes_)
fig, ax = plt.subplots(figsize=(3, 2))
disp_voting_clf.plot(ax=ax)
# Title the figure so it can be told apart from the other models' matrices
ax.set_title('Voting')
plt.show()

## Submission file

In [None]:
# Pair each test Sample_ID with the logistic-regression prediction and write the submission CSV.
df_y_test = pd.Series(y_test_logreg, name='Label').to_frame()
submission_parts = [test['Sample_ID'], df_y_test]
y_submission = pd.concat(submission_parts, axis=1)
y_submission.to_csv('data/submission_logreg.csv', index=False)