In [None]:
import time
import warnings
import pickle
import shap

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb # recommended version: 1.5.0

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, RepeatedKFold
from xgboost import plot_importance

pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

In [None]:
# Load the train and test splits from CSV files in the working directory.
# NOTE(review): the two file names follow different patterns
# ('train_dataset.csv' vs 'test_metadata.csv') - confirm both are the intended files.
train_metadata = pd.read_csv('train_dataset.csv')
test_metadata = pd.read_csv('test_metadata.csv')

In [None]:
# Show the training frame as the cell output for a quick visual check.
train_metadata

In [None]:
# Show the test frame as the cell output for a quick visual check.
test_metadata

In [None]:
# Separate the "isBot" label from the predictor columns for each split.
y_train = train_metadata["isBot"]
y_test = test_metadata["isBot"]
x_train = train_metadata.drop(columns=["isBot"])
x_test = test_metadata.drop(columns=["isBot"])

# Models
1. Logistic Regression
2. Support Vector Classifier
3. Random Forest Classifier
4. AdaBoost Classifier
5. Gradient Boosting Classifier

## 01 Logistic Regression
* Model 
* Error Metrics

In [None]:
# Model: logistic-regression estimator created with no arguments,
# i.e. the library's default hyperparameters.
log_regression = LogisticRegression()

In [None]:
%%time

# Fit the estimator on the training split; the %%time cell magic above
# reports how long this cell takes to run.
log_model = log_regression.fit(x_train, y_train)

In [None]:
%%time

# Predict on the test split with the fitted model; the result is reused by
# the metric cells below. %%time reports the prediction time.
y_pred = log_model.predict(x_test)

In [None]:
# Error Metrics
# Accuracy, F1, precision, recall and the classification report are
# threshold-based and are computed from the hard class predictions (y_pred).
# Log loss and ROC AUC are probability/ranking metrics: passing 0/1 labels to
# them yields a clipped worst-case log loss and a single-threshold AUC, so
# they are computed from the model's predicted probabilities instead.
y_proba = log_model.predict_proba(x_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
# Pass the full probability matrix together with the model's class order so
# the columns are matched to the right labels.
lg_loss = metrics.log_loss(y_test, y_proba, labels=log_model.classes_)
print(f'Log Loss: {lg_loss}')
# Column 1 is the probability of log_model.classes_[1], the class with the
# greater label, which is what roc_auc_score expects for binary targets.
roc_auc = metrics.roc_auc_score(y_test, y_proba[:, 1])
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred)
print(f'Classification Report: \n {report}')

In [None]:
# Confusion Matrix Plot
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement. Fall back
# to the old helper only on versions that predate from_estimator.
if hasattr(metrics.ConfusionMatrixDisplay, "from_estimator"):
    metrics.ConfusionMatrixDisplay.from_estimator(log_model, x_test, y_test, cmap="Blues")
else:
    metrics.plot_confusion_matrix(log_model, x_test, y_test, cmap="Blues")
plt.show()

## Feature Importance

In [None]:
# Feature importance
# One row per input column holding the fitted coefficient, sorted from the
# largest to the smallest value. The row labels come from the training frame:
# the fitted LogisticRegression has no `columns` attribute, so reading it from
# the model raises AttributeError.
# Coefficients are signed, so strongly negative features end up at the bottom.
feature = pd.DataFrame(
    log_model.coef_.T,
    index=x_train.columns,
    columns=['Feature Importance'],
).sort_values(by='Feature Importance', ascending=False)
feature

In [None]:
# Horizontal bar chart of the coefficient table. DataFrame.plot hands back
# the Axes it drew on, so the styling is applied to that Axes directly.
ax = feature.plot(kind="barh", figsize=(12, 15))
ax.patch.set_facecolor("white")  # white background behind the bars
ax.invert_yaxis()                # first table row (largest value) on top
ax.grid()
plt.show()