In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from scipy.stats import randint, uniform

In [3]:
def metrics(y, x):
    print(f'Accuracy: {accuracy_score(y, x)}')
    print(f'Precision: {precision_score(y, x)}')
    print(f'Recall: {recall_score(y, x)}')
    print(f'F1 Score: {f1_score(y, x)}')
    print(f'ROC AUC: {roc_auc_score(y, x)}')

In [4]:
df = pd.read_csv(r"C:\Users\trush\OneDrive\Documents\WFU Grad School Info\BAN 6025 Machine Learning\Sky's the Limit\BAN6025Project2Data.csv")
df.head()

Unnamed: 0,Target_Y,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,...,0,200,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200,0,INTERNET,3.743048,other,0,1,0,0


In [5]:
df = pd.get_dummies(df, columns=['X8', 'X18', 'X15', 'X25', 'X27'], drop_first=True, dtype=float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 48 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   Target_Y       1000000 non-null  int64  
 1   X1             1000000 non-null  float64
 2   X2             1000000 non-null  float64
 3   X3             1000000 non-null  int64  
 4   X4             1000000 non-null  int64  
 5   X5             1000000 non-null  int64  
 6   X6             1000000 non-null  float64
 7   X7             1000000 non-null  float64
 8   X9             1000000 non-null  int64  
 9   X10            1000000 non-null  float64
 10  X11            1000000 non-null  float64
 11  X12            1000000 non-null  float64
 12  X13            1000000 non-null  int64  
 13  X14            1000000 non-null  int64  
 14  X16            1000000 non-null  int64  
 15  X17            1000000 non-null  int64  
 16  X19            1000000 non-null  int64  
 17  X20      

In [6]:
sample_size = 0.05

sample_df = df.groupby('Target_Y', group_keys=False).apply(lambda x: x.sample(frac=sample_size, random_state=69))

sample_df.head()

  sample_df = df.groupby('Target_Y', group_keys=False).apply(lambda x: x.sample(frac=sample_size, random_state=69))


Unnamed: 0,Target_Y,X1,X2,X3,X4,X5,X6,X7,X9,X10,...,X15_CC,X15_CD,X15_CE,X15_CF,X15_CG,X25_TELEAPP,X27_macintosh,X27_other,X27_windows,X27_x11
434757,0,0.1,0.305352,-1,116,40,0.019318,-1.187689,3447,11124.876879,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
334757,0,0.2,0.998933,29,5,20,17.990344,-1.101194,4507,11228.317603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
895206,0,0.6,0.249452,31,16,40,0.010178,9.409894,1291,5087.989767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
723405,0,0.3,0.14254,-1,118,40,0.00796,49.521697,2353,4698.312868,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
133474,0,0.4,0.397762,-1,75,40,0.008105,27.252734,846,10738.552406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
df['Target_Y'].value_counts()

Target_Y
0    988971
1     11029
Name: count, dtype: int64

In [8]:
X = sample_df.drop(columns=['Target_Y'])
y = sample_df['Target_Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=69)

smote = SMOTE(sampling_strategy='auto', random_state=69)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [41]:
rf_tune = RandomForestClassifier()

param_dist = {
    'n_estimators': randint(1, 500),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': randint(1, 30), 
    'min_samples_leaf': randint(1, 100)
}

random_search = RandomizedSearchCV(rf_tune, param_dist, n_iter=50, cv=5, scoring='f1', random_state=69, n_jobs=-1)

random_search.fit(X_train_resampled, y_train_resampled)

best_parameters = random_search.best_params_

In [42]:
best_rf = RandomForestClassifier(**best_parameters)
best_rf.fit(X_train_resampled, y_train_resampled)

In [58]:
y_pred_proba = best_rf.predict_proba(X_test)
y_pred_test = (y_pred_proba[:, 1] > 0.09).astype(int)

In [59]:
y_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [60]:
print('---Test---')
metrics(y_test, y_pred_test)

---Test---
Accuracy: 0.8673333333333333
Precision: 0.04534130543099153
Recall: 0.5515151515151515
F1 Score: 0.08379373848987108
ROC AUC: 0.7111805619389037
