### Imports

In [19]:
import optuna
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier,StackingClassifier,BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, RobustScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, make_scorer,mean_squared_error
from catboost import CatBoostClassifier, Pool, cv
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')



### Read file

In [20]:
# Load the e-mail word-count table. read_csv already returns a DataFrame,
# so it does not need to be re-wrapped in pd.DataFrame before use.
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


### Data Cleaning

In [21]:
# Drop the 'Email No.' column (values such as 'Email 1', 'Email 2', ... in the
# preview above), which is an identifier rather than a numeric feature.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns=['Email No.'], errors='ignore')

In [22]:
# Count NaN cells across the whole frame in a single reduction and report it.
missing_values = df.isna().to_numpy().sum()
print(f'Total missing values: {missing_values}')

Total missing values: 0


In [None]:
# Standardize the feature columns (mean=0, std=1) into a separate frame.
#
# Fixes relative to `df = scaler.fit_transform(df)`:
#   * fit_transform returns a NumPy array; rebinding `df` to it would make the
#     later `df.drop(columns=['Prediction'])` fail, so `df` is left untouched.
#   * the 'Prediction' label must not be standardized, so only the feature
#     columns are passed to the scaler and the label is copied over unchanged.
#   * the raw counts stay available in `df` for models that require
#     non-negative inputs (e.g. MultinomialNB).
#
# NOTE(review): the scaler is fitted on all rows here. For an unbiased test
# score, fit it on the training split only before using scaled features.
feature_columns = df.columns.drop('Prediction')
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)
df_scaled['Prediction'] = df['Prediction']


In [23]:
# Separate the target column from the feature columns.
y = df['Prediction']
X = df.drop(columns=['Prediction'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Parameter Optimization

In [None]:
# Optuna objective for tuning logistic regression
def objective(trial):
    """Return the mean 3-fold cross-validation score for one trial's settings.

    Samples ``C`` (log scale), ``max_iter`` and ``solver`` from ``trial``,
    builds a ``LogisticRegression`` with them, and scores it with
    ``cross_val_score`` on ``X_train`` / ``y_train``, which are read from the
    notebook's global scope.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean of the three per-fold scores (higher is better).
    """
    # suggest_float(..., log=True) is the replacement for the deprecated
    # suggest_loguniform; the sampled range is unchanged.
    C = trial.suggest_float('C', 1e-10, 1e10, log=True)
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    # A fixed seed makes the data-shuffling solvers (liblinear, sag, saga)
    # return the same score for the same hyperparameters on every run.
    clf = LogisticRegression(C=C, max_iter=max_iter, solver=solver, random_state=42)
    return cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3).mean()

In [None]:
# Run the hyperparameter search (100 trials, maximizing the objective).
# The sampler is seeded so the sequence of suggested hyperparameters, and
# therefore the reported best trial, is reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

### Models

In [24]:
# Random forest classifier: 100 trees, seeded for repeatable results.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Decision tree classifier.
# random_state is fixed because the tree builder permutes features at each
# split; without a seed, ties between equally good splits can be broken
# differently from run to run, giving a different tree and predictions.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [None]:
# Logistic regression with the iteration cap raised to 1000; fit() returns the
# estimator itself, so `model` is bound to the trained classifier.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

### Results Evaluation

In [25]:
# Predict on the held-out split with the fitted `model`.
y_pred = model.predict(X_test)

# Apply each metric to the same (truth, prediction) pair, in display order.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Last expression: shown as the cell's output tuple.
accuracy, precision, recall, f1

(0.9719806763285024,
 0.9435215946843853,
 0.9594594594594594,
 0.9514237855946398)

### Run multiple models in one go

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Candidate classifiers keyed by the label used when reporting results.
# Built from (label, estimator) pairs; insertion order is the evaluation order.
models = dict([
    ("Naive Bayes", MultinomialNB()),
    ("SVM", SVC(kernel='linear', random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("XGBoost", xgb.XGBClassifier(random_state=42)),
    ("k-NN", KNeighborsClassifier(n_neighbors=5)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
])

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_test, y_test):
    """Score an already-fitted classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature rows to predict on.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed from the predictions.
    """
    predictions = model.predict(X_test)
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(metric(y_test, predictions) for metric in metric_functions)

# Fit every candidate on the training split and print its held-out metrics.
# The loop uses its own names (`clf`, `clf_*`) so it does not rebind the global
# `model` (the fitted LogisticRegression) or the `accuracy` / `precision` /
# `recall` / `f1` values computed in the Results Evaluation cell; otherwise,
# re-running that cell after this one would silently score the last model here.
for name, clf in models.items():
    clf.fit(X_train, y_train)
    clf_accuracy, clf_precision, clf_recall, clf_f1 = evaluate_model(clf, X_test, y_test)
    print(f"{name} - Accuracy: {clf_accuracy}, Precision: {clf_precision}, Recall: {clf_recall}, F1-Score: {clf_f1}")


Naive Bayes - Accuracy: 0.9545893719806763, Precision: 0.890282131661442, Recall: 0.9594594594594594, F1-Score: 0.9235772357723576
SVM - Accuracy: 0.9594202898550724, Precision: 0.9205298013245033, Recall: 0.9391891891891891, F1-Score: 0.9297658862876255
Random Forest - Accuracy: 0.9777777777777777, Precision: 0.9595959595959596, Recall: 0.9628378378378378, F1-Score: 0.9612141652613827
XGBoost - Accuracy: 0.9797101449275363, Precision: 0.9449838187702265, Recall: 0.9864864864864865, F1-Score: 0.9652892561983472
k-NN - Accuracy: 0.8628019323671497, Precision: 0.7251461988304093, Recall: 0.8378378378378378, F1-Score: 0.7774294670846394
MLP - Accuracy: 0.9797101449275363, Precision: 0.9508196721311475, Recall: 0.9797297297297297, F1-Score: 0.9650582362728785
