In [1]:
import pandas as pd
import pickle
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Preprocessing objects restored from disk; the cells below only call
# .transform() on them, so they are presumably already fitted elsewhere.
# NOTE(review): unpickling can run arbitrary code - only load trusted files.
tfidf = _load_pickle('tfidf_vectorizer.pkl')
encoder = _load_pickle('onehot_encoder.pkl')
scaler = _load_pickle('scaler.pkl')

In [4]:
# Read the modelling table from CSV; later cells pull features and the
# 'label' column out of `df`.
DATASET_CSV = 'cleanformodels.csv'
df = pd.read_csv(DATASET_CSV)

In [5]:
# Column groups fed to the one-hot encoder and the scaler respectively.
cat_features = ['sender_username', 'sender_domain']
num_features = ['urls', 'hour', 'day_of_week']

# Text column -> TF-IDF features (transform only; the vectorizer is not refit).
text_column = df['text']
X_text = tfidf.transform(text_column)

# Sender columns -> encoded categorical features.
categorical_frame = df[cat_features]
X_cat = encoder.transform(categorical_frame)

# urls / hour / day_of_week -> scaled numeric features.
numeric_frame = df[num_features]
X_num = scaler.transform(numeric_frame)

In [6]:
# Target column for the classifier.
y = df['label']

# Join the text, categorical and numeric feature groups column-wise
# (scipy.sparse.hstack) into a single design matrix.
feature_blocks = [X_text, X_cat, X_num]
X = hstack(feature_blocks)

In [7]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Train a multi-layer perceptron with two hidden layers (100 then 50 units),
# capped at 300 iterations, with random_state pinned.
# fit() returns the estimator itself, so `mlp` is the trained model.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=300,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split: per-class precision/recall/F1 first ...
y_pred = mlp.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# ... then raw counts, with rows/columns ordered as in mlp.classes_.
# The report prints two decimals, so small error counts only show up here.
cm = confusion_matrix(y_test, y_pred, labels=mlp.classes_)
print(cm)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3489
           1       1.00      1.00      1.00      4144

    accuracy                           1.00      7633
   macro avg       1.00      1.00      1.00      7633
weighted avg       1.00      1.00      1.00      7633

[[3479   10]
 [   3 4141]]


In [9]:
# Persist the trained classifier to disk with pickle so it can be reloaded
# later without retraining.
with open('mlp_model.pkl', 'wb') as model_file:
    pickle.dump(mlp, model_file)