In [1]:
import pandas as pd

In [2]:
# Load the pre-cleaned dataset; later cells read the columns
# 'text', 'sender_username', 'sender_domain', 'urls', 'hour',
# 'day_of_week' and 'label' from this frame.
DATA_PATH = 'cleanformodels.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)  # You can tune this
X_text = tfidf.fit_transform(df['text'])

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical = df[['sender_username', 'sender_domain']]
encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
X_cat = encoder.fit_transform(categorical)

In [5]:
numerical = df[['urls', 'hour', 'day_of_week']]
scaler = StandardScaler()
X_num = scaler.fit_transform(numerical)

In [6]:
from scipy.sparse import hstack

X = hstack([X_text, X_cat, X_num])
y = df['label']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the training split.
svm_params = {'kernel': 'linear', 'C': 1}  # You can try 'rbf' kernel too
svm_model = SVC(**svm_params)
svm_model.fit(X_train, y_train)

In [9]:
from sklearn.metrics import classification_report

# Predict on the held-out split and summarise per-class
# precision / recall / F1. y_pred is reused by the confusion-matrix cell.
y_pred = svm_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3489
           1       1.00      1.00      1.00      4144

    accuracy                           1.00      7633
   macro avg       1.00      1.00      1.00      7633
weighted avg       1.00      1.00      1.00      7633



In [10]:
from sklearn.metrics import confusion_matrix

# Rows are true labels and columns are predicted labels, both ordered
# as in svm_model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=svm_model.classes_)
print(cm)

[[3479   10]
 [   1 4143]]


In [11]:
import pickle

# Persist the fitted model together with every fitted preprocessing step;
# all four objects are needed to score a new email the same way.
artifacts = {
    'svm_model.pkl': svm_model,            # SVM model
    'tfidf_vectorizer.pkl': tfidf,         # TF-IDF vectorizer
    'onehot_encoder.pkl': encoder,         # OneHotEncoder
    'scaler.pkl': scaler,                  # StandardScaler
}

for filename, fitted_obj in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(fitted_obj, f)

### Testing: sanity-check the trained pipeline on a hand-written sample email

In [14]:
# 1. Build a one-row frame with the same feature columns used for training
sample_data = pd.DataFrame([{
    'sender_username': 'support',
    'sender_domain': 'suspicious.com',
    'day_of_week': 1,
    'urls': 1,
    'hour': 14,
    'text': "Please click this link to verify your account."
}])

# 2. Preprocess features with the already-fitted transformers (transform
#    only, never fit). Distinct sample_* names are used so the matrices
#    built for training are not overwritten and earlier cells can be
#    re-run safely after this one.
sample_text = tfidf.transform(sample_data['text'])

sample_cat = encoder.transform(sample_data[['sender_username', 'sender_domain']])

sample_num = scaler.transform(sample_data[['urls', 'hour', 'day_of_week']])

# 3. Combine features in the same order as at training time
#    (text, categorical, numerical)
X_final = hstack([sample_text, sample_cat, sample_num])

# 4. Predict (label 1 is reported as phishing, anything else as legit)
prediction = svm_model.predict(X_final)
label = 'Phishing' if prediction[0] == 1 else 'Legit'

print(f"Predicted label: {label}")


Predicted label: Phishing
