In [1]:
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
# Location of the labelled message CSV. Set the PHISHING_DATA_CSV environment
# variable to point at a different copy; when it is unset, the path the notebook
# was originally authored with is used, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    'PHISHING_DATA_CSV',
    '/Users/sachinkarthikeya/Desktop/Projects/ARPA-LLM/Phishing_messages.csv',
)

# Load the whole corpus into a DataFrame; later cells read its TEXT and LABEL columns.
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the load.
dataset.head(n=5)

Unnamed: 0,TEXT,LABEL
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,0
1,What's up? Do you want me to come online? If y...,0
2,So u workin overtime nigpun?,0
3,"Also sir, i sent you an email about how to log...",0
4,Please Stay At Home. To encourage the notion o...,2


In [4]:
# Count missing values per column (isna is the same check as isnull).
dataset.isna().sum()

TEXT     0
LABEL    0
dtype: int64

In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
dataset.shape

(5926, 2)

In [6]:
# Split the frame into the raw message text (features) and its class label (target).
X, y = dataset['TEXT'], dataset['LABEL']

In [7]:
# Show the feature Series X (the TEXT column) as plain text.
print(X)

0       Your opinion about me? 1. Over 2. Jada 3. Kusr...
1       What's up? Do you want me to come online? If y...
2                            So u workin overtime nigpun?
3       Also sir, i sent you an email about how to log...
4       Please Stay At Home. To encourage the notion o...
                              ...                        
5921                             :( but your not here....
5922    Becoz its  &lt;#&gt;  jan whn al the post ofic...
5923    Its a valentine game. . . send dis msg to all ...
5924                                We r outside already.
5925    The Xmas story is peace.. The Xmas msg is love...
Name: TEXT, Length: 5926, dtype: object


In [8]:
# Show the target Series y (the LABEL column) as plain text.
print(y)

0       0
1       0
2       0
3       0
4       2
       ..
5921    0
5922    0
5923    0
5924    0
5925    0
Name: LABEL, Length: 5926, dtype: int64


In [9]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# TF-IDF over unigrams and bigrams, keeping at most 5000 features.
feature_extraction = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit the vectorizer on the training split only, then reuse it unchanged
# on the held-out split.
X_train_transformed = feature_extraction.fit_transform(X_train)
X_test_transformed = feature_extraction.transform(X_test)

# Cast both label vectors to an integer dtype.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [11]:
# Linear-kernel support vector classifier with a one-vs-rest shaped decision function.
model = SVC(
    decision_function_shape='ovr',
    kernel='linear',
)

# Train on the TF-IDF features of the training split.
model.fit(X_train_transformed, y_train)

In [12]:
# Accuracy on the data the model was fitted on (an optimistic upper bound).
training_prediction = model.predict(X_train_transformed)
training_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=training_prediction,
)
print(f'Training Accuracy: {training_accuracy}')

Training Accuracy: 0.9848101265822785


In [13]:
# Accuracy on the held-out split, which the model never saw during fitting.
testing_prediction = model.predict(X_test_transformed)
testing_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=testing_prediction,
)
print(f'Testing Accuracy: {testing_accuracy}')

Testing Accuracy: 0.954468802698145


In [14]:
# Sample message used to smoke-test the fitted vectorizer and classifier.
# The percent sign is written plainly: a backslash before "%" is not a valid
# Python escape sequence, so it triggers a warning and leaves a stray backslash
# in the text.
input_message = "New Offer! Save upto 40% electricity bill with Power Saver(GOVT. LAB TESTED), Rs. 1050/-(free home delivery) 3 Yr. Guarantee Call 9891943823,9891943780"

# The vectorizer expects an iterable of documents, hence the one-element list.
input_transformed = feature_extraction.transform([input_message])

# predict returns one label per input row; only the first (and only) entry is used below.
prediction = model.predict(input_transformed)

print(prediction)

# Map the predicted label to a message: 0 -> safe, 1 -> spam, anything else -> phishing.
if prediction[0] == 0:
    print("This is a safe message")

elif prediction[0] == 1:
    print("This is a spam message")

else:
    print("This is a phishing message")

[1]
This is a spam message


In [15]:
# Persist the trained classifier and the fitted vectorizer side by side;
# both files are needed together to score new messages.
joblib.dump(value=model, filename='phishing_message_model.pkl')
joblib.dump(value=feature_extraction, filename='message_tfidf_vectorizer.pkl')

['message_tfidf_vectorizer.pkl']