## Useful libraries

In [1]:
import os
import re

import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

### Loading the data 

In [2]:
# Placeholder strings that pandas should read as NaN, in addition to its
# built-in missing-value markers.
missing_values = ["n/a", "na", "--" , "empty" , "?" , "??" , "---" , "-", "__" , "___" , "___ ___"]

# Location of completeSpamAssassin.csv. Set the SPAM_ASSASSIN_CSV environment
# variable to run the notebook on another machine; the default is the path the
# notebook was originally written against.
DATA_PATH = os.environ.get(
    "SPAM_ASSASSIN_CSV",
    r"C:\Users\ROHAN SHARMA\Downloads\archive (1)\completeSpamAssassin.csv",
)
df = pd.read_csv(DATA_PATH, na_values=missing_values)


In [3]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(6046, 3)

In [5]:
class primitive_analysis:
    """Print quick structural summaries of a pandas DataFrame.

    Constructing an instance prints a preview (first and last five rows).
    Each method prints one further property of the frame passed to it.
    """

    def __init__(self, frame=None):
        """Print the head and tail of the dataset.

        Args:
            frame: DataFrame to preview. When omitted, the notebook-level
                ``df`` is used, so the original zero-argument call still works.
        """
        data = df if frame is None else frame
        print("--- dataset overview:--- \n")
        print(data.head(5))
        print(data.tail(5))

    def info(self, df):
        """Print the column/dtype/non-null report for ``df``."""
        print("\n--- dataset info :---\n")
        # DataFrame.info() writes the report itself and returns None, so
        # wrapping it in print() would append a stray "None" line.
        df.info()

    def dimensions(self, df):
        """Print the number of axes of ``df`` (``df.ndim``)."""
        print("\n---no. of dimensions = ", df.ndim)

    def size(self, df):
        """Print the total number of cells in ``df`` (``df.size``)."""
        print("\n---size of dataset = ", df.size)

    def axes(self, df):
        """Print the row and column axis labels of ``df`` (``df.axes``)."""
        print("\n---list of the labels  ", df.axes)

    def empty(self, df):
        """Print whether ``df`` is empty (``df.empty``)."""
        print("is dataset empty? ", df.empty)

# Building the object prints the head/tail preview; each report below then
# prints one more property of the frame, in the listed order.
obj_primitive = primitive_analysis()
for report in (
    obj_primitive.info,
    obj_primitive.dimensions,
    obj_primitive.axes,
    obj_primitive.empty,
):
    report(df)

--- dataset overview:--- 

   Unnamed: 0                                               Body  Label
0           0  \nSave up to 70% on Life Insurance.\nWhy Spend...      1
1           1  1) Fight The Risk of Cancer!\nhttp://www.adcli...      1
2           2  1) Fight The Risk of Cancer!\nhttp://www.adcli...      1
3           3  ##############################################...      1
4           4  I thought you might like these:\n1) Slim Down ...      1
      Unnamed: 0                                               Body  Label
6033        6033  ----------------------------------------------...      0
6034        6034  EFFector       Vol. 15, No. 35       November ...      0
6039        6039  \nWe have extended our Free seat sale until Th...      0
6042        6042                    ___           ___           ...      0
6043        6043  IN THIS ISSUE:01. Readers write\n02. Extension...      0

--- dataset info :---

<class 'pandas.core.frame.DataFrame'>
Index: 5512 entries, 0 to 604

In [8]:
# Keep only rows that have both a message body and a label. The load step
# turns placeholder strings into NaN, and the text vectorizers reject NaN
# documents, so this guard lets the notebook run from a fresh kernel.
# It is a no-op when the frame has already been cleaned.
labelled = df.dropna(subset=["Body", "Label"])
x = labelled["Body"]
y = labelled["Label"]

In [9]:
# Hold out 20% of the messages for evaluation; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)
print(x.shape, x_train.shape, x_test.shape)

(5512,) (4409,) (1103,)


In [13]:
# The TF-IDF features are built further down, after URLs have been stripped
# from the text. Fitting a vectorizer here as well only produced matrices that
# were overwritten before anything used them, so this cell just normalises
# the labels.
y_train = y_train.astype("int")
y_test = y_test.astype("int")

In [21]:
# Matches "http://" or "https://" followed by any run of non-whitespace.
# Compiled once, and case-insensitive so links written as "HTTP://..." are
# removed too.
_URL_PATTERN = re.compile(r'https?://\S*', re.IGNORECASE)


def remove_urls(text):
    """Return ``text`` with every http/https URL removed.

    Args:
        text: The message body. ``None`` and float NaN (how pandas represents
            a missing body) are treated as an empty message.

    Returns:
        The text with each URL replaced by the empty string. Surrounding
        whitespace is left untouched.

    Raises:
        TypeError: If ``text`` is any other non-string value.
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return _URL_PATTERN.sub('', text)

In [22]:
# Strip URLs from every message in both splits before vectorizing.
x_train, x_test = (
    messages.apply(remove_urls) for messages in (x_train, x_test)
)

In [23]:
# TF-IDF over the URL-stripped messages: lower-cased, English stop words
# dropped, and every term that appears in at least one document kept.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are projected onto that fitted vocabulary.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [24]:
# Fit a default logistic-regression classifier on the TF-IDF training
# features (fit() returns the estimator itself).
model = LogisticRegression().fit(x_train_features, y_train)

# Predicted labels for the held-out messages.
y_pred = model.predict(x_test_features)

In [25]:
# F1 averaged over both classes, weighted by each class's support in y_test.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")
print(f"F1 Score: {f1}")

F1 Score: 0.9658928524201731


In [27]:
# Bag-of-Words (BoW) representation
vectorizer_bow = CountVectorizer()
bow_matrix = vectorizer_bow.fit_transform(df['Body'])

# TF-IDF representation
vectorizer_tfidf = TfidfVectorizer()
tfidf_matrix = vectorizer_tfidf.fit_transform(df['Body'])

In [29]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

In [30]:
# Classifiers to compare on the bag-of-words features.
# RandomForestClassifier draws random bootstrap samples and feature subsets,
# so it is seeded to make its reported metrics reproducible across runs.
models = [
    SVC(),
    RandomForestClassifier(random_state=42),
    MultinomialNB()
]

In [31]:
# Split the bag-of-words matrix and its labels 80/20 with a fixed seed.
# This rebinds y_train / y_test from the earlier text-level split.
X_train, X_test, y_train, y_test = train_test_split(
    bow_matrix,
    df["Label"],
    random_state=42,
    test_size=0.2,
)

In [34]:
# Fit each candidate on the bag-of-words training split and report its
# test-set metrics. precision/recall/F1 use scikit-learn's default averaging.
for model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Compute all four scores before printing anything for this model.
    accuracy, precision, recall, f1 = [
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    ]

    model_name = type(model).__name__
    print(
        f"Model: {model_name}",
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F1 Score: {f1}",
        "---",
        sep="\n",
    )

Model: SVC
Accuracy: 0.9256572982774252
Precision: 0.966789667896679
Recall: 0.7820895522388059
F1 Score: 0.8646864686468647
---
Model: RandomForestClassifier
Accuracy: 0.9682683590208522
Precision: 0.9838709677419355
Recall: 0.9104477611940298
F1 Score: 0.9457364341085271
---
Model: MultinomialNB
Accuracy: 0.9891205802357208
Precision: 0.9908814589665653
Recall: 0.9731343283582089
F1 Score: 0.9819277108433735
---
