## Importing the Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

## Load the dataset

In [4]:
# Read the validated e-mail CSV from its local path into a DataFrame
emails_csv = 'C:/Users/Pranathi/OneDrive/Desktop/project/uploads/validated_emails.csv'
df = pd.read_csv(emails_csv)

In [5]:
# Summarize the raw frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199585 entries, 0 to 199584
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    199583 non-null  object
 1   label   199580 non-null  object
dtypes: object(2)
memory usage: 3.0+ MB


In [6]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,spam
1,Subject: the stock trading gunslinger fanny i...,spam
2,Subject: unbelievable new homes made easy im ...,spam
3,Subject: 4 color printing special request add...,spam
4,"Subject: do not have money , get software cds ...",spam


## Remove the NaN values

In [7]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

## Mapping label to integer values

In [8]:
# Keep only valid labels and map: ham -> 0, spam -> 1
valid_labels = {"ham": 0, "Ham": 0, "spam": 1, "Spam": 1}

# Take an explicit copy of the filtered rows so the column assignment below
# writes to an independent frame rather than a slice of the previous one
# (assigning on the bare boolean-filtered result triggers pandas'
# SettingWithCopyWarning).
df = df[df['label'].isin(valid_labels.keys())].copy()
df['label'] = df['label'].map(valid_labels).astype(int)

# NOTE: this cell is not idempotent. Once the labels are integers, running it
# again matches none of the string keys above and filters out every row.
# Re-run from the data-loading cell instead.

In [9]:
# Remove rows whose text is missing, then rows whose text is only whitespace
df = (
    df.dropna(subset=['text'])
      .loc[lambda frame: frame['text'].str.strip().ne("")]
)

In [10]:
# Print the plain-text repr of the cleaned frame (pandas elides the middle rows).
print(df)

                                                     text  label
0       Subject: naturally irresistible your corporate...      1
1       Subject: the stock trading gunslinger  fanny i...      1
2       Subject: unbelievable new homes made easy  im ...      1
3       Subject: 4 color printing special  request add...      1
4       Subject: do not have money , get software cds ...      1
...                                                   ...    ...
199580  on escapenumber escapenumber escapenumber rob ...      0
199581  we have everything you need escapelong cialesc...      1
199582  hi quick question say i have a date variable i...      0
199583  thank you for your loan request which we recie...      1
199584  this is an automatically generated delivery st...      0

[199576 rows x 2 columns]


In [11]:
# Count how many rows fall under each encoded label and show the tally
label_counts = df['label'].value_counts()
print("Label distribution:\n", label_counts)

Label distribution:
 label
0    106517
1     93059
Name: count, dtype: int64


In [12]:
# Preview the first rows again, now with the label column mapped to integers.
df.head()

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [13]:
# Features are the raw message text; the target is the integer label
X, y = df['text'], df['label']

In [14]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Ensure text column has no NaN or empty values.
#
# The keep-mask is applied to the labels as well as the text. Filtering only
# X (as this cell did before) leaves y with extra rows whenever a sample is
# dropped, so features and labels no longer line up and the later fit /
# metric calls fail on a length mismatch.
train_keep = X_train.notna() & (X_train.astype(str).str.strip() != "")
X_train = X_train.loc[train_keep].astype(str)
y_train = y_train.loc[train_keep]

test_keep = X_test.notna() & (X_test.astype(str).str.strip() != "")
X_test = X_test.loc[test_keep].astype(str)
y_test = y_test.loc[test_keep]

In [16]:
# Turn the text into TF-IDF features: English stop words are removed and the
# vocabulary is capped at 1000 terms. The vocabulary is learned from the
# training split only and then reused to transform the test split.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [17]:
# Naive Bayes: fit on the TF-IDF training matrix, then score the held-out split
nb = MultinomialNB().fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)

nb_report = classification_report(y_test, y_pred_nb)
nb_accuracy = accuracy_score(y_test, y_pred_nb)

print("Naive Bayes Results:")
print(nb_report)
print("Accuracy:", nb_accuracy)

Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91     21430
           1       0.92      0.88      0.90     18486

    accuracy                           0.91     39916
   macro avg       0.91      0.90      0.91     39916
weighted avg       0.91      0.91      0.91     39916

Accuracy: 0.9065036576811304


In [18]:
# XGBoost
# `use_label_encoder` has been removed from the constructor call: the installed
# XGBoost ignores it and only emits a
# 'Parameters: { "use_label_encoder" } are not used.' warning on fit.
xgb = XGBClassifier(
    eval_metric='logloss',
    max_depth=3,
    min_child_weight=2,
    subsample=0.8,
    reg_alpha=1,
    reg_lambda=1,
)
xgb.fit(X_train_vec, y_train)
y_pred_xgb = xgb.predict(X_test_vec)

# Report per-class precision/recall/F1 and overall accuracy on the test split
print("\nXGBoost Results:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Results:
              precision    recall  f1-score   support

           0       0.97      0.92      0.95     21430
           1       0.91      0.97      0.94     18486

    accuracy                           0.94     39916
   macro avg       0.94      0.95      0.94     39916
weighted avg       0.95      0.94      0.94     39916

Accuracy: 0.9433811003106524
