## Importing the Libraries

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

## Load the dataset

In [19]:
# Load data
df = pd.read_csv('C:/CGI/Project/Email-Validation/uploads/validated_emails.csv')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205315 entries, 0 to 205314
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    205313 non-null  object
 1   label   205309 non-null  object
dtypes: object(2)
memory usage: 3.1+ MB


In [21]:
df.head()

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,spam
1,Subject: the stock trading gunslinger fanny i...,spam
2,Subject: unbelievable new homes made easy im ...,spam
3,Subject: 4 color printing special request add...,spam
4,"Subject: do not have money , get software cds ...",spam


## Remove the NaN values

In [22]:
# Remove NaN values 
df = df.dropna()

## Mapping label to integer values

In [6]:
# Keep only valid labels and map: ham -> 0, spam -> 1
valid_labels = {"ham": 0, "Ham": 0, "spam": 1, "Spam": 1}
df = df[df['label'].isin(valid_labels.keys())]
df['label'] = df['label'].map(valid_labels).astype(int)

In [7]:
# Drop rows where text is NaN or empty
df = df.dropna(subset=['text'])
df = df[df['text'].str.strip() != ""]

In [8]:
print(df)

                                                     text  label
0       Subject: naturally irresistible your corporate...      1
1       Subject: the stock trading gunslinger  fanny i...      1
2       Subject: unbelievable new homes made easy  im ...      1
3       Subject: 4 color printing special  request add...      1
4       Subject: do not have money , get software cds ...      1
...                                                   ...    ...
205310  on escapenumber escapenumber escapenumber rob ...      0
205311  we have everything you need escapelong cialesc...      1
205312  hi quick question say i have a date variable i...      0
205313  thank you for your loan request which we recie...      1
205314  this is an automatically generated delivery st...      0

[205303 rows x 2 columns]


In [9]:
# Check distribution
print("Label distribution:\n", df['label'].value_counts())

Label distribution:
 label
0    110875
1     94428
Name: count, dtype: int64


In [10]:
df.head()

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
X = df['text']
y = df['label']

In [11]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Ensure text column has no NaN or empty values
X_train = X_train.dropna().astype(str)
X_train = X_train[X_train.str.strip() != ""]

X_test = X_test.dropna().astype(str)
X_test = X_test[X_test.str.strip() != ""]

In [16]:
# Vectorize text
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [17]:
# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)
print("Naive Bayes Results:") 
print(classification_report(y_test, y_pred_nb))
print("Accuracy:", accuracy_score(y_test, y_pred_nb))

ValueError: Input contains NaN

In [None]:
# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', max_depth=3, min_child_weight=2, subsample=0.8, reg_alpha=1, reg_lambda=1)
xgb.fit(X_train_vec, y_train)
y_pred_xgb = xgb.predict(X_test_vec)
print("\nXGBoost Results:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))