In [1]:
import pandas as pd        
from xgboost import XGBClassifier    
from sklearn.ensemble import RandomForestClassifier      
from sklearn.model_selection import train_test_split   
from sklearn.feature_extraction.text import TfidfVectorizer  

from sklearn.metrics import accuracy_score, classification_report   

In [2]:
data = pd.read_csv("emails.csv")

In [3]:
data.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


In [4]:
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [6]:
data.shape

(5728, 2)

In [7]:
data=data.drop_duplicates()
data


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [8]:
data.shape

(5695, 2)

In [9]:
data['cleaned_text'] = data['text'].str.lower() \
    .str.replace(r'http\S+|www\S+|\S+\.com', ' <URL> ', regex=True) \
    .str.replace(r'\d+', ' <NUM> ', regex=True) \
    .str.replace(r'[^a-z\s<URL><NUM>$!]', ' ', regex=True) \
    .apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2 or w in ['win','won','claim','pay','$','!']]))

In [10]:
data.head()

Unnamed: 0,text,spam,cleaned_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny mer...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy wanti...
3,Subject: 4 color printing special request add...,1,subject <NUM> color printing special request a...
4,"Subject: do not have money , get software cds ...",1,subject not have money get software cds from h...


In [11]:
X=data['cleaned_text']
y=data['spam']

In [12]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
print(X_train.shape)
print(X_test.shape)

(4556,)
(1139,)


In [14]:

vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1, 2),max_features=10000,min_df=5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [15]:

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train_tfidf, y_train)

In [16]:
y_pred = model.predict(X_test_tfidf)

In [17]:
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


✅ Accuracy: 0.9798068481123793

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       843
           1       0.98      0.94      0.96       296

    accuracy                           0.98      1139
   macro avg       0.98      0.97      0.97      1139
weighted avg       0.98      0.98      0.98      1139



In [18]:
test_email = ["Trevin, you have won a $1000 gift card! Click here to claim your prize now!  Don't miss out on this amazing opportunity!"]

test_email_tfidf = vectorizer.transform(test_email)
prediction = model.predict(test_email_tfidf)[0]
print("\nCustom Email Prediction:", "Spam" if prediction == 1 else "Not Spam")



Custom Email Prediction: Spam


In [19]:
from joblib import dump
dump(model, "models/model.joblib")
dump(vectorizer, "models/vectorizer.joblib")

['models/vectorizer.joblib']