In [28]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [29]:
df = pd.read_csv('/content/emailgood.csv')
df.head()
df.tail()

Unnamed: 0,title,text,type
395,Newspaper risk care,"Andrew Acosta,\n\nForeign third participant po...",not spam
396,Easy consider pass answer,"Brian Anderson,\n\nRecent evening land teach c...",spam
397,Girl plant special suggest,"Katherine Stevens,\n\nFact indeed think natura...",spam
398,Billion put cell control,"Kenneth Dennis,\n\nDescribe near feeling profe...",not spam
399,Memory who,"Lisa Lopez,\n\nCommercial control with cell re...",spam


In [30]:
df['type'].head()

Unnamed: 0,type
0,spam
1,not spam
2,not spam
3,not spam
4,spam


In [31]:
df[['title','text']].duplicated().sum()

np.int64(2)

In [32]:
# Drop duplicate rows based on the 'title' and 'text' columns
df = df.drop_duplicates(subset=['title', 'text'])

# Optional: Check the number of remaining rows
print(f"Number of rows after dropping duplicates: {len(df)}")

# Optional: Verify that duplicates are removed
print(f"Number of duplicates after dropping: {df[['title','text']].duplicated().sum()}")

Number of rows after dropping duplicates: 398
Number of duplicates after dropping: 0


In [33]:
df.shape

(398, 3)

In [34]:
df.isnull().sum()

Unnamed: 0,0
title,0
text,0
type,0


In [35]:
df.dropna(inplace=True)

In [36]:
df['type'] = df['type'].map({'spam':1,'not spam':0})
df.head()

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",1
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",0
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",0
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",0
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",1


In [37]:
df.isnull().sum()

Unnamed: 0,0
title,0
text,0
type,0


In [38]:
stop_word = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemma = nltk.WordNetLemmatizer()

In [39]:
def text_prossing(text):
  text = text.lower()
  text = re.sub(r'[@?*:-]', '', text)
  text = re.sub('https?://\S+|www\.\S+','',text)
  token = word_tokenize(text)
  token = [ word for word in token if word not in stop_word]
  stem =[lemma.lemmatize(word) for word in token ]
  return ' '.join(stem)

  text = re.sub('https?://\S+|www\.\S+','',text)


In [40]:
#df['title'] = df['title'].apply(text_prossing)
#df['text']=df['text'].apply(text_prossing)

In [41]:
def combined_tokenized(text):
  return text_prossing(text).split()

In [42]:
PreProcessor = ColumnTransformer(transformers=[
    ('title_tfdif',TfidfVectorizer(tokenizer=combined_tokenized),'title'),
    ('text_tfdif',TfidfVectorizer(tokenizer=combined_tokenized),'text')
])


In [43]:
models = [
    ('rf',RandomForestClassifier(n_estimators=50, random_state=2)),
    ('lr',LogisticRegression(max_iter=1000,solver='liblinear', penalty='l1')),
    ('svm',SVC(probability=True))]

In [44]:
voting_clf = VotingClassifier(estimators=models, voting='soft')


In [45]:
pipeline = Pipeline([
    ('preprocessor', PreProcessor),
    ('classifier', voting_clf)
])


In [46]:
x=df[['title','text']]
y=df['type']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [47]:
pipeline.fit(x_train,y_train)



In [48]:
y_pred_voting = pipeline.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_voting):.2f}")


Accuracy: 0.95


In [49]:
preprocessor = pipeline.named_steps['preprocessor']
voting_clf = pipeline.named_steps['classifier']

X_test_transformed = preprocessor.transform(x_test)

print("\nIndividual Model Performance:")
for (name, original_model), fitted_model in zip(voting_clf.estimators, voting_clf.estimators_):
    y_pred = fitted_model.predict(X_test_transformed)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name.upper()} Model:")
    print(f"Accuracy: {acc:.2f}")
    print(classification_report(y_test, y_pred))


Individual Model Performance:

RF Model:
Accuracy: 0.93
              precision    recall  f1-score   support

           0       0.95      0.90      0.92        39
           1       0.91      0.95      0.93        41

    accuracy                           0.93        80
   macro avg       0.93      0.92      0.92        80
weighted avg       0.93      0.93      0.92        80


LR Model:
Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        39
           1       1.00      0.88      0.94        41

    accuracy                           0.94        80
   macro avg       0.94      0.94      0.94        80
weighted avg       0.94      0.94      0.94        80


SVM Model:
Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        39
           1       1.00      0.90      0.95        41

    accuracy                           0.95        80
   macro avg      

In [50]:
import pickle
with open('spam_classifier.pkl', 'wb') as f:
    pickle.dump(pipeline, f)


In [51]:
with open('spam_classifier.pkl', 'rb') as f:
    model = pickle.load(f)

new_email = {
    'title': "Earn $5000 per week from home",
    'text': "Hi How are you"
}


input_data = pd.DataFrame([new_email])

prediction = model.predict(input_data)[0]

print(f"Prediction: {'SPAM' if prediction == 1 else 'HAM'}")

Prediction: HAM
