# Email Classification as spam or not



## Preprocessing data

In [None]:
import nltk

# NLTK corpora/models needed by the preprocessing step (stop-word list, WordNet
# lemmatizer data, tokenizer and POS-tagger models).
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource ids that recent
# NLTK releases look up for word_tokenize / pos_tag; the legacy ids are kept so
# older releases keep working. nltk.download reports an unknown id without raising.
NLTK_RESOURCES = (
    'stopwords',
    'wordnet',
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

In [None]:
# Read the raw dataset, decoding the file as latin-1, and preview the first rows.
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Discard the three unnamed columns so only the label and message columns remain.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
data = data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Give the two remaining columns descriptive names (positional: first, second).
data.columns = [
    'label',
    'message',
]
data.columns

Index(['label', 'message'], dtype='object')

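During preprocessing, each message is lower-cased; URLs, e-mail addresses, @mentions, #hashtags and every remaining non-alphabetic character are removed; the text is then tokenized and English stop words are dropped. Finally, the remaining tokens are POS-tagged and lemmatized.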

In [None]:
# Map a POS tag to the WordNet part-of-speech constant used by the lemmatizer
def get_pos(tag):
    """Return the WordNet POS constant matching the first letter of ``tag``.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

# preprocessing text

# The lemmatizer and the stop-word set are built once, on first use, and shared by
# every call: rebuilding them per message re-reads the stop-word corpus each time.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess(text):
    """Normalise one message into a space-separated string of lemmas.

    Steps, in order: lower-case; strip URLs, e-mail addresses, @mentions and
    #hashtags; strip every remaining non-letter, non-whitespace character;
    tokenize; drop English stop words; POS-tag and lemmatize what is left.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing cell) is treated as an empty message.

    Returns:
        The cleaned message as a single string; ``''`` for non-string input.
    """
    # Missing values reach this function as None/float NaN and have no .lower().
    if not isinstance(text, str):
        return ''

    lemmatizer, stop_words = _get_preprocess_resources()

    text = text.lower()
    # 'http\S+' already covers https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)   # e-mail addresses
    text = re.sub(r'@\w+', '', text)      # @mentions
    text = re.sub(r'#\w+', '', text)      # #hashtags
    # NOTE(review): this also deletes apostrophes, so a contraction such as
    # "don't" becomes "dont" before the stop-word filter runs and may survive it.
    # Left as-is to keep the cleaned text identical to the original behaviour.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    words = [word for word in word_tokenize(text) if word not in stop_words]
    pos_tags = nltk.pos_tag(words)
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_pos(tag))
        for word, tag in pos_tags]
    return ' '.join(lemmatized_words)

In [None]:
# Store the preprocessed text of every message in a new 'cleaned' column.
data['cleaned'] = data['message'].apply(preprocess)
data.head()

Unnamed: 0,label,message,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


## Training the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,accuracy_score
from sklearn.utils import class_weight
import numpy as np

In [None]:
# Encode the target on the label column only. A frame-wide replace would also
# rewrite any message whose whole text is exactly 'ham' or 'spam'. Values that are
# already numeric pass through unchanged, so the cell can be re-run safely.
LABEL_CODES = {'ham': 0, 'spam': 1}
data = data.assign(
    label=data['label'].map(lambda value: LABEL_CODES.get(value, value)).astype(int)
)
X = data['cleaned']
Y = data['label']

# Split the raw text BEFORE fitting TF-IDF. Fitting the vectorizer on the whole
# corpus lets the test messages shape the vocabulary and IDF weights, which leaks
# test information into training and inflates the reported scores.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    X, Y, test_size=.3, random_state=0)

vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train_text)  # learn vocabulary/IDF on train only
x_test = vectorizer.transform(x_test_text)        # reuse the train-fitted vocabulary

# Kept only so the name stays defined for any later cell that refers to it; it is
# the full corpus encoded with the train-fitted vectorizer and must not be used to
# fit or evaluate a model.
X_tfidf = vectorizer.transform(X)

In [None]:
# Baseline: multinomial naive Bayes trained on the original (imbalanced) training set.
model = MultinomialNB().fit(x_train, y_train)
Y_hat = model.predict(x_test)

In [None]:

# Print the label on its own line: the report is a multi-line table, and starting
# it on the same line as the label shifts the header row out of alignment.
print("Classification report:")
print(classification_report(y_test, Y_hat))
print("Accuracy:", accuracy_score(y_test, Y_hat))

Classification report:               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1434
           1       1.00      0.65      0.79       238

    accuracy                           0.95      1672
   macro avg       0.97      0.83      0.88      1672
weighted avg       0.95      0.95      0.95      1672

Accuracy: 0.9503588516746412


In [None]:

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler


def _resample_fit_report(sampler, heading):
    """Rebalance the training set with ``sampler``, fit naive Bayes and print a report.

    Only the training data is resampled; the model is evaluated on the untouched
    test set (x_test, y_test).

    Args:
        sampler: An imbalanced-learn sampler exposing ``fit_resample``.
        heading: Text printed in front of the classification report.

    Returns:
        Tuple ``(X_resampled, y_resampled, fitted_model, test_predictions)``.
    """
    X_resampled, y_resampled = sampler.fit_resample(x_train, y_train)
    classifier = MultinomialNB()
    classifier.fit(X_resampled, y_resampled)
    predictions = classifier.predict(x_test)
    print(heading, classification_report(y_test, predictions))
    return X_resampled, y_resampled, classifier, predictions


# Method 1: Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled_ros, y_resampled_ros, model_ros, y_pred_ros = _resample_fit_report(
    ros, "Random Oversampling Results:\n")

# Method 2: Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus, model_rus, y_pred_rus = _resample_fit_report(
    rus, "Random Undersampling Results:\n")

# Method 3: SMOTE
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote, model_smote, y_pred_smote = _resample_fit_report(
    smote, "SMOTE Results:\n")

Random Oversampling Results:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1434
           1       0.85      0.94      0.89       238

    accuracy                           0.97      1672
   macro avg       0.92      0.96      0.94      1672
weighted avg       0.97      0.97      0.97      1672

Random Undersampling Results:
               precision    recall  f1-score   support

           0       0.99      0.91      0.95      1434
           1       0.65      0.97      0.78       238

    accuracy                           0.92      1672
   macro avg       0.82      0.94      0.86      1672
weighted avg       0.94      0.92      0.93      1672

SMOTE Results:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1434
           1       0.83      0.93      0.88       238

    accuracy                           0.96      1672
   macro avg       0.91      0.95      0.93      16

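The metrics show that Random Oversampling gives the best trade-off of the three resampling methods. Compared with the baseline model, recall for the minority class (spam) rises from 0.65 to 0.94 while its precision drops from 1.00 to 0.85, which lifts the spam F1-score from 0.79 to 0.89 and overall accuracy from 0.95 to 0.97. SMOTE is a close second (spam F1 0.88), whereas Random Undersampling reaches the highest spam recall (0.97) but at a much lower precision (0.65).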