# Import all the required libaries

In [552]:
import string
from textblob import TextBlob
from nltk.corpus import stopwords
import re
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the Dataset

In [553]:
data= pd.read_csv('/kaggle/input/spam-email-classification/email.csv')

In [554]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [555]:
data.shape

(5573, 2)

In [556]:
data['Message']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
5572                                      isActive:false}
Name: Message, Length: 5573, dtype: object

In [557]:
data['Category'].unique()

array(['ham', 'spam', '{"mode":"full"'], dtype=object)

clean the data

In [558]:
data.drop(data[data['Category']=='{"mode":"full"'].index,axis=0,inplace=True)

In [559]:
data['Category'].unique()

array(['ham', 'spam'], dtype=object)

# Pre-Processing of dataset

In [560]:
# remove all the punctuations
def remove_punc(text):
    for char in string.punctuation:
        text= text.replace(char,'')
    return text

In [561]:
data['Message']= data['Message'].apply(remove_punc)

In [562]:
# lower casing of data
data['Message']= data['Message'].str.lower()

In [563]:
# spelling correction
def spel_check(text):
    for char in text:
        text_blob= TextBlob(char)
        text_blob.correct().string
    return text


In [564]:
data['Message']= data['Message'].apply(spel_check)

In [565]:
# remove all the stop words
words= stopwords.words('english')
def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word in words:
            new_text.append(' ')
        else:
            new_text.append(word)
    x= new_text[:]
    new_text.clear()
    return" ".join(x)

In [566]:
data['Message']= data['Message'].apply(remove_stopwords)

In [567]:
# remove all the urls
def remove_url(pattern):
    pattern= re.sub('http://\S+|https://\S+', '', pattern)
    return(pattern)

In [568]:
data['Message']= data['Message'].apply(remove_url)


In [569]:
# remove all the accents
import unicodedata
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [570]:
data['Message']= data['Message'].apply(strip_accents)
data['Message']

0       go   jurong point crazy available     bugis n ...
1                                 ok lar joking wif u oni
2       free entry   2   wkly comp   win fa cup final ...
3                 u dun say   early hor u c already   say
4       nah   dont think   goes   usf   lives around  ...
                              ...                        
5567          2nd time     tried 2 contact u u       £...
5568                        u b going   esplanade fr home
5569                pity     mood     soany   suggestions
5570      guy     bitching     acted like id   interes...
5571                                 rofl   true     name
Name: Message, Length: 5572, dtype: object

# Tokenization of Data

In [571]:
def conv_doc(text):
    doc= word_tokenize(text)
    return doc
        

In [572]:
data['Message']= data['Message'].apply(conv_doc)
data['Message']

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, £750, pou...
5568                   [u, b, going, esplanade, fr, home]
5569                     [pity, mood, soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
Name: Message, Length: 5572, dtype: object

# Inflection of Data

In [573]:
ps= SnowballStemmer(language='english')
def stem_words(text):
    return " ".join([ps.stem(word) for word in text])

In [574]:
data['Message']= data['Message'].apply(stem_words)
data['Message']

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkts 2...
3                     u dun say earli hor u c alreadi say
4               nah dont think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u u £750 pound prize 2 ...
5568                              u b go esplanad fr home
5569                              piti mood soani suggest
5570    guy bitch act like id interest buy someth els ...
5571                                       rofl true name
Name: Message, Length: 5572, dtype: object

# Spliting of Dataset

In [575]:
x_train, x_test, y_train, y_test= train_test_split(data['Message'].values, data['Category'].values, test_size=0.25, random_state=42)

In [576]:
x_train

array(['winner valu network custom select receivea £900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
       'how scotland hope show jjc tendenc take care live dream',
       'derek done class', ..., 'prabhaim sorydarealyfrm heart im sori',
       'nt joke serious told', 'say somebodi name tampa'], dtype=object)

# Creation of Pipelines

In [577]:
pipeline_rf= Pipeline([('tfidf2',TfidfVectorizer()),
                       ('rf_classifier', RandomForestClassifier())])
pipeline_mnb= Pipeline([('tfidf',TfidfVectorizer()),
                        ('mnb_classifier', MultinomialNB(alpha=0.8, fit_prior=True, force_alpha=True))])
pipeline_svm= Pipeline([('tfidf3',TfidfVectorizer()),
                        ('svm_classifier',SVC(kernel="rbf", gamma=0.5, C=1.0) )])

In [578]:
pipeline_mnb1= Pipeline([('count',CountVectorizer()),
                        ('mnb_classifier1', MultinomialNB(alpha=0.8, fit_prior=True, force_alpha=True))])
pipeline_svm1= Pipeline([('count3',CountVectorizer()),
                        ('svm_classifier1',SVC(kernel="rbf", gamma=0.5, C=1.0) )])

In [579]:
pipelines= [pipeline_rf, pipeline_mnb, pipeline_svm]
for pipe in pipelines:
    pipe.fit(x_train, y_train)

In [580]:
pipelines1= [pipeline_mnb1, pipeline_svm1]
for pipe in pipelines1:
    pipe.fit(x_train, y_train)

In [581]:
best_accuracy=0
best_classifier=0
best_pipeline=''

In [584]:
pipe_dict={0:'RandomForest Classifier', 1:'MultinomialNB', 2:'SVM classifier'}

In [585]:
pipe_dict1={0:'MultinomialNB',1:'SVM classifier'}

# Accuracy Score

In [587]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(x_test, y_test)))

RandomForest Classifier Test Accuracy: 0.9727207465900933
MultinomialNB Test Accuracy: 0.9691313711414213
SVM classifier Test Accuracy: 0.9798994974874372


In [588]:
for i,model in enumerate(pipelines1):
    print("{} Test Accuracy: {}".format(pipe_dict1[i],model.score(x_test, y_test)))

MultinomialNB Test Accuracy: 0.9849246231155779
SVM classifier Test Accuracy: 0.9081119885139985


# Predict model on new data

In [589]:
email=[
    'Photography 4 Humanity Global Prize Competition 2024| Win Cash Prizes of Rs. 4 L+ by the UN Human Rights Council',
    'You have won a TV in lucky draw. Join to know more',
    'How are you my son'
]

In [590]:
for i, pipe in enumerate(pipelines):
        print("Predicted outcome of {} is:".format(pipe_dict[i]),pipe.predict(email))

Predicted outcome of RandomForest Classifier is: ['ham' 'ham' 'ham']
Predicted outcome of MultinomialNB is: ['spam' 'ham' 'ham']
Predicted outcome of SVM classifier is: ['ham' 'ham' 'ham']


In [591]:
for i, pipe in enumerate(pipelines1):
        print("Predicted outcome of {} is:".format(pipe_dict1[i]),pipe.predict(email))

Predicted outcome of MultinomialNB is: ['spam' 'spam' 'ham']
Predicted outcome of SVM classifier is: ['ham' 'ham' 'ham']


# Best Model

In [592]:
for i,model in enumerate(pipelines1):
    if model.score(x_test,y_test)>best_accuracy:
        best_accuracy=model.score(x_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with best accuracy : {}'.format(pipe_dict1[best_classifier]))

Classifier with best accuracy : MultinomialNB


# Model saving

In [593]:
import pickle

In [596]:
pickle.dump(pipeline_mnb1, open('Email_Classifier_model.pkl', 'wb'))

# 