In [21]:
import numpy as np
import pandas as pd

In [22]:
# Load the raw dataset from spam.csv, decoding it as ISO-8859-1
# (presumably because the file is not valid UTF-8 — confirm).
df = pd.read_csv('spam.csv', encoding = 'ISO-8859-1')

In [None]:
df.head()

In [24]:
# Discard the three 'Unnamed' columns from the loaded frame.
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [None]:
df.head()

In [26]:
# Give the label and message columns descriptive names. The frame is
# reassigned instead of mutated with inplace=True, matching how the other
# cells in this notebook rebind `df`.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

In [None]:
df.head()

In [28]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
df['target']

In [30]:
# Replace the string labels in 'target' with integer codes. LabelEncoder
# numbers classes in sorted order — presumably ham -> 0 and spam -> 1;
# confirm via label_encoder.classes_.
df['target'] = label_encoder.fit_transform(df['target'])

In [None]:
df.sample(5)

In [None]:
df.isnull().sum()

In [None]:
print(df.duplicated().sum())

In [None]:
df.shape

In [35]:
# Remove repeated rows; the first occurrence of each is the one retained
# (that is the default behaviour of drop_duplicates).
df = df.drop_duplicates()

In [None]:
print(df.duplicated().sum())

In [None]:
df.shape

In [None]:
df['target'].value_counts()

In [39]:
# !pip install matplotlib

In [40]:
# import matplotlib.pyplot as plt
# plt.pie(df['target'].value_counts(), labels=['ham', 'spam'], autopct="%0.2f")
# plt.show()

In [41]:
import nltk

In [None]:
nltk.download('punkt')

In [None]:
# Feature: length of each message in characters.
df['num_characters'] = df['text'].map(len)
df.head()

In [None]:
# Feature: number of tokens nltk.word_tokenize produces for each message.
df['num_word'] = df['text'].map(nltk.word_tokenize).map(len)
df.head()

In [None]:
# Feature: number of sentences nltk.sent_tokenize finds in each message.
df['num_sentence'] = df['text'].map(nltk.sent_tokenize).map(len)
df.head()

In [None]:
df[df['target'] == 0][['num_characters', 'num_word', 'num_sentence']].describe()

In [None]:
df[df['target'] == 1][['num_characters', 'num_word', 'num_sentence']].describe()

In [None]:
## Data Preprocessing

import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

In [49]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text and split it with ``nltk.word_tokenize``;
      2. keep only tokens for which ``str.isalnum()`` is true;
      3. drop tokens found in NLTK's English stop-word list;
      4. reduce each remaining token to its Porter stem.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stems joined by single spaces (an empty string when
        no token survives the filters).
    """
    stemmer = PorterStemmer()
    # Loaded once and cached: calling stopwords.words() per token re-reads the
    # corpus list and does a linear scan for every single word.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())

    # A separate `token not in string.punctuation` test is unnecessary here:
    # a token that passed isalnum() can never match punctuation.
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )



In [50]:
# Run every message through transform_text and store the cleaned string.
df['transformed_text'] = df['text'].map(transform_text)

In [None]:
df.sample(5)

In [52]:
## Model Building

In [53]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Instantiate both text vectorizers with their default settings.
cv, tfidf = CountVectorizer(), TfidfVectorizer()

In [54]:
# Fit the TF-IDF vectorizer on the cleaned text and materialise the resulting
# feature matrix as a dense array via .toarray().
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
X.shape

In [56]:
# The labels are the integer-encoded 'target' column, not the raw message
# text: with df['text'] every distinct message would become its own class.
y = df['target'].values

In [57]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

In [58]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [59]:
# One instance of each Naive Bayes variant, all with default parameters.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [None]:
# Gaussian NB: fit on the training split, predict the held-out split, then
# print accuracy, the confusion matrix and precision in that order.
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

In [None]:
# Multinomial NB: fit on the training split and score the model's own
# predictions (predicting with gnb here would just repeat the Gaussian
# results and never evaluate mnb).
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
# Bernoulli NB: fit on the training split and score the model's own
# predictions (predicting with gnb here would just repeat the Gaussian
# results and never evaluate bnb).
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

In [None]:
import pickle

# Persist the fitted TF-IDF vectorizer and the multinomial NB model. The
# context managers guarantee each file is flushed and closed, so the pickles
# are complete on disk even if a later statement fails.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)