# In [None]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Load the raw CSV and clean it: drop unused columns, rename the remaining
# ones, encode the label column as integers and remove duplicate rows.
# Bare expressions such as ``df.head()`` are notebook-style inspection calls
# whose results are not stored.
# ---------------------------------------------------------------------------
df = pd.read_csv("spam.csv", encoding="latin-1")

df.head()

df.info()

# Discard the three "Unnamed" columns; they are not referenced again below.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)

df.sample(10)

# Give the two columns used by the rest of the script descriptive names.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.sample(10)

from sklearn.preprocessing import LabelEncoder

# Replace the label values in 'target' with integer class codes.
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

df.head()

# Data-quality checks: missing values per column and number of duplicate rows.
df.isnull().sum()

df.duplicated().sum()

# Keep only the first occurrence of every duplicated row, then re-check.
df = df.drop_duplicates(keep='first')

df.duplicated().sum()

# Class distribution after cleaning.
df['target'].value_counts()

import matplotlib.pyplot as plt

# Pie chart of the class balance.
# value_counts() orders its result by frequency, not by class code, so the
# wedge labels are looked up from the fitted encoder instead of being
# hard-coded. Each wedge is then labelled with its own class regardless of
# which class happens to be the most frequent.
class_counts = df['target'].value_counts()
class_labels = encoder.inverse_transform(class_counts.index.to_numpy())
plt.pie(class_counts, labels=class_labels, autopct="%0.2f")
plt.show()


import nltk

# Fetch the 'punkt' resource (NLTK tokenizer data) before tokenizing below.
nltk.download('punkt')

# Length of each message in characters.
df['num_char'] = df['text'].map(len)

df.head()

# Token count and sentence count per message, using NLTK's tokenizers.
for column, tokenizer in (
    ('num_words', nltk.word_tokenize),
    ('num_sent', nltk.sent_tokenize),
):
    df[column] = df['text'].map(
        lambda message, split=tokenizer: len(split(message))
    )
    df.head()

df.head()

# Summary statistics of the three length features: overall, then separately
# for the rows with target == 0 and the rows with target == 1.
length_features = ['num_char', 'num_words', 'num_sent']

df[length_features].describe()

df.loc[df['target'] == 0, length_features].describe()

df.loc[df['target'] == 1, length_features].describe()

import seaborn as sns

# One figure per length feature: the histogram of the target == 0 rows in the
# default colour, overlaid with the histogram of the target == 1 rows in red.
class_zero_rows = df[df['target'] == 0]
class_one_rows = df[df['target'] == 1]
for feature in ('num_char', 'num_words', 'num_sent'):
    plt.figure(figsize=(12, 8))
    sns.histplot(class_zero_rows[feature])
    sns.histplot(class_one_rows[feature], color='red')

# Pairwise relationships between all numeric columns, coloured by class.
sns.pairplot(df, hue='target')

from nltk.corpus import stopwords
import string
# Notebook-style inspection of the punctuation characters; the value is not stored.
string.punctuation
# Fetch the NLTK stop-word corpus that trans() reads via stopwords.words('english').
nltk.download('stopwords')

from nltk.stem.porter import PorterStemmer
# Module-level stemmer instance shared by every call to trans().
ps=PorterStemmer()

_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English NLTK stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        # Loaded lazily on first use so the corpus download that precedes the
        # first call to trans() has already happened.
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def trans(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text and tokenize it with ``nltk.word_tokenize``;
      2. keep only purely alphanumeric tokens (``str.isalnum``);
      3. drop English stop words;
      4. stem every remaining token with the module-level Porter stemmer ``ps``.

    Args:
        text: The message to normalise (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # The stop-word list used to be re-read and scanned linearly for every
    # single token; a cached frozenset gives the same membership answers in
    # O(1) per token.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())

    # A separate punctuation check is unnecessary: tokens that pass isalnum()
    # are non-empty and contain no punctuation characters.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)

# Normalised (lower-cased, filtered, stemmed) version of every message.
df['transformed']=df['text'].apply(trans)

df.head()


from wordcloud import WordCloud

wc=WordCloud(width=600,height=600,min_font_size=10,background_color='white')

# NOTE: WordCloud.generate() returns the WordCloud instance itself, so
# spam_wc and ham_wc both refer to `wc`. Each cloud is therefore drawn
# immediately after it is generated, on its own figure, so the second image
# is not painted over the first one.

# Word cloud of the spam messages (target == 1).
plt.figure()
spam_wc=wc.generate(df[df['target']==1]['transformed'].str.cat(sep=" "))
plt.imshow(spam_wc)

# Word cloud of the ham messages (target == 0); previously this call was
# handed spam_wc instead of ham_wc.
plt.figure()
ham_wc=wc.generate(df[df['target']==0]['transformed'].str.cat(sep=" "))
plt.imshow(ham_wc)

from collections import Counter

# Every word of every normalised spam message (target == 1), in order.
spam_corpus = [
    word
    for msg in df[df['target'] == 1]['transformed'].tolist()
    for word in msg.split()
]

len(spam_corpus)

# The 30 most frequent spam words with their counts.
pd.DataFrame(Counter(spam_corpus).most_common(30))

# Same for the ham messages (target == 0). These words get their own list:
# they used to be stored in `spam_corpus`, which mislabelled them and
# overwrote the spam word list built above.
ham_corpus = [
    word
    for msg in df[df['target'] == 0]['transformed'].tolist()
    for word in msg.split()
]

len(ham_corpus)

# The 30 most frequent ham words with their counts.
pd.DataFrame(Counter(ham_corpus).most_common(30))

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

# Dense bag-of-words count matrix of the normalised messages.
# NOTE(review): the vectorizer is fitted on the full data set, i.e. before
# the train/test split below - confirm this is intended.
x = cv.fit_transform(df['transformed']).toarray()

x.shape

y = df['target'].values

y

from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the reported numbers are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2
)

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

from sklearn.metrics import confusion_matrix,accuracy_score,precision_score

# Fit each Naive Bayes variant in turn and print, for its test-set
# predictions, the accuracy, the confusion matrix and the precision.
bow_predictions = []
for classifier in (gnb, mnb, bnb):
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    for metric in (accuracy_score, confusion_matrix, precision_score):
        print(metric(y_test, predicted))
    bow_predictions.append(predicted)

# Keep the per-model predictions available under their individual names.
y_pred1, y_pred2, y_pred3 = bow_predictions

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped via max_features=3000.
tfidf = TfidfVectorizer(max_features=3000)

# NOTE(review): as with the count vectorizer, this is fitted on the full data
# set before the split below - confirm this is intended.
x1 = tfidf.fit_transform(df['transformed']).toarray()

# Optional MinMax scaling of the TF-IDF matrix (currently disabled):
#from sklearn.preprocessing import MinMaxScaler
#scaler=MinMaxScaler()
#x1=scaler.fit_transform(x1)

y1 = df['target'].values

# Same 80/20 split and seed as used for the bag-of-words features.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x1, y1, test_size=0.2, random_state=2
)

gnb1 = GaussianNB()
mnb1 = MultinomialNB()
bnb1 = BernoulliNB()

# Fit each Naive Bayes variant on the TF-IDF features and print, for its
# test-set predictions, the accuracy, the confusion matrix and the precision.
tfidf_predictions = []
for estimator in (gnb1, mnb1, bnb1):
    estimator.fit(x_train1, y_train1)
    estimated = estimator.predict(x_test1)
    for score in (accuracy_score, confusion_matrix, precision_score):
        print(score(y_test1, estimated))
    tfidf_predictions.append(estimated)

# Keep the per-model predictions available under their individual names.
y_pred11, y_pred21, y_pred31 = tfidf_predictions

import pickle

# Persist the fitted TF-IDF vectorizer and the MultinomialNB model trained on
# its features. The context managers guarantee each file is flushed and
# closed, even if pickling raises; the bare open() calls used before left the
# handles to be closed whenever the interpreter got around to it.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('mnb1_model.pkl', 'wb') as model_file:
    pickle.dump(mnb1, model_file)