#### Importing modules

In [None]:
import pandas as pd
import string
import re

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score


#### Collecting data

In [None]:
df = pd.read_csv('spam.csv', encoding='latin-1')

In [None]:
print(f'{df.head()}\n{df.shape}\n{df.columns}')

# Data Cleaning

In [None]:
#find number of null values in each column
df.isnull().sum()

In [None]:
#data cleaning on the data permanently
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
df.head()

In [None]:
df.rename(columns={'v1':'label','v2':'email'},inplace=True)
df.head()

In [None]:
df.columns.to_list()

### EDA

In [None]:
#Eda on the data
df['label'].value_counts()

In [None]:
#convert categorical column to numeric: using Label encoder
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
df.head()


In [None]:
df['label'].unique()

In [None]:
df.columns

In [None]:
#missing values
df.isnull().sum()

In [None]:
#check for duplicate values
df.duplicated().sum()

In [None]:
#remove duplicate values
df.drop_duplicates(keep='first',inplace=True)
df.duplicated().sum()


In [None]:
df.shape

In [None]:
#eda on the data
df['label'].value_counts()


In [None]:
# Bar chart of the number of messages per encoded label.
# The column is passed through the explicit `x=`/`data=` keywords: recent
# seaborn releases no longer accept the data vector as a bare positional
# argument, so `sns.countplot(df['label'])` fails or plots the wrong thing.
sns.countplot(x='label', data=df)
plt.show()




In [None]:
# Pie chart of the share of each class.
# The wedge names are recovered from the fitted LabelEncoder `le` in the
# same order as the counts, instead of a hard-coded ['ham', 'spam'] list
# that is only right when value_counts() happens to return ham first.
# NOTE(review): relies on df['label'] already holding the codes produced by
# `le` in the label-encoding cell.
label_counts = df['label'].value_counts()
class_names = le.inverse_transform(label_counts.index.to_numpy())
plt.pie(label_counts, labels=class_names, autopct='%0.1f%%')
plt.show()


In [None]:
#tokenize the above corpus with word_tokenize and store it in a column named num_words
df['num_words'] = df['email'].apply(lambda x: len(word_tokenize(x)))
df.head()


In [None]:
df['num_characters']=df['email'].apply(len)

In [None]:
df.head()

In [None]:
#sent_tokenize and store it in a column named num_sent
df['num_sent'] = df['email'].apply(lambda x: len(sent_tokenize(x)))
df.head()

In [None]:
#describe
df.describe()

In [None]:
#describe ham
df[df['label']==0].describe()

In [None]:
#describe spam
df[df['label']==1].describe()


In [None]:
# Overlaid density histograms (with KDE curves) of num_characters for the
# two classes. sns.distplot is deprecated and has been removed from recent
# seaborn releases; histplot with kde=True and stat='density' is the
# documented replacement and draws the same kind of figure.
sns.histplot(
    df.loc[df['label'] == 0, 'num_characters'],
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
    label='ham',
)
sns.histplot(
    df.loc[df['label'] == 1, 'num_characters'],
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
    label='spam',
)
plt.legend()
plt.show()

In [None]:
# Pairwise scatter/distribution grid of df, coloured by the label column.
sns.pairplot(data=df, hue='label')
plt.show()

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# df still contains the string column 'email'; on pandas >= 2.0 a bare
# df.corr() raises on it, so the frame is narrowed to numeric dtypes first.
# The matrix is also computed once and reused instead of twice.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

In [None]:
#lower case
df['email'] = df['email'].apply(lambda x: x.lower())
df.head()

In [None]:
#tokenization
df['email'] = df['email'].apply(lambda x: word_tokenize(x))
df.head()

In [None]:
# Strip every character that is not an ASCII letter or digit from each
# token. Tokens made only of such characters become empty strings here.

non_alnum = re.compile(r'[^a-zA-Z0-9]')
df['email'] = df['email'].map(
    lambda tokens: [non_alnum.sub('', tok) for tok in tokens]
)
df.head()

In [None]:
# Drop English stop words from every token list.
# The stop words are held in a set so each membership test is a constant
# time hash lookup rather than a linear scan of a list for every token of
# every email.

stop_words = set(stopwords.words('english'))
df['email'] = df['email'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)
df.head()


In [None]:
# Drop tokens that are empty or consist only of punctuation characters.
# The previous test `tok not in string.punctuation` was a substring test
# against one long string: it removed empty tokens only because '' is a
# substring of everything, and it kept runs such as '...' that do not occur
# contiguously in string.punctuation. The check below states the intent
# directly and removes everything the old test removed.

punctuation_chars = set(string.punctuation)
df['email'] = df['email'].apply(
    lambda tokens: [
        tok for tok in tokens
        if tok and not all(ch in punctuation_chars for ch in tok)
    ]
)
df.head()

In [None]:
#stemming

ps = PorterStemmer()
df['email'] = df['email'].apply(lambda x: [ps.stem(i) for i in x])
df.head()


In [None]:
#store transformed text into df['transformed_text']
df['transformed_text'] = df['email']
df['transformed_text'] 


### using BOW

In [None]:
#conversion of text into numeric vector using BOW

cv = CountVectorizer(max_features=500)
X = cv.fit_transform(df['transformed_text'].apply(lambda x: ' '.join(x)))
X.shape


In [None]:
df.head()

In [None]:
y=df['label'].values

In [None]:
#split dataset into train and test

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape


In [None]:
# Multinomial Naive Bayes on the bag-of-words features: fit, predict on the
# held-out rows, then print accuracy, precision, confusion matrix, recall
# and F1 (in that order).
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'{acc},\n{prec},\n{cm},\n{rec},\n{f1}')
# Confusion matrix as an annotated heatmap.
sns.heatmap(cm, annot=True, cmap="Greens")
plt.show()

In [None]:
# Precision-recall curve for the classifier currently bound to `model`.
# The curve is built from the predicted probability of class 1 rather than
# from the hard 0/1 predictions: with hard labels there is only a single
# threshold, so the "curve" collapses to one operating point.
# NOTE(review): assumes `model` is the classifier fitted in the preceding
# cell and that it accepts X_test as is.
spam_scores = model.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, spam_scores)
plt.plot(recall, precision,'g')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# Gaussian Naive Bayes on the bag-of-words features. The feature matrices
# are converted with toarray() before being handed to the model.
model = GaussianNB().fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'{acc},\n{prec},\n{cm},\n{rec},\n{f1}')

In [None]:
# Bernoulli Naive Bayes on the bag-of-words features: fit, predict on the
# held-out rows, then print accuracy, precision, confusion matrix, recall
# and F1. The report now carries the ',' separator after the confusion
# matrix as well, matching the multinomial and Gaussian reports.
model = BernoulliNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(f'{accuracy_score(y_test,y_pred)},\n{precision_score(y_test,y_pred)},\n{confusion_matrix(y_test,y_pred)},\n{recall_score(y_test,y_pred)},\n{f1_score(y_test,y_pred)}')

In [None]:
# Find the accuracy score using tfidf vectorizer
# NOTE(review): the next cell rebuilds `tfidf` and `X` with
# max_features=500 before any split or model uses this 2500-feature matrix.

tfidf = TfidfVectorizer(max_features=2500)
X = tfidf.fit_transform(df['transformed_text'].map(' '.join))
X.shape

### using Tf-Idf

In [None]:
# TF-IDF features: join each token list into one space-separated string and
# weight the terms, capped at 500 features. Rebinds `tfidf` and `X`.


tfidf = TfidfVectorizer(max_features=500)
X = tfidf.fit_transform(df['transformed_text'].map(' '.join))
X.shape

In [None]:
df.head()

In [None]:
#split dataset into train and test
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Multinomial Naive Bayes on the TF-IDF features: fit, predict on the
# held-out rows, then print accuracy, precision and the confusion matrix.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(acc, prec, cm)

In [None]:
# Gaussian Naive Bayes on the TF-IDF features. The feature matrices are
# converted with toarray() before being handed to the model.
model = GaussianNB().fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(acc, prec, cm)

In [None]:
# Bernoulli Naive Bayes on the TF-IDF features: fit, predict on the
# held-out rows, then print accuracy, precision and the confusion matrix.
model = BernoulliNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(acc, prec, cm)

In [None]:
# Find the accuracy score using tfidf vectorizer
# Rebuilds `tfidf` and `X` with a 2500-feature cap.
# NOTE(review): nothing visible after this cell splits or trains on the new
# X - confirm whether follow-up cells are missing.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=2500)
X = tfidf.fit_transform(df['transformed_text'].map(' '.join))
X.shape