# **LOAD DATA**

In [None]:
import numpy as np
import pandas as pd
import re
import string

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the tab-separated corpus: one row per message, first field is the class
# label and the second is the raw message text (the file has no header row).
# NOTE(review): the path is an absolute Colab location; the default CSV quoting
# rules are in effect, so stray double quotes inside messages could merge rows —
# confirm the row count against the source file.
data = pd.read_csv(
    "/content/Dataset_Spam.txt",
    sep="\t",
    names=["label", "message"],
)

In [None]:
# Keep only the first occurrence of each (label, message) row; the original
# index values of the surviving rows are preserved.
data = data[~data.duplicated()]

In [None]:
# Display (rows, columns) of the de-duplicated frame.
data.shape

In [None]:
plt.figure(figsize=(9, 5))

# Take wedge labels from the counts themselves rather than hard-coding
# ['ham', 'spam']: value_counts() orders by frequency, so a hard-coded list
# silently mislabels the wedges whenever that order is not ham-then-spam.
label_counts = data['label'].value_counts()
wedge_colors = {'ham': 'Green', 'spam': 'Red'}
plt.pie(
    label_counts,
    labels=list(label_counts.index),
    autopct='%0.2f',
    # Colour follows the class name, not the wedge position.
    colors=[wedge_colors.get(name, 'Gray') for name in label_counts.index],
    explode=[.1] * len(label_counts),
)
plt.show()

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
# Recent NLTK releases (3.8.2 and later) load the Punkt models from the separate
# 'punkt_tab' resource; without it word_tokenize / sent_tokenize raise
# LookupError on a fresh kernel. Older releases simply report that the package
# is unknown and carry on with 'punkt'.
nltk.download('punkt_tab')

In [None]:
# Per-message length features used by the exploratory cells below:
# character count, NLTK word-token count and NLTK sentence count.
data['num_characters'] = data['message'].map(len)
data['num_words'] = data['message'].map(lambda text: len(word_tokenize(text)))
data['num_sentences'] = data['message'].map(lambda text: len(nltk.sent_tokenize(text)))

In [None]:
# Summary statistics of the length features for ham messages only.
data.loc[data['label'] == 'ham', ['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
# Summary statistics of the length features for spam messages only.
data.loc[data['label'] == 'spam', ['num_characters', 'num_words', 'num_sentences']].describe()

***We can clearly see that spam messages are considerably longer than ham messages.***

In [None]:
# Overlay the character-count distributions of both classes on one figure
# (ham in green, spam in red).
plt.figure(figsize=(10,4))
ham_char_counts = data.loc[data['label'] == 'ham', 'num_characters']
spam_char_counts = data.loc[data['label'] == 'spam', 'num_characters']
sns.histplot(ham_char_counts, color='green')
sns.histplot(spam_char_counts, color='red')

In [None]:
# Overlay the word-count distributions of both classes on one figure
# (ham in green, spam in red).
plt.figure(figsize=(10,4))
ham_word_counts = data.loc[data['label'] == 'ham', 'num_words']
spam_word_counts = data.loc[data['label'] == 'spam', 'num_words']
sns.histplot(ham_word_counts, color='green')
sns.histplot(spam_word_counts, color='red')

In [None]:
!pip install wordcloud
from wordcloud import WordCloud

In [None]:
# Raw message text split by class; these Series feed the word clouds below.
ham_msg_text = data.loc[data['label'] == 'ham', 'message']
spam_msg_text = data.loc[data['label'] == 'spam', 'message']

In [None]:
plt.figure(figsize = (10, 12))

# Build one word cloud from all ham messages joined into a single string.
ham_corpus = " ".join(ham_msg_text)
wc = WordCloud(width = 1500, height = 900, max_words = 2500)
wc.generate(ham_corpus)
plt.imshow(wc, interpolation='bilinear')

In [None]:
plt.figure(figsize = (10, 12))

# Build one word cloud from all spam messages joined into a single string.
spam_corpus = " ".join(spam_msg_text)
wc = WordCloud(width = 1500, height = 900, max_words = 2500)
wc.generate(spam_corpus)
plt.imshow(wc, interpolation='bilinear')

# **PREPROCESSING**

In [None]:
# Built once: the table depends only on string.punctuation, so there is no
# reason to rebuild it for every message.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punc(text):
  """Return `text` with every ASCII punctuation character deleted.

  Characters are removed, not replaced, so the pieces on either side of a
  punctuation mark are joined (e.g. "don't" becomes "dont").
  """
  return text.translate(_PUNCT_TABLE)
def remove_noise(text):
  """Replace every character that is not an ASCII letter with a single space.

  The replacement is one-for-one, so the result has the same length as the
  input; whitespace itself is also mapped to a space.
  """
  non_letter = re.compile('[^a-zA-Z]')
  return non_letter.sub(' ', text)

# Strip punctuation first, then blank out any remaining non-letter characters.
data['message'] = data['message'].apply(remove_punc).apply(remove_noise)

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
# Frozen-set view of the same words: membership tests against the list scan
# every stop word for every token of every message.
_sw_lookup = frozenset(sw)

def remove_sws(text):
  """Lower-case `text` and drop English stop words.

  The text is split on whitespace; the surviving tokens are re-joined with
  single spaces.

  NOTE(review): punctuation (including apostrophes) is removed in the earlier
  cleaning cell, so apostrophised stop words such as "don't" presumably never
  match their stripped form ("dont") here — confirm whether that is intended.
  """
  lowered = (word.lower() for word in text.split())
  return " ".join(word for word in lowered if word not in _sw_lookup)

data['message'] = data['message'].apply(remove_sws)

In [None]:
# Fetch the NLTK resources this cell requests, in the same order as before.
for resource in ('wordnet', 'punkt', 'omw-1.4'):
  nltk.download(resource)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemma(text):
  """Lemmatise each whitespace-separated token of `text`.

  Every token is passed to the WordNet lemmatizer with its default settings
  and the results are re-joined with single spaces.
  """
  return " ".join(map(lemmatizer.lemmatize, text.split()))

data['message'] = data['message'].map(lemma)

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Replace the string labels with integer codes, then drop the exploratory
# length columns so only the model inputs remain.
encoded_labels = encoder.fit_transform(data['label'])
data['label'] = encoded_labels
data = data.loc[:, ['label', 'message']]

# **CONVERT WORDS TO VECTORS**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features=3000)

# Vectorise the cleaned messages (at most 3000 features) and densify.
# NOTE(review): the vectorizer is fitted on the whole corpus here, before any
# train/test split, so held-out messages influence the vocabulary and weights.
tfidf_sparse = tf.fit_transform(data['message'])
X = tfidf_sparse.toarray()
Y = data['label']

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned text rather than a matrix vectorised on the full corpus:
# fitting TF-IDF before splitting lets the test messages shape the vocabulary
# and IDF weights (data leakage). Same test_size / random_state and the same
# number of rows, so the same rows land in each partition as before.
msg_train, msg_test, Y_train, Y_test = train_test_split(
    data['message'], Y, test_size = 0.2, random_state = 32)

# (Re)fit the vectorizer on the training messages only, then apply that fitted
# vocabulary to the held-out messages. `tf` is what gets pickled later, so it
# must end up fitted on exactly what the model was trained on.
X_train = tf.fit_transform(msg_train).toarray()
X_test = tf.transform(msg_test).toarray()

# **MAKE THE MODEL**

In [None]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train a Bernoulli naive Bayes classifier on the training partition.
# NOTE(review): BernoulliNB binarises its inputs (default threshold 0.0 per the
# scikit-learn docs), so the TF-IDF weights presumably act only as
# presence/absence flags — confirm this is intended rather than MultinomialNB,
# which is imported above but not used in this cell.
clf = BernoulliNB()
clf.fit(X_train, Y_train)

In [None]:
# Predicted labels for the held-out partition; reused by the metric cells below.
Y_pred = clf.predict(X_test)

In [None]:
# Fraction of held-out messages whose predicted label matches the true label.
test_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy Score: ", test_accuracy)

In [None]:
# Per-class text report for the held-out partition.
test_report = classification_report(Y_test, Y_pred)
print(test_report)

# **SAVE THE MODEL**

In [None]:
import pickle

# Persist the fitted vectorizer and classifier. The context managers guarantee
# each file is flushed and closed as soon as it is written; the bare
# open(...) calls used before never closed their handles explicitly.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
  pickle.dump(tf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
  pickle.dump(clf, model_file)

# **RUN MODEL**

In [None]:
# Install Streamlit into the runtime (quiet mode).
! pip install streamlit -q
# Print the address returned by ipv4.icanhazip.com to the cell output
# (presumably this host's public IPv4, needed by the tunnel page — confirm).
!wget -q -O - ipv4.icanhazip.com
# Start the Streamlit app in the background and open a localtunnel to port 8501.
# NOTE(review): App.py is not defined in this notebook; it is assumed to exist
# in the working directory and to serve on port 8501 — confirm.
! streamlit run App.py & npx localtunnel --port 8501