In [None]:
import numpy as np
import pandas as pd

In [None]:
df=pd.read_csv('spam.csv',encoding='latin-1')

In [None]:
df.sample(5)

In [None]:
df.shape

## 1. Data Cleaning

In [None]:
df.info()

In [None]:
#dropping the last 3 columns where data is missing
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [None]:
df.sample(5)

In [None]:
df.info()

In [None]:
#renaming columns
df.rename(columns={'v1':'target','v2':'text'},inplace=True)

In [None]:
df.sample(5)
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [None]:
df['target']=encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
#Checking for nulls
df.isnull().sum()

In [None]:
#checking duplicates
df.duplicated().sum()

In [None]:
#removing duplicates (keeping the first occurrence of each row)
#reset_index gives the surviving rows a gap-free 0..n-1 index, so later label
#lookups such as df['text'][2000] cannot hit a label that was dropped here
df=df.drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
df.duplicated().sum()

In [None]:
df.shape

## 2. Exploratory Data Analysis

In [None]:
df.head()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f")
plt.show()

In [None]:
# Bar chart of class frequencies. plt.bar needs explicit x positions and heights;
# sort_index() orders the counts as target 0, then target 1, so they line up with
# the 'ham'/'spam' tick labels regardless of which class is more frequent.
class_counts=df['target'].value_counts().sort_index()
plt.bar(['ham','spam'],class_counts.values,color="yellow")
plt.xlabel("Types")
plt.ylabel("frequency")
plt.show()

In [None]:
# 'ham'/'spam' are not keys of the value_counts Series (its index is 0/1), so they
# cannot be resolved through data=...; pass the labels and the counts explicitly.
class_counts=df['target'].value_counts().sort_index()
plt.bar(['ham','spam'],class_counts.values,color="yellow")
plt.xlabel("Types")
plt.ylabel("frequency")
plt.show()

In [None]:
#data imbalance

In [None]:
#natural Language Toolkit
import nltk

In [None]:
df['text']

In [None]:
#pandas function apply
df['text'].apply(len)

In [None]:
df['character_num']=df['text'].apply(len)

In [None]:
df.sample(5)

In [None]:
df.info()

In [None]:
df.head()

In [None]:
#num of words: tokenize each message into a list of word tokens
#(pass the tokenizer itself so it is actually called on every message)
df['text'].apply(nltk.word_tokenize)

In [None]:
df['text'].apply(lambda x:nltk.word_tokenize(x))

In [None]:
df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df['word_num']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.sample()

In [None]:
df['text'].apply(lambda x:nltk.sent_tokenize(x))

In [None]:
df['text'].apply(lambda s:nltk.sent_tokenize(s))

In [None]:
df['text'].apply(lambda s:len(nltk.sent_tokenize(s)))

In [None]:
df['sent_num']=df['text'].apply(lambda s:len(nltk.sent_tokenize(s)))

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df[['character_num',"word_num",'sent_num']].describe()

In [None]:
df.describe()

In [None]:
[df['target']==0]

In [None]:
df[df['target']==0]

In [None]:
#regarding ham messages
df[df['target']==0][['character_num','word_num','sent_num']].describe()

In [None]:
#regarding spam messages
df[df['target']==1][["character_num","word_num","sent_num"]].describe()

In [None]:
df[df['target']==1][['character_num',"word_num","sent_num"]]

In [None]:
import seaborn as sea

In [None]:
plt.figure(figsize=(12,6))
sea.histplot(df[df['target']==0]['character_num'])
sea.histplot(df[df['target']==1]['character_num'],color="red")

In [None]:
plt.figure(figsize=(12,6))
sea.histplot(df[df['target']==0]['word_num'])
sea.histplot(df[df['target']==1]['word_num'],color="red")

In [None]:
df.info()

In [None]:
plt.figure(figsize=[12,6])
sea.histplot(df[df['target']==0]['sent_num'])
sea.histplot(df[df['target']==1]['sent_num'],color="red")

In [None]:
plt.figure(figsize=(12,7))
sea.histplot(df[df['target']==0][['character_num','word_num','sent_num']])

In [None]:
# Histogram of the three length features for ham messages, zoomed to 0-200.
plt.figure(figsize=(12,6))
g=sea.histplot(df[df['target']==0][['character_num','word_num','sent_num']])
g.set(xlim=(0,200))
# histplot returns a matplotlib Axes, which has no .show(); render through pyplot.
plt.show()

In [None]:
plt.figure(figsize=(12,7))
sea.histplot(df[df['target']==0][['character_num','word_num','sent_num']])
g=sea.histplot(df[df['target']==1][['character_num','word_num','sent_num']],color="blue")
g.set(xlim=(0,200))

In [None]:
#We can figure out the relation between different parameters and can check for the outliers which may spoil the system
sea.pairplot(df,hue='target')

In [None]:
# df still holds the raw string 'text' column; correlate the numeric columns only
# (pandas >= 2.0 raises if non-numeric columns are passed to corr()).
df.select_dtypes(include='number').corr()

In [None]:
#Examining the relation between variables using a heat map
#(numeric columns only: the string 'text' column cannot be correlated)
sea.heatmap(df.select_dtypes(include='number').corr())

In [None]:
#with annotations (numeric columns only: the string 'text' column cannot be correlated)
sea.heatmap(df.select_dtypes(include='number').corr(),annot=True)

## 3. Text Preprocessing

In [None]:
#Stemming
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
from nltk.corpus import stopwords
# The corpus is selected by file id; use the lower-case id so the lookup also
# works on case-sensitive file systems.
stopwords.words("english")

In [None]:
import string
string.punctuation

In [None]:
def transform_text(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Steps: lower-case the text, tokenize it with nltk.word_tokenize, keep only
    alphanumeric tokens, drop English stop words and punctuation, then stem each
    remaining token with the module-level PorterStemmer `ps`.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ("" if none survive).
    """
    # Load the stop-word list once per call rather than once per token, and use
    # the lower-case file id so the lookup works on case-sensitive file systems.
    stop_words=set(stopwords.words("english"))

    #Converting into lower case and tokenizing
    tokens=nltk.word_tokenize(text.lower())

    #Keeping alphanumeric tokens that are neither stop words nor punctuation
    kept=[
        tok for tok in tokens
        if tok.isalnum() and tok not in string.punctuation and tok not in stop_words
    ]

    #Stemming
    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
transform_text("!Hi $hoW aRe, YOU?")

In [None]:
transform_text("Hey! Did you liked my ML project? I actually loved it:)")

In [None]:
transform_text("But i'll b going 2 shut sch on mon. My sis need 2 take smth.")

In [None]:
df['text'][2000]

In [None]:
#Stemming
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
ps.stem("Adorable")

In [None]:
#applying the function we created on the whole text messages!
df['text'].apply(transform_text)

In [None]:
#saving the transformed text in another column
df['transformed_text']=df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
df['transformed_text'][2500]

In [None]:
#importing wordcloud
from wordcloud import WordCloud
wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
spam_wc=wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(12,6))
plt.imshow(spam_wc)

In [None]:
ham_wc=wc.generate(df[df['target']==0]["transformed_text"].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(12,6))
plt.imshow(ham_wc)

In [None]:
# Flatten every transformed spam message (target == 1) into one list of words.
spam_corpus=[
    token
    for message in df[df['target']==1]['transformed_text'].tolist()
    for token in message.split()
]

In [None]:
from collections import Counter
# Build the top-30 (word, count) table once and reuse it for both axes.
top_spam_words=pd.DataFrame(Counter(spam_corpus).most_common(30),columns=['word','count'])
# x and y must be passed by keyword: seaborn >= 0.12 rejects them as positionals.
sea.barplot(x=top_spam_words['word'],y=top_spam_words['count'])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Flatten every transformed ham message (target == 0) into one list of words.
ham_corpus=[
    token
    for message in df[df['target']==0]['transformed_text'].tolist()
    for token in message.split()
]

In [None]:
len(ham_corpus)
#length is more as ham messages are also more

In [None]:
from collections import Counter
# Build the top-30 (word, count) table once and reuse it for both axes.
top_ham_words=pd.DataFrame(Counter(ham_corpus).most_common(30),columns=['word','count'])
# x and y must be passed by keyword: seaborn >= 0.12 rejects them as positionals.
sea.barplot(x=top_ham_words['word'],y=top_ham_words['count'])
plt.xticks(rotation='vertical')
plt.show()

## 4. Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer()
tfidf=TfidfVectorizer(max_features=3000)

In [None]:
X=tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#scaler=MinMaxScaler()
#X=scaler.fit_transform(X)

In [None]:
X

In [None]:
X.shape

In [None]:
y=df['target'].values

In [None]:
y

In [None]:
df['target']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [None]:
gnb.fit(X_train,y_train)
y_pred1=gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

In [None]:
mnb.fit(X_train,y_train)
y_pred2=mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
bnb.fit(X_train,y_train)
y_pred3=bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

In [None]:
#tfidf --> MNB(Multinomial Naive Bayes)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
svc=SVC(kernel='sigmoid', gamma=1.0) 
knc=KNeighborsClassifier()
mnb=MultinomialNB()
dtc=DecisionTreeClassifier(max_depth=5)
lrc=LogisticRegression(solver='liblinear', penalty='l1')
rfc=RandomForestClassifier(n_estimators=50, random_state=2) 
abc=AdaBoostClassifier(n_estimators=50, random_state=2)
bc=BaggingClassifier(n_estimators=50, random_state=2) 
etc=ExtraTreesClassifier(n_estimators=50, random_state=2) 
gbdt=GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb=XGBClassifier(n_estimators=50, random_state=2)

In [None]:
clfs={
    'SVC':svc,
    'KN':knc,
    'NB':mnb,
    'DT':dtc,
    'LR':lrc,
    'RF':rfc,
    'Adaboost':abc,
    'BgC':bc,
    'ETC':etc,
    'GBDT':gbdt,
    'XGB':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit `clf` on the training split and score it on the test split.

    Returns a tuple ``(accuracy, precision)`` obtained by passing `y_test` and
    the classifier's predictions for `X_test` to accuracy_score and
    precision_score.
    """
    clf.fit(X_train,y_train)
    predictions=clf.predict(X_test)
    return accuracy_score(y_test,predictions),precision_score(y_test,predictions)

In [None]:
train_classifier(svc,X_train,y_train,X_test,y_test)

In [None]:
# Train and score every classifier in `clfs`, collecting the metrics in the
# same order as the dictionary so they can be paired with clfs.keys() later.
accuracy_scores,precision_scores=[],[]
for label,estimator in clfs.items():
    acc,prec=train_classifier(estimator,X_train,y_train,X_test,y_test)
    accuracy_scores.append(acc)
    precision_scores.append(prec)

    print("For ",label)
    print("Accuracy: ",acc)
    print("Precision: ",prec)

In [None]:
performance_df=pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

In [None]:
performance_df

In [None]:
performance_df1=pd.melt(performance_df,id_vars="Algorithm")

In [None]:
performance_df1

In [None]:
# temp_df is otherwise first assigned in a later cell, so on a fresh kernel this
# cell raised NameError; build it here before merging.
temp_df=pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores})
performance_df.merge(temp_df,on="Algorithm")

In [None]:
#model improvement
#1. Changing max_features parameter of TfIdf

In [None]:
temp_df=pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores})

In [None]:
performance_df.merge(temp_df,on='Algorithm')

In [None]:
import joblib
joblib.dump(tfidf,'vectorizer.pkl')

In [None]:
import sklearn
sklearn.__version__

In [None]:
joblib.dump(mnb,'model.pkl')

In [None]:
import joblib
# Reload the artifacts written by the joblib.dump calls earlier in this notebook,
# using the same relative file names rather than one machine's absolute paths.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced.
tfidf = joblib.load('vectorizer.pkl')
model = joblib.load('model.pkl')

In [None]:
tfidf


In [None]:
model

In [None]:
def transform_text(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Steps: lower-case the text, tokenize it with nltk.word_tokenize, keep only
    alphanumeric tokens, drop English stop words and punctuation, then stem each
    remaining token with the module-level `stemmer`.

    `stemmer` is assigned in a separate cell of this notebook and must exist by
    the time this function is called.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ("" if none survive).
    """
    # Load the stop-word list once per call rather than once per token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # isalnum must be *called*: the bare method object is always truthy, which
    # previously let every token through this filter.
    kept = [
        tok for tok in tokens
        if tok.isalnum() and tok not in stop_words and tok not in string.punctuation
    ]

    return " ".join(stemmer.stem(tok) for tok in kept)

In [None]:
import streamlit as st
import string
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [None]:
st.title("E-mail/SMS Spam Classifier")

inputtxt = st.text_area("Enter the contents of the e-mail to detect:")

if st.button('Predict'):
    # 1. Preprocess
    transformed_text = transform_text(inputtxt)
    # 2. Vectorize with the already-fitted vocabulary. Use transform, not
    #    fit_transform: refitting on this single message would replace the
    #    vocabulary learned in training, so the features would no longer match
    #    what the model expects.
    vectored_text = tfidf.transform([transformed_text])
    # 3. Predict
    result = model.predict(vectored_text)[0]
    # 4. Display (label 1 is shown as spam, anything else as not-spam)
    if result == 1:
        st.header("Spam")
    else:
        st.header("Not-Spam")