# **1. Import necessary libraries**

# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# **2. Import data**

# In [None]:

# Load the SMS spam dataset; the file is decoded as latin-1.
df = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')
# Drop the three extra unnamed columns and keep only the label and message text.
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['label', 'texts']
# Encode the target on the label column only (ham -> 0, spam -> 1).
# A frame-wide replace would also rewrite any message whose entire text is
# "ham" or "spam", and it depends on implicit dtype downcasting to end up
# with an integer column.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
# Message length in characters, computed in one vectorized pass.
df['Count'] = df['texts'].str.len()


# **3. Visualizing the data**

# In [None]:
# Statistical summary of the data
from pandas import set_option
set_option('display.width', 100)
# Use the fully qualified option key: the bare 'precision' pattern can match
# more than one registered option, in which case pandas raises OptionError.
set_option('display.precision', 2)
print(df.shape)
print(df.describe())


# Pie chart of the class balance, with the second slice pulled out slightly.
label_counts = df['label'].value_counts()
label_counts.plot.pie(
    explode=[0, 0.1],
    figsize=(6, 6),
    autopct='%1.1f%%',
    shadow=True,
)
plt.ylabel("Spam vs Ham")
# NOTE(review): the legend order assumes the first slice is ham, i.e. that ham
# is the more frequent class — confirm against the data.
plt.legend(["Ham", "Spam"])
plt.show()

# Bar chart of the class counts, with each bar annotated with its height.
fig = plt.figure()
ax = fig.add_subplot(111)
# Draw on the axes created above explicitly instead of relying on whichever
# axes happens to be current.
df['label'].value_counts().plot(kind='bar', ax=ax, color='#1f77b4', alpha=0.5, grid=True)
ax.set_ylabel('counts')
ax.set_title('ham vs spam counts')
plt.xticks(rotation='horizontal')
ax.xaxis.grid(which="major")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
for p in ax.patches:
    # Place the count at the horizontal centre of the bar, at the bar's top edge.
    ax.annotate(np.round(p.get_height(), decimals=2), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center')
# Render the figure, matching the pie chart; without this nothing is shown
# when the code runs outside a notebook.
plt.show()




# **4. Preprocessing**


# In [None]:
# Ordered (pattern, replacement) rules applied to every message.
# Order matters: e-mail addresses and URLs are rewritten before punctuation is
# stripped, and phone numbers before the generic number rule, so the more
# specific token wins. Patterns are raw strings so that `\b` is a regex word
# boundary rather than a backspace character, and they are compiled once
# instead of once per message.
_NORMALIZATION_RULES = [
    (re.compile(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b'), 'email'),  # e-mail addresses
    (re.compile(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)'), 'http'),  # URLs
    (re.compile(r'£|\$'), 'money'),  # money symbols
    (re.compile(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b'), 'phonenumbr'),  # phone numbers
    (re.compile(r'\d+(\.\d+)?'), 'numbr'),  # remaining numbers
    (re.compile(r'[^\w\d\s]'), ' '),  # punctuation
]


def _normalize_message(text, stemmer, stop_words):
    """Return *text* normalized to a space-joined string of stemmed tokens.

    Each rule in ``_NORMALIZATION_RULES`` is applied to the result of the
    previous one. The text is then lower-cased and split on whitespace,
    tokens found in *stop_words* are dropped, and the rest are stemmed with
    *stemmer*.
    """
    for pattern, replacement in _NORMALIZATION_RULES:
        text = pattern.sub(replacement, text)
    tokens = text.lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


ps = PorterStemmer()  # for normalization
# Build the stop-word set once; rebuilding it for every token is very slow.
english_stop_words = set(stopwords.words('english'))
# Process every row of the frame rather than a hard-coded row count.
corpus = [_normalize_message(text, ps, english_stop_words) for text in df['texts']]
# To use text for predictive modeling it must be turned into numbers;
# CountVectorizer converts a collection of documents into term-count vectors.
cv = CountVectorizer()


# **5. Declare feature vector and target variable**

# In [None]:
# Fit the vectorizer on the cleaned corpus and turn the term counts into a
# dense array for the classifiers fitted on it.
term_matrix = cv.fit_transform(corpus)
x = term_matrix.toarray()
# Target vector: the encoded label column.
y = df['label']
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# **6. Naive Bayes Classifier**

# In [None]:
# Train a Gaussian Naive Bayes model on the training split.
GaussNB = GaussianNB().fit(x_train, y_train)
# Predict the target (spam or not) for the held-out messages.
predict_Gauss = GaussNB.predict(x_test)
# Report accuracy, the confusion matrix and per-class metrics, in that order.
for nb_metric in (accuracy_score, confusion_matrix, classification_report):
    print(nb_metric(y_test, predict_Gauss))

# **7. Logistic Regression Classifier**

# In [None]:
# Train a logistic regression model with class_weight='balanced' on the
# training split.
Logistic = LogisticRegression(class_weight='balanced').fit(x_train, y_train)
# Predict the target for the held-out messages.
predict_Logistic = Logistic.predict(x_test)
# Report accuracy, the confusion matrix and per-class metrics, in that order.
for lr_metric in (accuracy_score, confusion_matrix, classification_report):
    print(lr_metric(y_test, predict_Logistic))