![](https://raw.githubusercontent.com/divyanshugit/spam-analyzer/master/static/img.png)

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:black; background:#D4E6E5; border:0' role="tab" aria-controls="home"><center>Quick Navigation</center></h3>

* [1. About](#0)  
    
* [2. Data Preprocessing ](#2)

* [3. Encoding ](#3)

* [4. Model Selection ](#4)
    
* [5. Saving Model](#7)
    
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:white; background:#112; border:0' role="tab" aria-controls="home"><center>Do Upvote it if you find it interesting/useful 🤗</center></h3>

<a id="0"></a>

<font color='cadetblue'>About:</font>
---
In this EDA walkthrough you will learn how to preprocess and visualize the data. This kernel covers the basics of building a **spam analyzer (filter) model**.

You can check out the original source of the dataset at this [link](https://www.kaggle.com/uciml/sms-spam-collection-dataset).

<font color='red'>Hoping that you will love it.</font>
---

In [None]:
import string
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import wordcloud
from PIL import Image
from collections import Counter

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

<a id="2"></a>
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:black; background:#D4E6E5;border:0' role="tab" aria-controls="home"><center>Data Preprocessing</center></h3>

In [None]:

# Load the raw SMS spam CSV from the Kaggle input directory.
# NOTE(review): this file is often read with encoding="latin-1"; confirm that the
# default UTF-8 decoding succeeds in the target environment.
df = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv")

In [None]:
# Preview the first rows of the raw frame to inspect its columns.
df.head()

In [None]:
# Remove the three "Unnamed" columns and give the two remaining ones readable names.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
readable_names = {"v1": "label", "v2": "message"}
df = df.drop(columns=unused_columns).rename(columns=readable_names)
df.head()

In [None]:
# Show column dtypes and non-null counts for the cleaned frame.
df.info()

In [None]:
# Summary statistics of the remaining columns, computed separately for each label.
df.groupby("label").describe()

<h4>From this we can see that the dataset contains 4825 ham and 747 spam messages, and that in both classes some messages appear more than once.</h4>

In [None]:
# Bar chart of how many messages carry each label. The figure gets a title and
# axis labels so it can stand alone, and the trailing ';' hides the Axes repr.
ax = df["label"].value_counts().plot.bar()
ax.set(title="Messages per label", xlabel="label", ylabel="count");

Labeling for Classifiers:
---
Machine learning algorithms are mathematical functions: they perform numerical operations on their inputs and produce numerical outputs as a result. So we need to give each message a numeric label that the classifiers can interpret.


In [None]:
# Encode the text label as an integer target column: spam -> 1, ham -> 0.
label_codes = {'spam': 1, 'ham': 0}
df['spam'] = df['label'].map(label_codes).astype(int)
df.head()

In [None]:
# Store the character count of every message in a new "length" column.
df["length"] = df["message"].map(len)
df.head()

In [None]:
# Split the frame by class. Each subset is an explicit copy so that later writes
# to ham_df / spam_df are unambiguous and do not raise SettingWithCopyWarning.
ham_df = df[df["spam"] == 0].copy()
spam_df = df[df["spam"] == 1].copy()

In [None]:
def word_cloud(df, title):
    """
    Draw a word cloud built from every message in `df`.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'message' column; its values are cast to str and
        joined with single spaces to form the text.
    title : str
        Title shown above the figure.

    The cloud is drawn on a new matplotlib figure; nothing is returned.
    """
    corpus = ' '.join(df['message'].astype(str))
    figure_background = '#353b47'

    cloud = wordcloud.WordCloud(
        width=1000,
        height=600,
        background_color="black",
        stopwords=set(wordcloud.STOPWORDS),
        max_font_size=160,
        margin=0,
    )
    cloud_image = cloud.generate(corpus)

    plt.figure(figsize=(10, 6), facecolor=figure_background)
    plt.imshow(cloud_image)
    plt.axis('off')
    plt.title(title, fontsize=35, color="cadetblue")
    plt.tight_layout(pad=2)

In [None]:
# Word cloud of the messages in the ham subset.
word_cloud(ham_df, "Ham Messages")

In [None]:
# Word cloud of the messages in the spam subset.
word_cloud(spam_df,"Spam Messages")

In [None]:
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the list only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def ps_remover(input_data, stop_words=None):
    """
    Strip punctuation and stopwords from a text and return its lower-cased words.

    Parameters
    ----------
    input_data : str
        Text to clean. Every character found in `string.punctuation` is
        removed before the text is split on whitespace.
    stop_words : container of str, optional
        Lower-case words to discard. Defaults to NLTK's English stopword
        list, which is loaded once and reused on later calls instead of
        being re-read for every word.

    Returns
    -------
    list of str
        The remaining words, lower-cased, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    without_punctuation = "".join(ch for ch in input_data if ch not in string.punctuation)
    lowered_words = (word.lower() for word in without_punctuation.split())
    return [word for word in lowered_words if word not in stop_words]


In [None]:
# Replace each subset's raw text with its cleaned token list. `assign` builds a
# new frame instead of writing through `.loc` on a filtered frame, which avoids
# SettingWithCopyWarning.
ham_df = ham_df.assign(message=ham_df['message'].apply(ps_remover))
ham_words = ham_df['message'].tolist()
spam_df = spam_df.assign(message=spam_df['message'].apply(ps_remover))
spam_words = spam_df['message'].tolist()

In [None]:
# Flatten the per-message token lists and count how often each word occurs.
ham_words_list = [word for tokens in ham_words for word in tokens]
c_ham = Counter(ham_words_list)
top30_ham_words = pd.DataFrame(c_ham.most_common(30), columns=['word', 'count'])

# Bar chart of the 30 most frequent ham words. The trailing ';' suppresses the
# tick-label repr, matching the spam-word cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='word', y='count', data=top30_ham_words, ax=ax)
plt.title("Top 30 Ham words")
plt.xticks(rotation='vertical');

In [None]:
# Flatten the per-message token lists into one list of spam words and count them.
spam_words_list = [word for tokens in spam_words for word in tokens]
c_spam = Counter(spam_words_list)
top30_spam_words = pd.DataFrame(c_spam.most_common(30), columns=['word', 'count'])

# Bar chart of the 30 most frequent spam words.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top30_spam_words, x='word', y='count', ax=ax)
ax.set_title("Top 30 Spam words")
plt.xticks(rotation='vertical');

In [None]:
# Distribution of the mean word length per message, one panel per class.
# `sns.distplot` is deprecated, so each panel uses `sns.histplot` with a KDE
# overlay and density scaling. Both panels are drawn by the same loop instead
# of two copy-pasted blocks.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
panels = (
    (ax1, 1, 'red', 'Spam'),
    (ax2, 0, 'green', 'Ham'),
)
for axis, spam_flag, colour, panel_title in panels:
    mean_word_length = (
        df.loc[df['spam'] == spam_flag, 'message']
        .str.split()
        .map(lambda words: np.mean([len(w) for w in words]))
    )
    sns.histplot(mean_word_length, kde=True, stat='density', ax=axis, color=colour)
    axis.set_title(panel_title)
    axis.set_xlabel('mean word length (characters)')
fig.suptitle('Average word length in each text');

<a id="3"></a>
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:black; background:#D4E6E5; border:0' role="tab" aria-controls="home"><center>Encoding</center></h3>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words vocabulary from every message, using ps_remover as the
# analyzer that turns one message into its list of tokens.
bow_transformer = CountVectorizer(analyzer=ps_remover)
bow_transformer = bow_transformer.fit(df['message'])

In [None]:
# Number of distinct tokens in the fitted vocabulary.
print(len(bow_transformer.vocabulary_))

In [None]:
# Turn every message into its bag-of-words count vector.
bow_data = bow_transformer.transform(df['message'])

In [None]:
# (number of messages, vocabulary size)
bow_data.shape

In [None]:
# Count of stored (non-zero) entries in the sparse count matrix.
bow_data.nnz

In [None]:
# Share of non-zero cells in the bag-of-words matrix, as a percentage.
n_rows, n_cols = bow_data.shape
density = bow_data.nnz / (n_rows * n_cols)
print(density * 100)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Fit the TF-IDF weighting on the bag-of-words counts of all messages.
tfidf_transformer = TfidfTransformer().fit(bow_data)

In [None]:
# Print the TF-IDF weights of a single message (row label 4, used as the ham example).
sample_ham = df.loc[4, "message"]
bow_sample_ham = bow_transformer.transform([sample_ham])
tfidf_sample_ham = tfidf_transformer.transform(bow_sample_ham)
print(tfidf_sample_ham)

In [None]:
# Print the TF-IDF weights of a single message (row label 8, used as the spam example).
sample_spam = df.loc[8, "message"]
bow_sample_spam = bow_transformer.transform([sample_spam])
tfidf_sample_spam = tfidf_transformer.transform(bow_sample_spam)
print(tfidf_sample_spam)

In [None]:
# Apply the fitted TF-IDF weighting to the full bag-of-words matrix.
data_tfidf = tfidf_transformer.transform(bow_data)

In [None]:
# Display the summary repr of the TF-IDF matrix.
data_tfidf

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the TF-IDF rows for testing; the fixed random_state makes the
# split reproducible.
tfidf_train_data, tfidf_test_data, label_train, label_test = \
    train_test_split(data_tfidf, df["spam"], test_size=0.25, random_state=5)

In [None]:
# A notebook cell only renders its final bare expression, so the train matrix was
# never shown. `display` renders both.
display(tfidf_train_data, tfidf_test_data)

In [None]:
from scipy.sparse import  hstack
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Append the message length as one extra column after the TF-IDF features and
# convert to a dense array. `.toarray()` replaces the deprecated `.A` shortcut.
X2 = hstack((data_tfidf, np.array(df['length'])[:, None])).toarray()
X2_train, X2_test, y2_train, y2_test = \
    train_test_split(X2, df["spam"], test_size=0.25, random_state=5)

# Convert the TF-IDF-only split to dense arrays. The hasattr guard keeps the cell
# re-runnable once these names already hold dense arrays.
if hasattr(tfidf_train_data, "toarray"):
    tfidf_train_data = tfidf_train_data.toarray()
if hasattr(tfidf_test_data, "toarray"):
    tfidf_test_data = tfidf_test_data.toarray()

# Each scaler is fitted on the training rows only and then applied to the test
# rows. Each keeps its own name so the fitted object stays available afterwards.
tfidf_scaler = MinMaxScaler()
tfidf_train_sc_data = tfidf_scaler.fit_transform(tfidf_train_data)
tfidf_test_sc_data = tfidf_scaler.transform(tfidf_test_data)

# Take the column split from the data instead of a hard-coded vocabulary size:
# the first `n_tfidf_features` columns are TF-IDF weights, the next is the length.
n_tfidf_features = data_tfidf.shape[1]
X2_tfidf_train = X2_train[:, :n_tfidf_features]
X2_tfidf_test = X2_test[:, :n_tfidf_features]
X2_length_train = X2_train[:, n_tfidf_features]
X2_length_test = X2_test[:, n_tfidf_features]

X2_tfidf_scaler = MinMaxScaler()
X2_tfidf_train = X2_tfidf_scaler.fit_transform(X2_tfidf_train)
X2_tfidf_test = X2_tfidf_scaler.transform(X2_tfidf_test)

# MinMaxScaler expects 2-D input, hence the reshape of the 1-D length column.
X2_length_scaler = MinMaxScaler()
X2_length_train = X2_length_scaler.fit_transform(X2_length_train.reshape(-1, 1))
X2_length_test = X2_length_scaler.transform(X2_length_test.reshape(-1, 1))

# Backward compatibility: the original cell left `scaler` bound to the length scaler.
scaler = X2_length_scaler

X2_train = np.hstack((X2_tfidf_train, X2_length_train))
X2_test = np.hstack((X2_tfidf_test, X2_length_test))

<a id="4"></a>
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:black; background:#D4E6E5; border:0' role="tab" aria-controls="home"><center>Model Selection</center></h3>

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate classifiers as (short name, unfitted estimator) pairs, all with
# default hyper-parameters.
models = [
    ("LR", LogisticRegression()),
    ("NB", GaussianNB()),
    ("MNB", MultinomialNB()),
    ("Dtree", DecisionTreeClassifier()),
    ("KNN", KNeighborsClassifier()),
]

In [None]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [None]:
# Print the 5-fold cross-validated accuracy of every candidate on the training split.
for name, model in models:
    kfold = KFold(n_splits=5)
    cv_result = cross_val_score(
        estimator=model,
        X=X2_train,
        y=y2_train,
        scoring="accuracy",
        cv=kfold,
    )
    print(name, cv_result)

In [None]:
# Fit the final multinomial naive Bayes model on the scaled TF-IDF + length training features.
model_MNB = MultinomialNB().fit(X2_train, y2_train)

<a id="7"></a>
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style=' color:black; background:#D4E6E5; border:0' role="tab" aria-controls="home"><center>Saving Models</center></h3>

In [None]:
import pickle

# Persist the fitted classifier. The context manager guarantees that the file
# handle is flushed and closed, which the bare open() call never did.
# NOTE(review): only the classifier is saved. Scoring new messages also needs the
# fitted vectorizer, TF-IDF transformer and scalers, so consider saving those too.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_MNB, model_file)