# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set_style("whitegrid")

# Reading Dataset

In [None]:
# Load the SMS spam collection CSV, decoding it as latin-1, and preview it.
df = pd.read_csv(
    filepath_or_buffer=r'../input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)
df.head()

In [None]:
# Find missing values: draw the boolean null-mask of df as a heatmap.
null_mask = df.isnull()
plt.figure(figsize=(12, 8))
sns.heatmap(
    null_mask,
    cmap='viridis',
    yticklabels=False,
    cbar=False,
)

In [None]:
# Discard every column that holds at least one missing value, then name the
# two remaining columns.
df = df.dropna(axis='columns', how='any')
df.columns = ['label', 'message']
df.head()

# Exploratory Data Analysis

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Summary statistics computed separately for each value of `label`.
df.groupby(by='label').describe()

In [None]:
# Bar chart of how many messages carry each label.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`, so the positional form is unreliable.
sns.countplot(x='label', data=df)

The dataset is imbalanced, so the F1 score is a more suitable evaluation metric than plain accuracy.

In [None]:
# Encode the text label as an integer column: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_to_num)
df.head()

## Visualizing Most Repeated Words using WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS
# Stopwords to leave out of the word clouds.
# NOTE: the name `stopwords` is re-bound further down the notebook by
# `from nltk.corpus import stopwords`, so word_cloud() keeps a private copy
# and does not depend on which cell ran last.
stopwords = set(STOPWORDS)
_CLOUD_STOPWORDS = set(stopwords)


def word_cloud(data, title=None):
    """Draw a word cloud of the most frequent words in ``data``.

    Parameters
    ----------
    data : str or iterable
        Either one string, or an iterable of items (for example a pandas
        Series of messages). Items are converted with ``str`` and joined
        with spaces before the cloud is generated.
    title : str, optional
        Figure title. When omitted the cloud is drawn without a title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Join the actual items. str(series) would only give pandas' truncated
    # repr (a few head/tail rows plus index and dtype footer), not the text.
    if isinstance(data, str):
        text = data
    else:
        text = ' '.join(str(item) for item in data)

    cloud = WordCloud(background_color='black',
                      stopwords=_CLOUD_STOPWORDS,
                      max_words=200,
                      max_font_size=40,
                      scale=3).generate(text)

    fig = plt.figure(figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        # Kept from the original layout; NOTE(review): top > 1 stretches the
        # subplot area past the figure edge - confirm this is still wanted.
        fig.subplots_adjust(top=2.25)
    # Always render the image; previously this only ran when a title was given.
    plt.imshow(cloud)
    plt.show()

In [None]:
# Word cloud built from the messages labelled as spam (label_num == 1).
word_cloud(
    df.loc[df['label_num'] == 1, 'message'],
    'Most Repeated words in spam messages',
)

In [None]:
# Word cloud built from the messages labelled as ham (label_num == 0).
word_cloud(
    df.loc[df['label_num'] == 0, 'message'],
    'Most Repeated words in Ham messages',
)

## Number of characters in an SMS

In [None]:
# Side-by-side histograms of message length in characters, one per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
char_panels = (
    (ax1, 1, 'red', 'spam messages'),
    (ax2, 0, 'green', 'Ham messages'),
)
for axis, class_flag, colour, heading in char_panels:
    sms_len = df[df['label_num'] == class_flag]['message'].str.len()
    axis.hist(sms_len, color=colour)
    axis.set_title(heading)
fig.suptitle('Characters in sms')

Through just basic EDA we've been able to discover a trend that spam messages tend to have more characters.

In [None]:
# Side-by-side histograms of whitespace-separated word counts, one per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
word_panels = (
    (ax1, 1, 'red', 'Spam Messages'),
    (ax2, 0, 'green', 'Ham Messages'),
)
for axis, class_flag, colour, heading in word_panels:
    sms_words = df[df['label_num'] == class_flag]['message'].str.split().map(len)
    axis.hist(sms_words, color=colour)
    axis.set_title(heading)
fig.suptitle('Words in a Sms')

In [None]:
# Summary statistics restricted to the rows labelled 'ham'.
df.loc[df['label'] == 'ham'].describe()

In [None]:
# Summary statistics restricted to the rows labelled 'spam'.
df.loc[df['label'] == 'spam'].describe()

In [None]:
import string
from nltk.corpus import stopwords

# Informal SMS tokens treated as stopwords in addition to NLTK's English list.
_EXTRA_STOPWORDS = ('u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure')
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Filled on first use so the NLTK corpus is read once, not once per message.
_stopword_cache = None


def _stopword_set():
    """Return the cached set of lower-case stopwords, building it on first use."""
    global _stopword_cache
    if _stopword_cache is None:
        _stopword_cache = (frozenset(stopwords.words('english'))
                           | frozenset(_EXTRA_STOPWORDS))
    return _stopword_cache


def text_process(mess):
    """
    Clean one message and return it as a single string.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in ``string.punctuation``)
    2. Remove all stopwords (compared case-insensitively)
    3. Return the remaining words joined by single spaces

    Parameters
    ----------
    mess : str
        The raw message text.

    Returns
    -------
    str
        The cleaned text. Note this is a string, not a list of tokens; the
        original casing of the kept words is preserved.
    """
    # Strip punctuation in one pass.
    nopunc = mess.translate(_PUNCTUATION_TABLE)

    # Set membership keeps the per-word stopword check constant-time.
    blocked = _stopword_set()
    return ' '.join(word for word in nopunc.split() if word.lower() not in blocked)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Run every message through text_process and keep the result in a new column.
cleaned_messages = df['message'].apply(text_process)
df['clean_msg'] = cleaned_messages
df.head()

In [None]:
# Features are the cleaned messages; the target is the numeric label.
X, y = df['clean_msg'], df['label_num']
for part in (X, y):
    print(part.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the data for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# Model Building Using Pipelines

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.pipeline import Pipeline
# Logistic-regression pipeline: word counts -> TF-IDF weights -> classifier.
# text_process returns a cleaned *string*, so it is plugged in as the
# `preprocessor`. Used as `analyzer`, its return value is iterated directly
# and every single character would become a feature instead of every word.
pipeline_lr = Pipeline([
    ('bow1', CountVectorizer(preprocessor=text_process)),
    ('tfidf1', TfidfTransformer()),
    ('lr_classifier', LogisticRegression(random_state=0)),
])

In [None]:
# Decision-tree pipeline: word counts -> TF-IDF weights -> classifier.
# text_process returns a cleaned *string*, so it is plugged in as the
# `preprocessor`. Used as `analyzer`, its return value is iterated directly
# and every single character would become a feature instead of every word.
# random_state is fixed so repeated runs give the same tree.
pipeline_dt = Pipeline([
    ('bow2', CountVectorizer(preprocessor=text_process)),
    ('tfidf2', TfidfTransformer()),
    ('dt_classifier', DecisionTreeClassifier(random_state=0)),
])

In [None]:
# Random-forest pipeline: word counts -> TF-IDF weights -> classifier.
# text_process returns a cleaned *string*, so it is plugged in as the
# `preprocessor`. Used as `analyzer`, its return value is iterated directly
# and every single character would become a feature instead of every word.
# random_state is fixed so repeated runs give the same forest.
pipeline_rf = Pipeline([
    ('bow3', CountVectorizer(preprocessor=text_process)),
    ('tfidf3', TfidfTransformer()),
    ('rf_classifier', RandomForestClassifier(random_state=0)),
])

In [None]:
# Multinomial naive-Bayes pipeline: word counts -> TF-IDF weights -> classifier.
# text_process returns a cleaned *string*, so it is plugged in as the
# `preprocessor`. Used as `analyzer`, its return value is iterated directly
# and every single character would become a feature instead of every word.
pipeline_nb = Pipeline([
    ('bow4', CountVectorizer(preprocessor=text_process)),
    ('tfidf4', TfidfTransformer()),
    ('naive_classifier', MultinomialNB()),
])

In [None]:
# Support-vector pipeline: word counts -> TF-IDF weights -> classifier.
# text_process returns a cleaned *string*, so it is plugged in as the
# `preprocessor`. Used as `analyzer`, its return value is iterated directly
# and every single character would become a feature instead of every word.
pipeline_svm = Pipeline([
    ('bow5', CountVectorizer(preprocessor=text_process)),
    ('tfidf5', TfidfTransformer()),
    ('svm_classifier', SVC()),
])

In [None]:
# Collect the pipelines in one list; the position of each entry is used as
# its key when looking up the display name later.
pipelines = [
    pipeline_lr,
    pipeline_dt,
    pipeline_rf,
    pipeline_nb,
    pipeline_svm,
]

In [None]:
# Starting values for the best-model search: score, list index, pipeline.
best_score, best_classifier, best_pipeline = 0.0, 0, ""

In [None]:
# Display name for each pipeline, keyed by its index in `pipelines`.
pipe_dict = dict(enumerate([
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Naive-Baies Classifier',
    'SVM Classifier',
]))

# Train every pipeline on the training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score
# Print the macro-averaged F1 score of each fitted pipeline on the test split.
for position, fitted in enumerate(pipelines):
    display_name = pipe_dict[position]
    macro_f1 = f1_score(y_test, fitted.predict(X_test), average='macro')
    print("{} Test F1_Score: {}".format(display_name, macro_f1))

In [None]:
# Pick the pipeline with the highest macro-averaged F1 score on the test split.
# Ranking uses f1_score rather than model.score(): for these classifiers
# .score() is plain accuracy, which would contradict the printed message.
# The trackers are reset here so re-running this cell gives the same answer.
best_score = 0.0
best_classifier = 0
best_pipeline = ""
for i, model in enumerate(pipelines):
    # Predict once per model and reuse the score.
    model_f1 = f1_score(y_test, model.predict(X_test), average='macro')
    if model_f1 > best_score:
        best_score = model_f1
        best_pipeline = model
        best_classifier = i
print('Classifier with best f1_score:{}'.format(pipe_dict[best_classifier]))