In [1]:
# Data manipulation
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from scipy.stats import skew
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_table('smsspamcollection.tsv')

In [3]:
data.shape

(5572, 4)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   length   5572 non-null   int64 
 3   punct    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.2+ KB


In [5]:
X = data['message']
y = data.label

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
X = vector.fit_transform(X).toarray()

In [13]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=3, train_size=0.3)

In [14]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report

def select_model_predict(model):
    
    # Training the model
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    
    #Checking Bias-variance tradeoff
    train_accuracy = model.score(xtrain, ytrain)
    test_accuracy = model.score(xtest, ytest)
    
    #Converting Accuracy score into dataframe
    dataframe = pd.DataFrame(
        [train_accuracy*100, test_accuracy*100], 
        index=['Train Accuracy', 'Test Accuracy'], 
        columns=['Value'])
    print(dataframe)
    
    # Classification Report for the model
    report = classification_report(ytest, ypred)
    print(report)
    
    return model

In [15]:
gnb = select_model_predict(GaussianNB())
mnb = select_model_predict(MultinomialNB())
bnb = select_model_predict(BernoulliNB())

                    Value
Train Accuracy  97.067624
Test Accuracy   91.796975
              precision    recall  f1-score   support

         ham       0.98      0.92      0.95      3371
        spam       0.64      0.89      0.75       530

    accuracy                           0.92      3901
   macro avg       0.81      0.91      0.85      3901
weighted avg       0.94      0.92      0.92      3901

                    Value
Train Accuracy  98.982645
Test Accuracy   97.974878
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99      3371
        spam       0.95      0.90      0.92       530

    accuracy                           0.98      3901
   macro avg       0.97      0.95      0.96      3901
weighted avg       0.98      0.98      0.98      3901

                    Value
Train Accuracy  94.673848
Test Accuracy   93.258139
              precision    recall  f1-score   support

         ham       0.93      1.00      0.96      3371
   