In [1]:
# Installing necessary dependecies
!pip install pycaret

import pandas as pd # Pandas dataFrame
import numpy as np # Linear algebra 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from pycaret.utils import enable_colab
enable_colab()

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/a2/7b/70e41d8aa900ed47e0e2ac6a8f5cbaf9e359efdf8ae10bf89502c14ce3ed/pycaret-2.2.3-py3-none-any.whl (249kB)
[K     |█▎                              | 10kB 17.0MB/s eta 0:00:01[K     |██▋                             | 20kB 11.0MB/s eta 0:00:01[K     |████                            | 30kB 8.6MB/s eta 0:00:01[K     |█████▎                          | 40kB 8.0MB/s eta 0:00:01[K     |██████▌                         | 51kB 5.2MB/s eta 0:00:01[K     |███████▉                        | 61kB 5.9MB/s eta 0:00:01[K     |█████████▏                      | 71kB 5.9MB/s eta 0:00:01[K     |██████████▌                     | 81kB 6.5MB/s eta 0:00:01[K     |███████████▉                    | 92kB 6.4MB/s eta 0:00:01[K     |█████████████                   | 102kB 6.6MB/s eta 0:00:01[K     |██████████████▍                 | 112kB 6.6MB/s eta 0:00:01[K     |███████████████▊                | 122kB 6.6MB/s eta 0

In [2]:
# Import dataset from the link below
df = pd.read_csv('https://raw.githubusercontent.com/PannaD8ta/Spam_Classifier_NLP_PyCaret/master/SMS_Spam_Ham_Raw.csv', sep=',')

# Checks the shape of the data
df.columns

Index(['Flag', 'SMS '], dtype='object')

In [3]:
df.rename(columns={'SMS ':'SMS'}, inplace=True)

In [4]:
display(df.head())
display(df.tail())

Unnamed: 0,Flag,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Unnamed: 0,Flag,SMS
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will Ã¼ b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...
5573,ham,Rofl. Its true to its name


In [5]:
stop_words = stopwords.words('english')

# Import nlp module
from pycaret.nlp import *

# Setting up Environment
# 'df' is a pandas Dataframe and 'SMS' is the name of the column containing text
nlp = setup(data = df, target = 'SMS', custom_stopwords=stop_words, session_id=123)

Description,Value
session_id,123
Documents,5574
Vocab Size,4194
Custom Stopwords,True


In [6]:
# Use of LDA to classify texts in a document to a particular topic - builds a topic per dcocument model and words per topic model.
lda = create_model(model='lda', multi_core=True)

In [7]:
lda_data = assign_model(lda)

In [8]:
lda_data.head()


Unnamed: 0,Flag,SMS,Topic_0,Topic_1,Topic_2,Topic_3,Dominant_Topic,Perc_Dominant_Topic
0,ham,go point crazy available get,0.042327,0.045308,0.043763,0.868603,Topic 3,0.87
1,ham,,0.25,0.25,0.25,0.25,Topic 0,0.25
2,spam,free entry wkly final may text receive entry q...,0.442082,0.262769,0.018066,0.277083,Topic 0,0.44
3,ham,say early hor already say,0.041715,0.042337,0.874085,0.041863,Topic 2,0.87
4,ham,go life around though,0.050465,0.051494,0.336137,0.561903,Topic 3,0.56


In [9]:
evaluate_model(lda)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [10]:
# Credit: https://towardsdatascience.com/topic-modeling-articles-with-nmf-8c6b2a227a45
# Non-Negative Matrix Factorization (NMF)
nmf = create_model(model='nmf', multi_core=True)

In [11]:
nmf_data = assign_model(nmf)

**Note:** Next, drop columns (SMS, Dominant Topics, and Perc_Dominant_Topic) for both LDA and NMF dataframes

In [12]:
lda_data.columns

Index(['Flag', 'SMS', 'Topic_0', 'Topic_1', 'Topic_2', 'Topic_3',
       'Dominant_Topic', 'Perc_Dominant_Topic'],
      dtype='object')

In [13]:
lda_data.drop(['SMS', 'Dominant_Topic', 'Perc_Dominant_Topic'], axis=1, inplace=True)
lda_data.head()

Unnamed: 0,Flag,Topic_0,Topic_1,Topic_2,Topic_3
0,ham,0.042327,0.045308,0.043763,0.868603
1,ham,0.25,0.25,0.25,0.25
2,spam,0.442082,0.262769,0.018066,0.277083
3,ham,0.041715,0.042337,0.874085,0.041863
4,ham,0.050465,0.051494,0.336137,0.561903


In [14]:
nmf_data.columns

Index(['Flag', 'SMS', 'Topic_0', 'Topic_1', 'Topic_2', 'Topic_3',
       'Dominant_Topic', 'Perc_Dominant_Topic'],
      dtype='object')

In [15]:
nmf_data.drop(['SMS', 'Dominant_Topic', 'Perc_Dominant_Topic'], axis=1, inplace=True)
nmf_data.head()

Unnamed: 0,Flag,Topic_0,Topic_1,Topic_2,Topic_3
0,ham,0.000664,0.049586,0.002396,0.002886
1,ham,0.0,0.0,0.0,0.0
2,spam,0.001406,0.000484,0.000686,0.001194
3,ham,0.001593,0.003458,0.002559,0.004287
4,ham,0.0,0.059114,0.0,0.0


**Stage 3a:** Model Building (LDA)




In [16]:
from pycaret.classification import *

In [18]:
pce1 = setup(data = lda_data, target = 'Flag', session_id = 321, train_size = 0.85)

Unnamed: 0,Description,Value
0,session_id,321
1,Target,Flag
2,Target Type,Binary
3,Label Encoded,"ham: 0, spam: 1"
4,Original Data,"(5574, 5)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [19]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9367,0.9404,0.731,0.802,0.7634,0.727,0.7289,1.903
rf,Random Forest Classifier,0.9354,0.9399,0.737,0.7872,0.7604,0.7231,0.7242,0.933
lightgbm,Light Gradient Boosting Machine,0.9344,0.9371,0.7055,0.8024,0.7499,0.7124,0.7149,0.117
et,Extra Trees Classifier,0.9299,0.9338,0.716,0.7684,0.7403,0.6999,0.7011,0.583
catboost,CatBoost Classifier,0.9289,0.9376,0.6601,0.7991,0.7212,0.6809,0.686,3.351
gbc,Gradient Boosting Classifier,0.9215,0.929,0.6102,0.7819,0.684,0.64,0.6473,0.526
dt,Decision Tree Classifier,0.9168,0.8365,0.74,0.692,0.7133,0.6648,0.6666,0.034
ada,Ada Boost Classifier,0.9113,0.9135,0.5588,0.7481,0.6348,0.586,0.5967,0.221
knn,K Neighbors Classifier,0.9063,0.8975,0.5951,0.6925,0.6388,0.5855,0.5884,0.126
ridge,Ridge Classifier,0.8856,0.0,0.2569,0.79,0.384,0.3386,0.4059,0.02


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=321, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)

**Stage 4a:** Hyperparameter Tuning (LDA)

In [20]:
# Step 1: Model Creating using the recommended model above
pce1_model = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9325,0.9374,0.6667,0.8148,0.7333,0.6951,0.6997
1,0.9346,0.9307,0.7121,0.7966,0.752,0.7145,0.716
2,0.9262,0.9413,0.7273,0.7385,0.7328,0.69,0.69
3,0.9325,0.8969,0.7121,0.7833,0.746,0.7072,0.7083
4,0.9241,0.938,0.6667,0.7586,0.7097,0.6662,0.668
5,0.9451,0.9792,0.8358,0.7887,0.8116,0.7795,0.78
6,0.9304,0.9328,0.7164,0.7742,0.7442,0.704,0.7047
7,0.9323,0.9628,0.7273,0.7742,0.75,0.7109,0.7114
8,0.9535,0.9471,0.8333,0.8333,0.8333,0.8063,0.8063
9,0.9429,0.9332,0.7727,0.8095,0.7907,0.7577,0.7579


In [21]:
# Step 2: Model tuning
pce1_tuned = tune_model(pce1_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9051,0.9094,0.4697,0.7561,0.5794,0.5292,0.5483
1,0.9114,0.8907,0.4394,0.8529,0.58,0.5361,0.5731
2,0.8966,0.9022,0.4242,0.7179,0.5333,0.4795,0.5005
3,0.9093,0.8864,0.4697,0.7949,0.5905,0.5432,0.5671
4,0.9346,0.896,0.5758,0.9268,0.7103,0.6757,0.7
5,0.9114,0.9353,0.4925,0.8049,0.6111,0.5644,0.5861
6,0.903,0.9155,0.5075,0.7234,0.5965,0.5433,0.5543
7,0.9091,0.9023,0.5,0.7674,0.6055,0.5567,0.573
8,0.9112,0.9279,0.5303,0.7609,0.625,0.5765,0.5885
9,0.9175,0.9106,0.5152,0.8293,0.6355,0.5919,0.6132


In [22]:
# Step 3: Obtaining insights from model performance
evaluate_model(pce1_tuned)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

**Stage 3b:** Model Building (NMF)


In [23]:
pce2 = setup(data = nmf_data, target = 'Flag', session_id = 321, train_size = 0.85)

Unnamed: 0,Description,Value
0,session_id,321
1,Target,Flag
2,Target Type,Binary
3,Label Encoded,"ham: 0, spam: 1"
4,Original Data,"(5574, 5)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [24]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9333,0.9367,0.6708,0.8205,0.7369,0.6992,0.7046,0.637
xgboost,Extreme Gradient Boosting,0.9322,0.9184,0.686,0.8039,0.7376,0.6991,0.7036,4.571
rf,Random Forest Classifier,0.932,0.9312,0.6753,0.8098,0.7342,0.6957,0.7008,1.038
lightgbm,Light Gradient Boosting Machine,0.9278,0.9226,0.6708,0.7847,0.7217,0.6806,0.6843,0.121
catboost,CatBoost Classifier,0.9179,0.9238,0.5876,0.7758,0.6668,0.621,0.6299,3.517
gbc,Gradient Boosting Classifier,0.9084,0.9118,0.5138,0.7557,0.6096,0.56,0.5745,0.503
dt,Decision Tree Classifier,0.9014,0.8029,0.6888,0.638,0.6605,0.6031,0.6049,0.034
ada,Ada Boost Classifier,0.8993,0.8933,0.4608,0.7205,0.5598,0.5063,0.5236,0.221
knn,K Neighbors Classifier,0.8928,0.8823,0.5468,0.6359,0.5866,0.5255,0.5283,0.134
lr,Logistic Regression,0.8602,0.7841,0.0,0.0,0.0,0.0,0.0,0.307


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=321, verbose=0,
                     warm_start=False)

**Stage 4b:** Hyperparameter Tuning (NMF)


In [25]:
# Step 1: Model Creating using the recommended model above
pce2_model = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9388,0.9259,0.6515,0.8776,0.7478,0.7139,0.7241
1,0.9367,0.9176,0.7121,0.8103,0.7581,0.7218,0.7238
2,0.9409,0.9373,0.7121,0.8393,0.7705,0.7369,0.7401
3,0.9304,0.9135,0.6061,0.8511,0.708,0.6697,0.6822
4,0.9219,0.9497,0.6818,0.7377,0.7087,0.6637,0.6644
5,0.9388,0.9508,0.7015,0.8393,0.7642,0.7294,0.7333
6,0.9156,0.9227,0.5672,0.7755,0.6552,0.6084,0.6181
7,0.9281,0.9354,0.6515,0.7963,0.7167,0.676,0.6804
8,0.9471,0.966,0.7576,0.8475,0.8,0.7697,0.7713
9,0.9345,0.9486,0.6667,0.8302,0.7395,0.7025,0.7081


In [26]:
# Step 2: Model tuning
pce2_tuned = tune_model(pce2_model, optimize='AUC')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.884,0.9188,0.697,0.5679,0.6259,0.558,0.5621
1,0.8565,0.8723,0.6818,0.4891,0.5696,0.4863,0.496
2,0.8776,0.8986,0.7576,0.5435,0.6329,0.5619,0.573
3,0.8776,0.8848,0.6515,0.5513,0.5972,0.5257,0.5282
4,0.8418,0.8941,0.7121,0.4563,0.5562,0.4655,0.4826
5,0.8861,0.9031,0.7463,0.5747,0.6494,0.5827,0.5898
6,0.8671,0.8698,0.7463,0.5208,0.6135,0.5363,0.5489
7,0.871,0.894,0.697,0.5287,0.6013,0.5261,0.5333
8,0.8816,0.9128,0.7879,0.5532,0.65,0.5814,0.5945
9,0.8858,0.9166,0.7879,0.5652,0.6582,0.5919,0.6037


In [27]:
# Step 3: Obtaining insights from model performance
evaluate_model(pce2_tuned)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

**Stage 5:** Save Experiment

In [28]:
save_model(pce1_tuned, 'Experiment_321(lda) 10Jan2021')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Flag',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strateg...
                  RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                         class_weight={}, criterion='gini',
                                         max_depth=3, max_features='log2',
                                         max_leaf_

In [29]:
save_model(pce2_tuned, 'Experiment_321(nmf) 10Jan2021')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Flag',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strateg...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight='balanced_subsample',
                                       criterion='entropy', max_depth=10,
                                       max_features=1.0