In [1]:
# imports
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import strip_numeric
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# loading dataset
df = pd.read_csv('/content/spam_dataset.csv')
df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# getting info of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   target   5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
# mapping values in target
df['target'] = df['target'].map({'ham':0, 'spam':1})
df['target'].head()

0    0
1    0
2    1
3    0
4    0
Name: target, dtype: int64

In [5]:
# checking whether dataset is imbalanced
df['target'].value_counts()

0    4825
1     747
Name: target, dtype: int64

In [6]:
# creating X & y
X = df['message']
y = df['target']

In [7]:
# removing digits
X = X.apply(strip_numeric)

In [8]:
# splitting data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=18)

In [9]:
# getting shapes of splits
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3900,)
(3900,)
(1672,)
(1672,)


In [10]:
# setting stopwords
stopwords = list(stopwords.words('english'))

In [11]:
# tfidf
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [12]:
# getting summary of vectors; size & content
print('Vocabulary size:', len(tfidf.vocabulary_))
print('Vocabulary content:', tfidf.vocabulary_)

Vocabulary size: 2921


In [13]:
# making df of vectors
pd.DataFrame(data=X_train.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,____,aathi,abi,ability,abiola,abj,able,absolutly,abt,abta,...,yummy,yun,yup,zed,zoe,åð,ì_,ìï,û_,ûò
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.446282,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.599649,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
3896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
3897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
3898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


In [14]:
# before sampling count
print('Before sampling:', Counter(y_train))

Before sampling: Counter({0: 3367, 1: 533})


In [15]:
# handling imbalance
sampler = SMOTE()
X_train, y_train = sampler.fit_resample(X_train, y_train) 

In [16]:
# after sampling count
print('After sampling:', Counter(y_train))

After sampling: Counter({0: 3367, 1: 3367})


In [17]:
# model training 
model = XGBClassifier(random_state=18)
model.fit(X_train, y_train)

In [18]:
# checking accuracy
print('Training Accuracy:', (accuracy_score(y_train, model.predict(X_train)) * 100).round(2))
print('Testing Accuracy:', (accuracy_score(y_test, model.predict(X_test)) * 100).round(2))

Training Accuracy: 99.64
Testing Accuracy: 97.19


In [19]:
# cross validation
score = cross_val_score(model, X_train, y_train)
print('CV score:', (np.mean(score) * 100).round(2))

CV score: 98.11


In [20]:
# confusion matrix
confusion_matrix(y_test, model.predict(X_test))

array([[1444,   14],
       [  33,  181]])

In [21]:
# clf report
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1458
           1       0.93      0.85      0.89       214

    accuracy                           0.97      1672
   macro avg       0.95      0.92      0.93      1672
weighted avg       0.97      0.97      0.97      1672

