In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from multiprocessing import Pool

In [2]:
df = pd.read_csv('Spam_SMS_Detection_Dataset/spam.csv', encoding='ISO-8859-1',)
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace =True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.columns = ['spam_value', 'SMS']
df.head()

Unnamed: 0,spam_value,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
print(df['spam_value'].value_counts())

spam_value
ham     4825
spam     747
Name: count, dtype: int64


In [7]:
df['spam_value']=df['spam_value'].map({'ham':0, 'spam': 1})
df.head()

Unnamed: 0,spam_value,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemme = WordNetLemmatizer()

stopWords = set(stopwords.words('english'))
punc = set(string.punctuation)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mskie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mskie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mskie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def process_sms(text):
    token = nltk.word_tokenize(text)
    token = [lemme.lemmatize(word.lower()) for word in token if word not in punc and word.lower() not in stopWords]
    clean_text = ' '.join(token)
    return clean_text

df['SMS'] = df['SMS'].apply(process_sms)

In [10]:
print(df.head())

   spam_value                                                SMS
0           0  go jurong point crazy .. available bugis n gre...
1           0                    ok lar ... joking wif u oni ...
2           1  free entry 2 wkly comp win fa cup final tkts 2...
3           0        u dun say early hor ... u c already say ...
4           0            nah n't think go usf life around though


In [11]:
vectorizer = TfidfVectorizer()

X= vectorizer.fit_transform(df["SMS"])
Y = df["spam_value"]

In [12]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 47919 stored elements and shape (5572, 8114)>
  Coords	Values
  (0, 3366)	0.15520006241648393
  (0, 4110)	0.3501488358489382
  (0, 5556)	0.23908902803852988
  (0, 2226)	0.2711661702168244
  (0, 1273)	0.26189990994192225
  (0, 1692)	0.2958071514722134
  (0, 3442)	0.19342240984703865
  (0, 7940)	0.23472906775355987
  (0, 4228)	0.2958071514722134
  (0, 1690)	0.3342549120557338
  (0, 1969)	0.2958071514722134
  (0, 3405)	0.1624828014633748
  (0, 1051)	0.3501488358489382
  (0, 7739)	0.19564182742080344
  (1, 5175)	0.27211951321382544
  (1, 4261)	0.4082988561907181
  (1, 4080)	0.5236458071582338
  (1, 7852)	0.4316010362639011
  (1, 5202)	0.5465881710238072
  (2, 3189)	0.11567418757722363
  (2, 2807)	0.3610126061987564
  (2, 7902)	0.1910856844514371
  (2, 2078)	0.1950385539618722
  (2, 7864)	0.14662823828170074
  (2, 2939)	0.4710862612423468
  :	:
  (5567, 165)	0.3404219056787515
  (5567, 5094)	0.3404219056787515
  (5568, 3684)	0.33

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75, stratify=Y)

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
my_models = {
    "Logistic Regression" : LogisticRegression(random_state=42), 
    "Naive Bayes" : MultinomialNB(),
    "SVM" : SVC(random_state=42)
}

parameters = {
    "Logistic Regression" : {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10],
        'class_weight': [None, 'balanced']
    }, 

    "Naive Bayes" : {
        'alpha': [0.1, 0.2, 0.5, 1.0, 2.0],
        'fit_prior': [True, False],
        'class_prior': [None]
    },

    "SVM" : {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2, 3, 4], 
        'gamma': ['scale', 'auto'],
        'coef0': [0, 0.1, 0.5],
        'probability': [True, False],
        'class_weight': [None, 'balanced']
    }
}

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for my_model_name, model in my_models.items():
    print(f"running for {my_model_name}")
    grid_search = GridSearchCV(estimator=model, param_grid=parameters[my_model_name], cv=3, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    if my_model_name == 'Logistic Regression':
        best_model_lg = grid_search.best_estimator_

    elif my_model_name == "Naive Bayes":
        best_model_dt = grid_search.best_estimator_
        
    elif my_model_name == "SVM":
        best_model_svm = grid_search.best_estimator_

running for Logistic Regression




running for Naive Bayes
running for SVM


In [16]:
model_lg = best_model_lg
model_lg.fit(X_train, Y_train)
y_lg = model_lg.predict(X_test)

print('For Logistic Regression:')
print('accuracy: ', accuracy_score(Y_test, y_lg))
print('ROC AUC score', roc_auc_score(Y_test, y_lg))
print('Classification report', classification_report(Y_test, y_lg))

For Logistic Regression:
accuracy:  0.9777458722182341
ROC AUC score 0.9216307056517767
Classification report               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1206
           1       0.99      0.84      0.91       187

    accuracy                           0.98      1393
   macro avg       0.98      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393





In [17]:
model_nb = best_model_dt
model_nb.fit(X_train, Y_train)
y_nb = model_nb.predict(X_test)

print('For Naive Bayes:')
print('accuracy: ', accuracy_score(Y_test, y_nb))
print('ROC AUC score', roc_auc_score(Y_test, y_nb))
print('Classification report', classification_report(Y_test, y_nb))

For Naive Bayes:
accuracy:  0.9877961234745154
ROC AUC score 0.968100673105063
Classification report               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1206
           1       0.97      0.94      0.95       187

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [18]:
model_svm = best_model_svm
model_svm.fit(X_train, Y_train)
y_svm = model_svm.predict(X_test)

print('For SVM:')
print('accuracy: ', accuracy_score(Y_test, y_svm))
print('ROC AUC score', roc_auc_score(Y_test, y_svm))
print('Classification report', classification_report(Y_test, y_svm))

For SVM:
accuracy:  0.9698492462311558
ROC AUC score 0.8899597378526264
Classification report               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1206
           1       0.99      0.78      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393

