### Goal: Classify messages as spam or ham (not spam)

In [47]:
# import statements
import pandas as pd
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('punkt_tab')

import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


#### 1. Load data

In [35]:
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
print(df["label"].value_counts())
print("\n Null values: ", df.isnull().sum())

label
ham     4825
spam     747
Name: count, dtype: int64

 Null values:  label      0
message    0
dtype: int64


#### 2. Preprocessing


Tasks in Preprocessing:
* Lowercasing
* Removing punctuation
* Removing stopwords
* Tokenization
* Stemming (or Lemmatization)

In [37]:
ps = PorterStemmer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):

    # Lowercase
    text = text.lower()

    # # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and stem
    cleaned = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned)
    
df['cleaned_message'] = df['message'].apply(preprocess_text)
df.head()

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


#### 3. Text Vectorization

1. Encode Labels (ham → 0, spam → 1)
2. Split the data
3. Vectorization (BoW Vectorizer)

In [50]:
le = LabelEncoder()
df["label_num"] = le.fit_transform(df["label"])

x = df["cleaned_message"]
y = df["label_num"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)



#### Try with BoW

In [51]:
cv = CountVectorizer()
x_train_vec = cv.fit_transform(x_train)
x_test_vec = cv.transform(x_test)

In [52]:
model = MultinomialNB()
model.fit(x_train_vec, y_train)

In [53]:
y_pred = model.predict(x_test_vec)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification report:\n ", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.9838565022421525
Classification report:
                precision    recall  f1-score   support

           0       0.99      0.99      0.99       980
           1       0.95      0.92      0.93       135

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion matrix:
 [[973   7]
 [ 11 124]]


#### Try with TF_IDF

In [48]:
tf = TfidfVectorizer()
x_train_tf = tf.fit_transform(x_train)
x_test_tf = tf.transform(x_test)

model = MultinomialNB()
model.fit(x_train_tf, y_train)

In [49]:
y_pred = model.predict(x_test_tf)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification report:\n ", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.9542600896860987
Classification report:
                precision    recall  f1-score   support

           0       0.95      1.00      0.97       943
           1       1.00      0.70      0.83       172

    accuracy                           0.95      1115
   macro avg       0.97      0.85      0.90      1115
weighted avg       0.96      0.95      0.95      1115

Confusion matrix:
 [[943   0]
 [ 51 121]]


#### Try with Logistic Regression

In [56]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)

# with BoW vectors
model.fit(x_train_vec, y_train)
y_pred = model.predict(x_test_vec)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification report:\n ", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.9802690582959641
Classification report:
                precision    recall  f1-score   support

           0       0.98      1.00      0.99       980
           1       1.00      0.84      0.91       135

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion matrix:
 [[980   0]
 [ 22 113]]


In [59]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train_tf, y_train)
y_pred = model.predict(x_test_tf)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification report:\n ", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.8789237668161435
Classification report:
                precision    recall  f1-score   support

           0       0.88      1.00      0.94       980
           1       0.00      0.00      0.00       135

    accuracy                           0.88      1115
   macro avg       0.44      0.50      0.47      1115
weighted avg       0.77      0.88      0.82      1115

Confusion matrix:
 [[980   0]
 [135   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
