In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
with open('spam.csv', 'r', encoding='ISO-8859-1') as file:
    df1 = pd.read_csv(file)
    
df1.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df2 = df1[["v1","v2"]]

In [4]:
df2.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
df2.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
df2["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

### As the proportion is uneven, so we need to stratify the distribution of spam in both training and testing set equally

# Text Preprocessing For NLP

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")

2023-12-02 08:38:02.789270: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# Preprocess the text

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

In [13]:
df2["v2"] = df2["v2"].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["v2"] = df2["v2"].apply(preprocess_text)


In [14]:
df2.head()

Unnamed: 0,v1,v2
0,ham,jurong point crazy available bugis n great wor...
1,ham,ok lar joke wif u oni
2,spam,free entry wkly comp win FA Cup final tkts tex...
3,ham,u dun early hor u c
4,ham,nah think go usf live


In [16]:
# Splitting the dataset

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df2["v2"], df2["v1"], test_size = 0.2, random_state=1111, stratify=df2["v1"]) 

In [23]:
y_train.head()

1933     ham
5255     ham
5535     ham
2188    spam
2497     ham
Name: v1, dtype: object

In [17]:
X_train.head()

1933                                          r u scratch
5255                                       ok Sweet dream
5535    know thinkin malaria relax child not handle ma...
2188    free camera phone linerental month cross ntwk ...
2497                                   Dai da send resume
Name: v2, dtype: object

## TF-IDF Vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Defining a vectorizer function
def tfidf_vectorize(train_data, test_data = None):
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    
    if test_data is not None:
        X_test_tfidf = tfidf_vectorizer.transform(test_data)
        return X_train_tfidf, X_test_tfidf, tfidf_vectorizer
    else: 
        return X_train_tfidf, tfidf_vectorizer

In [19]:
X_train_tfidf, X_test_tfidf, tfidf_vectorizer = tfidf_vectorize(X_train, X_test)

## Training the model

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))

Accuracy: 0.97

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
[[966   0]
 [ 34 115]]


## So the accuracy was 97%.