In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import numpy as np

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# https://www.kaggle.com/datasets/team-ai/spam-text-message-classification

In [3]:
df = pd.read_csv('spam-ham.csv')

In [4]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
def clear_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^ a-z]', '', text)
    while text.find('  ') != -1:
        text = text.replace('  ', ' ')
    return text

df['Message']= df['Message'].apply(lambda x:clear_text(x))
df.head()

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [6]:
df['Message'] = df['Message'].apply(lambda x: word_tokenize(x))
df.head()

Unnamed: 0,Category,Message
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,ham,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [7]:
stop_words = stopwords.words('english')

def remove_stop_words(text):
    type(text)
    filtered_text = []
    for w in text:
        if w not in stop_words:
            filtered_text.append(w)
    return filtered_text

df['Message'] = df['Message'].apply(lambda x:remove_stop_words(x))
df.head()

Unnamed: 0,Category,Message
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, dont, think, goes, usf, lives, around, t..."


In [8]:
def to_string(l):
  text = ''
  for i in l:
    text += i + ' '
  return text
df['Message'] = df['Message'].apply(lambda x:to_string(x))

In [9]:
df['Category']=df['Category'].replace('spam',1)
df['Category']=df['Category'].replace('ham',0)

In [10]:
# lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

# def goodstemmer(text): 
#     return [stemmer.stem(t) for t in text]

# df['Message'] = df['Message'].apply(lambda x:goodstemmer(x))
# df.head()

In [11]:
df.head()

Unnamed: 0,Category,Message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though


In [12]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['Message'])
y = df['Category']

X_train, X_test, Y_train, Y_test = train_test_split(x, y, train_size=0.8,random_state=0)

In [13]:
model = GaussianNB()
model.fit(X_train.todense(), Y_train)
y_pred= model.predict(X_test.todense())
print(classification_report(Y_test, y_pred))




              precision    recall  f1-score   support

           0       0.98      0.89      0.93       955
           1       0.58      0.89      0.71       160

    accuracy                           0.89      1115
   macro avg       0.78      0.89      0.82      1115
weighted avg       0.92      0.89      0.90      1115



