In [159]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from sklearn.metrics import precision_recall_fscore_support

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ashwanth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [119]:
data = pd.read_csv('./data/email.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [120]:
data['Category'].value_counts()

Category
ham               4825
spam               747
{"mode":"full"       1
Name: count, dtype: int64

In [121]:
data = data[data['Category'].isin(['ham', 'spam'])]
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [122]:
duplicate_rows = data.duplicated()
num_duplicates = duplicate_rows.sum()
print(num_duplicates)

415


In [123]:
data=data.drop_duplicates()
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [125]:

def preprocessed_text(text):
    # Lowercase
    text = text.lower()
    
    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # 
    processed_text = ' '.join(tokens)
    
    return processed_text

# Data Preprocessing 

In [126]:
data=data.dropna()

data['Message'] = data['Message'].apply(preprocessed_text)
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})

In [127]:
data


Unnamed: 0,Category,Message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah dont think goe usf live around though
...,...,...
5567,1,2nd time tri 2 contact u u £750 pound prize 2 ...
5568,0,ü b go esplanad fr home
5569,0,piti mood soani suggest
5570,0,guy bitch act like id interest buy someth els ...


# Vectorization

In [128]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['Message'])
y = data['Category']

print("Unique labels in y_train:", y.unique())
print("Data type of y_train:", y.dtype)

Unique labels in y_train: [0 1]
Data type of y_train: int64


# Data split

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training

In [141]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Model Evaluation

In [151]:
y_pred = classifier.predict(X_test_dense)

p, r, f,_= precision_recall_fscore_support(y_test, y_pred,pos_label=1,average='binary')
print('precision:',p)
print('recall:',r)
print('f1_score:',f)

precision: 0.5276595744680851
recall: 0.9117647058823529
f1_score: 0.6684636118598383


# Testing the model with example

In [158]:
preprocessed_sentence = preprocess_text("you won lottery click on the link")
sentence_vector = vectorizer.transform([preprocessed_sentence])
predicted_class = classifier.predict(sentence_vector)
 
if predicted_class[0] == 1:
    print("The sentence is classified as spam.")
else:
    print("The sentence is not classified as spam.")


The sentence is classified as spam.
