In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from sklearn.metrics import precision_recall_fscore_support

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dersk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
data = pd.read_csv('./data/email.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [3]:
data['Category'].value_counts()

ham               4825
spam               747
{"mode":"full"       1
Name: Category, dtype: int64

In [4]:
data = data[data['Category'].isin(['ham', 'spam'])]
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
duplicate_rows = data.duplicated()
num_duplicates = duplicate_rows.sum()
print(num_duplicates)

415


In [6]:
data=data.drop_duplicates()
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:

def preprocessed_text(text):
    # Lowercase
    text = text.lower()
    
    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # 
    processed_text = ' '.join(tokens)
    
    return processed_text

# Data Preprocessing 

In [8]:
data=data.dropna()

data['Message'] = data['Message'].apply(preprocessed_text)
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})

In [9]:
print(data)

      Category                                            Message
0            0  go jurong point crazi avail bugi n great world...
1            0                              ok lar joke wif u oni
2            1  free entri 2 wkli comp win fa cup final tkt 21...
3            0                u dun say earli hor u c alreadi say
4            0          nah dont think goe usf live around though
...        ...                                                ...
5567         1  2nd time tri 2 contact u u £750 pound prize 2 ...
5568         0                            ü b go esplanad fr home
5569         0                            piti mood soani suggest
5570         0  guy bitch act like id interest buy someth els ...
5571         0                                     rofl true name

[5157 rows x 2 columns]


# Vectorization

In [10]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['Message'])
y = data['Category']

print("Unique labels in y_train:", y.unique())
print("Data type of y_train:", y.dtype)

Unique labels in y_train: [0 1]
Data type of y_train: int64


# Cross Validation

In [17]:
# Cross validation:
def cross_validate(X, y, num_iter=10):
    results = {'precision': [], 'recall': [], 'f1': []}
    for i in range(num_iter):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        y_pred = MultinomialNB_model(X_train, y_train, X_test)
        p, r, f,_= precision_recall_fscore_support(y_test, y_pred, pos_label=1, average='binary')
        results['precision'].append(p)
        results['recall'].append(r)
        results['f1'].append(f)
    return results
            

# Model

In [14]:
def MultinomialNB_model(X_train, y_train, X_test):
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    return y_pred

# Data split

# Training

# Model Evaluation

In [26]:
results = cross_validate(X, y)
print('precision scores:', results["precision"])
print('recall scores:', results["recall"])
print('f1 scores:', results["f1"])
print()
print('precision average score:', sum(results["precision"]) / len(results["precision"]))
print('recall average score:', sum(results["recall"]) / len(results["recall"]))
print('f1 average score:', sum(results["f1"]) / len(results["f1"]))

precision scores: [0.9057971014492754, 0.8794326241134752, 0.8518518518518519, 0.8175675675675675, 0.8776978417266187, 0.8702290076335878, 0.8666666666666667, 0.8648648648648649, 0.8571428571428571, 0.8296296296296296]
recall scores: [0.9259259259259259, 0.9393939393939394, 0.905511811023622, 0.8962962962962963, 0.9104477611940298, 0.95, 0.9512195121951219, 0.920863309352518, 0.9767441860465116, 0.9655172413793104]
f1 scores: [0.9157509157509157, 0.9084249084249084, 0.8778625954198473, 0.8551236749116607, 0.8937728937728938, 0.9083665338645418, 0.9069767441860465, 0.89198606271777, 0.9130434782608695, 0.8924302788844621]

precision average score: 0.8620880012646396
recall average score: 0.9341919982807274
f1 average score: 0.8963738086193915


# Testing the model with example

In [27]:
preprocessed_sentence = preprocessed_text("you won lottery click on the link")
sentence_vector = vectorizer.transform([preprocessed_sentence])
predicted_class = classifier.predict(sentence_vector)
 
if predicted_class[0] == 1:
    print("The sentence is classified as spam.")
else:
    print("The sentence is not classified as spam.")


NameError: name 'classifier' is not defined