In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read the data from the CSV file
data=pd.read_csv('SPAM.csv',encoding = "ISO-8859-1")
data.head(5)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Map 'spam' and 'ham' labels to binary values
data['spam']=data['type'].map({'spam':1,'ham':0}).astype(int)
data.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Tokenize the text
def tokenizer(text):
    return text.split()

data['text'] = data['text'].apply(tokenizer)

In [7]:
data['text'][0]

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet...',
 'Cine',
 'there',
 'got',
 'amore',
 'wat...']

In [8]:
# Perform stemming using SnowballStemmer
from nltk.stem.snowball import SnowballStemmer
port_it = SnowballStemmer("english", ignore_stopwords=False)

def stem_it(text):
    return [port_it.stem(word) for word in text]

data['text'] = data['text'].apply(stem_it)

In [9]:
data['text'][0]

['go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'avail',
 'onli',
 'in',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet...',
 'cine',
 'there',
 'got',
 'amor',
 'wat...']

In [10]:
# Perform lemmatization using WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_it(text):
    return [lemmatizer.lemmatize(word, pos="a") for word in text]

data['text'] = data['text'].apply(lemmatize_it)

In [11]:
data['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [13]:
# Remove stopwords using NLTK's stopwords corpus
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

def remove_stopwords(text):
    return [word for word in text if not word in stop_words]

data['text'] = data['text'].apply(remove_stopwords)

In [14]:
data['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [16]:
# Perform TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
y = data.spam.values
x = tfidf.fit_transform(' '.join(text) for text in data['text'])

In [17]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.2, shuffle=False)

In [18]:
# Train a logistic regression classifier
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression()

In [19]:
# Make predictions on the test set
y_pred = clf.predict(x_test)

In [20]:
# Calculate the accuracy of the classifier
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test) * 100
print("Accuracy:", accuracy)

Accuracy: 87.5
