In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
df = pd.read_csv("spam.csv", encoding="latin1")
df = df.rename(columns={'v1': 'type', 'v2': 'text'})
df = df[["type", "text"]]
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
print(df.isnull().sum())
df = df.drop_duplicates()

type    0
text    0
dtype: int64


In [22]:
df['type'].value_counts()

type
ham     4516
spam     653
Name: count, dtype: int64

In [23]:
df['type'] = df['type'].map({"ham": 0, "spam": 1})
df.head()

Unnamed: 0,type,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
# string lowercasing
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'].head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: text, dtype: object

In [25]:
import string
# remove punctuation
def remove_punctuation(text):
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

df['text'] = df['text'].apply(remove_punctuation)

In [26]:
def remove_numbers(text):
    new = ""
    for char in text:
        if not char.isdigit():
            new += char
    return new
    
df['text'] = df['text'].apply(remove_numbers)

In [27]:
import re

def clean_specials(text):
    text = re.sub(r"http\S+|www\S+", " ", text)   # remove URLs
    text = re.sub(r"\S+@\S+", " ", text)          # remove emails
    return text

df['text'] = df['text'].apply(clean_specials)
print(df['text'].head(10))

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in  a wkly comp to win fa cup final...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
5    freemsg hey there darling its been  weeks now ...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile  months or more u r entitled t...
Name: text, dtype: object


In [28]:
# remove extra whitespace
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
print(df['text'].head(10))

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in a wkly comp to win fa cup final ...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
5    freemsg hey there darling its been weeks now a...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile months or more u r entitled to...
Name: text, dtype: object


In [29]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_wrds(text):
    words = text.split()
    new = ""
    for i in words:
        if i not in stop_words:
            new += i + " "
    return new

df['text'] = df['text'].apply(remove_wrds)

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['type'], test_size=0.2, random_state=42)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

bow_vectorizer = CountVectorizer()
x_train_bow = bow_vectorizer.fit_transform(X_train).toarray()
x_test_bow = bow_vectorizer.transform(X_test).toarray()

tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
x_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Logistic Regression
lr = LogisticRegression()
lr.fit(x_train_bow, y_train)
lr_pred = lr.predict(x_test_bow)
print("\nLogistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
scores_lr = cross_val_score(lr, x_train_bow, y_train, cv=5, scoring='accuracy')
print("Logistic Regression CV Scores:", scores_lr)
print("Mean CV Score:", scores_lr.mean())

# SVM
svm = LinearSVC()
svm.fit(x_train_bow, y_train)
svm_pred = svm.predict(x_test_bow)
print("\nSVM Accuracy:", accuracy_score(y_test, svm_pred))
scores_svm = cross_val_score(LinearSVC(), x_train_bow, y_train, cv=5, scoring='accuracy')
print("SVM CV Scores:", scores_svm)
print("Mean CV Score:", scores_svm.mean())

# decision tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train_bow, y_train)
dt_pred = dt.predict(x_test_bow)
print("\nDecision Tree Accuracy:", accuracy_score(y_test, dt_pred))
scores_dt = cross_val_score(dt, x_train_bow, y_train, cv=5, scoring='accuracy')
print("Decision Tree CV Scores:", scores_dt)
print("Mean CV Score:", scores_dt.mean())



Logistic Regression Accuracy: 0.9787234042553191
Logistic Regression CV Scores: [0.97218863 0.97339782 0.96977025 0.98065296 0.97460701]
Mean CV Score: 0.9741233373639661

SVM Accuracy: 0.9787234042553191
SVM CV Scores: [0.96977025 0.9758162  0.97702539 0.98065296 0.97339782]
Mean CV Score: 0.9753325272067714

Decision Tree Accuracy: 0.9477756286266924
Decision Tree CV Scores: [0.96614268 0.96130593 0.96493349 0.95646917 0.95405079]
Mean CV Score: 0.9605804111245465


In [34]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
     
logistic_model = LogisticRegression()

logistic_model.fit(X_train_tfidf,y_train)
log_pred = logistic_model.predict(X_test_tfidf)

print(accuracy_score(y_test,log_pred ))

0.9545454545454546
