# SPAM DETECTION PROJECT

In [1]:
# import the necessary modules
import nltk
import numpy as np
import pandas as pd

# read dataset
df = pd.read_csv("spam.csv", encoding = "latin1") # encoding was added to enable readablility after encountering multiple error instances

# Explore The Dataset

`** First, display the first 5 values, find missing values, find more information, the shape of the dataset  and 5 number summary if it applies to the dataset **`

In [2]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [5]:
df.shape

(5572, 5)

In [6]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

`** Comparing the two codes above, we can see that the unnamed columns have >95% of missing values, hence can be dropped **`

In [7]:
# dropping the 3 unnamed columns from the dataset (optional to get the column names first)
data = df.drop(columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [8]:
# check out the new dataset
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# find the unique values in the v1 column
data["v1"].unique()

array(['ham', 'spam'], dtype=object)

`** The V1 olumn has only ham and spam as the unique values **`

In [10]:
# find the total number of duplicates in the dataset
duplicates = data.duplicated().sum()

In [11]:
data = data.drop_duplicates (keep = "first")
data.shape

(5169, 2)

`** We can see that the number of rows in the dataset has reduced from 5572 to 5169 after the duplicates were dropped ** `

# Data PreProcessing

In [12]:
# first rename columns to give it a suitable name
data.rename(columns={"v1": "type", "v2": "text"}, inplace=True)
data.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
import re

from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Function to clean a tweet by removing URLs, mentions, hashtags, punctuation,
    converting text to lowercase, and removing stopwords.
    """
    if isinstance (text, str):# check if the text is a string
        text = re.sub(r"http\S+|www\S+|http\S+", "", text, flags=re.MULTILINE) #remove urls
        text = re.sub(r"\@\w+|\#", "", text) # remove any hashtags or mentions that could be present 
        text = re.sub(r"[^\w\s]", "", text) # remove anu punctuations
        text = text.lower() # convert characters to lowercases
        text = " " .join([word for word in text.split() if word not in stop_words]) # to remove stopwords
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nuell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# applying the clean text function to the dataset
data["clean_text"] = data["text"].apply(clean_text)

# confirm if the new change was applied to the dataset
data.head(30)

Unnamed: 0,type,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darling 3 weeks word back id like ...
6,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent
7,ham,As per your request 'Melle Melle (Oru Minnamin...,per request melle melle oru minnaminunginte nu...
8,spam,WINNER!! As a valued network customer you have...,winner valued network customer selected receiv...
9,spam,Had your mobile 11 months or more? U R entitle...,mobile 11 months u r entitled update latest co...


In [22]:
# check if there are missing values still in the dataset, esp the cleaned one
data.isnull().sum()

type          0
text          0
clean_text    0
dtype: int64

`** Since there are no null values, no filling of null values will be done **`