## Import libraries

In [1]:
import pandas as pd # to handle dataframes/tables

In [2]:
# Read the spam dataset from 'spam.csv' into a dataframe.
# The file is decoded as 'latin-1' to handle the special characters it contains.
df_raw = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')

# Preview the first five rows of the loaded data.
df_raw.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Give the two useful columns descriptive names and drop everything else.
# Renaming *before* selecting keeps this cell safe to re-run: on a second run
# 'v1'/'v2' are already gone, rename() ignores missing keys by default, and the
# selection by the new names still succeeds (selecting ['v1', 'v2'] again
# would raise KeyError).
df_raw = df_raw.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']]

df_raw.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Work on an independent (deep) copy so the cleaning steps add columns to
# df_clean without modifying df_raw.
df_clean = df_raw.copy(deep=True)

In [5]:
# function to clean and tokenize the text data
def clean_text(text):
    """Lowercase a message, strip punctuation and split it into word tokens.

    Parameters
    ----------
    text : str, None or float NaN
        Raw message text. A missing value (``None`` or a float ``NaN``)
        is accepted and treated as an empty message.

    Returns
    -------
    list of str
        Lowercased tokens obtained by splitting on whitespace. Characters
        that are neither alphanumeric nor whitespace are removed without
        inserting a space, so "don't" becomes "dont".
    """
    # A missing cell has no .lower(); return no tokens instead of raising
    # AttributeError. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # convert text to lowercase
    text = text.lower()
    # remove punctuation and special characters
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    # tokenize the text into words (split() also collapses repeated whitespace)
    return text.split()


In [6]:
# Tokenize every message element-wise and store the result in a new
# 'tokens' column next to the original text.
df_clean['tokens'] = df_clean['message'].map(clean_text)

# Preview the first five rows, now including the token lists.
df_clean.head(n=5)

Unnamed: 0,label,message,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."


In [7]:
# Report the cleaned dataframe's size as a (row count, column count) tuple.
(len(df_clean.index), len(df_clean.columns))

(5572, 3)