# Notebook Setup

In [103]:
# Data
import pandas as pd
import numpy as np

# ML/DL
import tensorflow as tf
import tensorflow.keras as k

# Technical
import os
import time
import typing
from IPython.display import clear_output

# Data Import

In [104]:
# Read the raw SMS dataset, decoding the file as cp1252.
df = pd.read_csv(
    filepath_or_buffer=r"data/spam.csv",
    encoding='cp1252',
)

# Show the (rows, columns) shape, then preview the first rows as the cell output.
print(df.shape)
df.head()

(5572, 5)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


With only 5,572 messages, this is a small dataset for a text task. It also looks like commas inside some SMSs were misinterpreted as CSV separators, spilling part of the text into the extra `Unnamed` columns. Let's fix that.

In [105]:
# Join unintentionally separated columns.
# The overflow columns are NaN wherever a message was not split, so blank them
# first; the pieces can then be glued back onto 'v2' row by row.
overflow_cols = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df[overflow_cols] = df[overflow_cols].fillna("")

# Element-wise string concatenation. (pd.concat would stack the four columns
# end-to-end instead; assigning that result back aligns on the index, which
# keeps only the original 'v2' values and drops the overflow text.)
# The pieces are joined as-is: no separator is re-inserted between them.
df['v2'] = df['v2'] + df['Unnamed: 2'] + df['Unnamed: 3'] + df['Unnamed: 4']

# Rename columns
df = df[['v1', 'v2']].set_axis(['label', 'text'], axis=1)

# Make labels 0/1
df['label'] = df['label'].map({'ham':0,'spam':1})

In [106]:
# Labels were mapped to 0 (ham) / 1 (spam), so the mean is the share of spam messages.
df['label'].mean()

0.13406317300789664

13% of our data is spam.

# Preprocessing

Since we are dealing with textual data, we will use spaCy to preprocess it.

In [117]:
# Preprocessing switches (independent of each other):
#   lemma - replace every token with its spaCy lemma
#   stop  - drop tokens found in spaCy's English stop-word list
lemma = False
stop = False

import spacy
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
import re
nlp = en_core_web_sm.load()

# Always start from a pristine snapshot of the labelled frame. If `df2` does not
# exist yet (fresh kernel), create it from `df`; on later runs it is reused, so
# re-running the cell with different switches never stacks cleaning passes.
if 'df2' not in globals():
    df2 = df.copy()
df = df2.copy()

# Basic preprocessing
# Keep only alphanumeric characters, spaces and apostrophes.
df['text'] = df['text'].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch in " '"))
# Collapse runs of spaces, lower-case, and strip leading/trailing spaces.
# `.str.replace(..., regex=True)` is required here: plain `Series.replace(' +', ' ')`
# only matches cells whose whole value is the literal ' +' and so changes nothing.
df['text'] = df['text'].str.replace(r' +', ' ', regex=True).str.lower().str.strip()


def _process_tokens(text):
    """Tokenise `text` with spaCy and apply the `lemma` / `stop` switches.

    With `stop` set, a token is dropped when either its surface form or its
    lemma is in STOP_WORDS. With `lemma` set, kept tokens are emitted as their
    lemma, otherwise as their original text. Tokens are re-joined with spaces.
    """
    kept = []
    for token in nlp(text):
        if stop and (token.text in STOP_WORDS or token.lemma_ in STOP_WORDS):
            continue
        kept.append(token.lemma_ if lemma else token.text)
    return ' '.join(kept)


# Skip the spaCy pass entirely when neither switch is on.
if lemma or stop:
    df['text'] = df['text'].apply(_process_tokens)

# Persist the cleaned frame (the index is written as the first, unnamed column).
df.to_csv(r"data/data_clean.csv")