In [118]:
# Importing common packages

import os, sys, random
import re, string, contractions
import nltk, sklearn
import csv

from collections import Counter
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

### Loading data

Reading from the file to a single list

In [119]:
# Read every line of the raw corpus into a list (one SMS record per line).
# The `with` block closes the file handle as soon as the read finishes,
# instead of leaving it open for the lifetime of the kernel.
with open("../data/SMSSpamCollection") as data_file:
    sms_data_list = data_file.readlines()

Splitting the list into a list of [label, data]

In [120]:
# Each raw line is split on tabs once; field 0 is the label and field 1 is the
# message, from which everything from the first newline onward is dropped.
sms_labelled_data = [
    [fields[0], fields[1].split("\n")[0]]
    for fields in (raw_line.split("\t") for raw_line in sms_data_list)
]
print(sms_labelled_data[0])

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']


Defining a function to get words from the sms

In [121]:
def get_words(sent):
    """Turn a raw SMS string into a list of word tokens.

    Steps, in order: strip URLs, expand contractions/slang, drop digit
    characters, then keep only runs of word characters (which discards
    punctuation). Case is not normalised here.

    Args:
        sent: The raw message text.

    Returns:
        A list of word-token strings.
    """
    # Remove http:// and https:// URLs wherever they occur in the message.
    # The pattern is deliberately not anchored with ^ and stops at the first
    # whitespace, so text surrounding a URL is kept and URLs in the middle of
    # a message are removed too.
    sent = re.sub(r'https?://\S+', '', sent)
    sent = contractions.fix(sent, slang=True) # Replace contractions with words
    sent = ''.join(ch for ch in sent if not ch.isdigit()) # Remove numbers
    # Keeping only \w+ runs removes all punctuation marks (contractions were
    # already expanded above, so apostrophes are not a concern).
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sent)

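Defining a function that, given an SMS as input, returns its list of cleaned tokens (lowercased, lemmatized, with stopwords and single-character words removed). Repeated words are kept, so the number of occurrences of each word can be counted from this list later.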

In [122]:
def get_tokenized_sms(sms):
    """Return the cleaned, lemmatized tokens of a single SMS.

    The message is first tokenized with `get_words`. Tokens that are a single
    character long, or whose lowercase form is an English stopword, are
    dropped; the remaining tokens are lowercased and lemmatized.

    Args:
        sms: The raw message text.

    Returns:
        A list of lowercase token strings, in message order (duplicates kept).
    """
    words_list = get_words(sms)

    # A set gives constant-time membership tests; a list would be scanned
    # linearly for every word of every message.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = []

    for word in words_list:
        lowered = word.lower()  # lowercase once and reuse
        if len(word) > 1 and lowered not in stopwords:
            # The trailing .lower() is kept so the output is lowercase
            # regardless of what the lemmatizer returns.
            tokens.append(lemmatizer.lemmatize(lowered).lower())

    return tokens

Using the functions defined above to convert the labelled list of SMS messages into two parallel lists: the labels and the corresponding lists of cleaned tokens

In [123]:
# Build two parallel lists from the [label, text] pairs: the labels as-is,
# and the cleaned token list for each message text.
sms_labels = [label for label, _ in sms_labelled_data]
sms_words = [get_tokenized_sms(text) for _, text in sms_labelled_data]

sms_labels[0], sms_words[0]

('ham',
 ['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'])

Splitting the data into train/validation/test datasets

In [124]:
test_per = 0.15
val_per = 0.15
SEED = 42  # fixed seed so that Restart & Run All reproduces the same split

# First carve the test set off the full data.
x_train, x_test, y_train, y_test = train_test_split(
    sms_words, sms_labels, test_size=test_per, shuffle=True, random_state=SEED
)

# Then carve the validation set off the remainder. val_per is a fraction of
# the full data, so it is rescaled relative to the (1 - test_per) that is left.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=val_per/(1-test_per), shuffle=True, random_state=SEED
)

print(len(x_train), len(x_val), len(x_test))

3901 836 837


Saving each dataset as a csv file with the first word on each line being the label for that datapoint

In [125]:
def save_csv(filename, x_data, y_data):
    """Write a labelled dataset to a CSV file, one datapoint per row.

    Each row starts with the label, followed by that datapoint's tokens.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        x_data: Sequence of token sequences, indexed in parallel with y_data.
        y_data: Sequence of labels; one row is written per label.

    Raises:
        IndexError: If x_data has fewer entries than y_data.
    """
    # Build all rows before opening the file, so a length mismatch fails
    # without touching the output file.
    combined_data = [[label, *x_data[i]] for i, label in enumerate(y_data)]

    # newline='' is required by the csv module: without it, platforms that
    # translate '\n' on write (Windows) emit '\r\r\n' and produce blank rows.
    with open(filename, 'w', newline='') as f:
        write = csv.writer(f)
        write.writerows(combined_data)

In [126]:
# Persist the three splits; each entry is (output path, token lists, labels).
dataset_outputs = [
    ("../data/train_data.csv", x_train, y_train),
    ("../data/val_data.csv", x_val, y_val),
    ("../data/test_data.csv", x_test, y_test),
]
for csv_path, token_lists, labels in dataset_outputs:
    save_csv(csv_path, token_lists, labels)