In [40]:
# Importing common packages

import os, sys, random
import re, string, contractions
import nltk, sklearn
import csv

from collections import Counter
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

### Loading data

Reading from the file to a single list

In [41]:
# Read every line of the raw SMS corpus. The context manager closes the file
# handle as soon as the lines are loaded instead of leaving it open.
with open("../data/SMSSpamCollection") as data_file:
    sms_data_list = data_file.readlines()

Splitting the list into a list of [label, data]

In [42]:
# Every raw line is tab-separated: the first field is the label, the second is
# the message text (cut at its first newline). Each line is split only once.
sms_labelled_data = [
    [fields[0], fields[1].split("\n")[0]]
    for fields in (raw_line.split("\t") for raw_line in sms_data_list)
]
print(sms_labelled_data[0])

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']


Defining a function to get words from the sms

In [43]:
def get_words(sent):
    """Split a message into word tokens after removing URLs, contractions and digits.

    Args:
        sent: Raw message text.

    Returns:
        A list of tokens in order of appearance. Each token is a run of word
        characters; punctuation is dropped and the original casing is kept.
    """
    # Remove http:// and https:// URLs wherever they occur in the message.
    # The previous patterns were anchored with '^', so only a URL at the very
    # start of a line was removed (together with the rest of that line).
    sent = re.sub(r'https?://\S+', '', sent, flags=re.IGNORECASE)
    sent = contractions.fix(sent, slang=True) # Replace contractions with words
    sent = ''.join(ch for ch in sent if not ch.isdigit()) # Remove numbers
    tokenizer = RegexpTokenizer(r'\w+')
    # Keeping only word-character runs removes all punctuation marks
    # (contractions were already expanded above, so apostrophes are not an issue)
    return tokenizer.tokenize(sent)

Defining a function that, given an SMS as input, returns its list of cleaned tokens (lowercased and lemmatized, with stopwords and single-character words removed)

In [44]:
def get_tokenized_sms(sms):
    """Turn one SMS into a list of lowercase, lemmatized tokens.

    The message is first split into words by ``get_words``. Words of length 1
    and English stopwords are discarded; every remaining word is lowercased
    and lemmatized.

    Args:
        sms: Raw message text.

    Returns:
        A list of processed tokens, in their original order (duplicates kept).
    """
    # A set gives constant-time membership tests; a list would be scanned
    # from the start for every single word.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = []

    for word in get_words(sms):
        lowered = word.lower()
        # The length check uses the word as tokenized, before lowercasing
        if len(word) > 1 and lowered not in stopwords:
            tokens.append(lemmatizer.lemmatize(lowered).lower())

    return tokens

Using the defined functions to convert the labelled list of SMS messages into two parallel lists: the labels and the token lists

In [45]:
# Build two parallel lists from the [label, message] pairs: the labels as they
# are, and the token list produced for each message.
sms_labels = [pair[0] for pair in sms_labelled_data]
sms_words = [get_tokenized_sms(pair[1]) for pair in sms_labelled_data]

sms_labels[0], sms_words[0]

('ham',
 ['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'])

Defining a function to save the labels and words as a csv file

In [46]:
def save_csv(filename, x_data, y_data):
    """Write labelled token lists to a CSV file, one sample per row.

    Each row holds the label in its first column followed by that sample's
    tokens, so rows may have different lengths.

    Args:
        filename: Path of the CSV file to create or overwrite.
        x_data: Sequence of token lists, indexed in step with ``y_data``.
        y_data: Labels; exactly one row is written per label.
    """
    rows = [[label, *x_data[i]] for i, label in enumerate(y_data)]

    # newline='' lets the csv module control line endings itself. Without it,
    # text-mode newline translation on Windows turns every "\r\n" written by
    # the csv writer into "\r\r\n", which shows up as blank rows when read back.
    with open(filename, 'w', newline='') as f:
        csv.writer(f).writerows(rows)

#### Combining all the above preprocessing steps into one function to save a raw_data.csv file from the given text file

In [47]:
def get_csv_from_txt(file_path, out_path="../data/raw_data.csv"):
    """Preprocess a tab-separated SMS file and save the result as a CSV.

    Every non-blank line is split on tabs: the first field is the label and
    the second field (cut at its first newline) is the message, which is
    tokenized with ``get_tokenized_sms``. The labels and token lists are then
    written with ``save_csv``.

    Args:
        file_path: Path of the input text file.
        out_path: Path of the CSV file to write. Defaults to the location the
            notebook has always used.

    Returns:
        A ``(sms_labels, sms_words)`` tuple of parallel lists.

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    sms_labels = []
    sms_words = []

    # The context manager closes the input file even if tokenization fails
    with open(file_path) as data_file:
        for line in data_file:
            if line.rstrip("\n") == "":
                continue  # a blank line carries no label/message pair

            fields = line.split("\t")
            sms_labels.append(fields[0])
            sms_words.append(get_tokenized_sms(fields[1].split("\n")[0]))

    save_csv(out_path, sms_words, sms_labels)

    return sms_labels, sms_words

# Run the whole preprocessing pipeline on the raw corpus. This rebinds
# sms_labels / sms_words and, as a side effect, writes ../data/raw_data.csv.
sms_labels, sms_words = get_csv_from_txt("../data/SMSSpamCollection")

#### Loading the raw_data into lists

Defining a function to load data from csv to lists

In [48]:
def get_list_from_csv(file_path):
    """Load labels and token lists from a CSV whose first column is the label.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(sms_labels, sms_words)`` tuple of parallel lists: the first column
        of every row, and the remaining columns of every row (possibly empty).
    """
    with open(file_path, newline='') as f:
        # csv.reader yields an empty list for a blank line; such rows have no
        # label column, so they are skipped instead of raising IndexError.
        rows = [row for row in csv.reader(f) if row]

    sms_labels = [row[0] for row in rows]
    sms_words = [row[1:] for row in rows]

    return sms_labels, sms_words

#### Splitting the data into train/validation/test datasets

Defining a function to split the data into train/val/test sets and save each set as a csv file

In [49]:
def train_val_test_split(data_x, data_y, val_per, test_per, tr_path, val_path, te_path, random_seed):
    """Split the data into train/validation/test sets and save each one as a CSV.

    Args:
        data_x: Samples to split.
        data_y: Labels, parallel to ``data_x``.
        val_per: Validation share, expected as a fraction of the full dataset.
        test_per: Test share, expected as a fraction of the full dataset.
        tr_path: Output CSV path for the training set.
        val_path: Output CSV path for the validation set.
        te_path: Output CSV path for the test set.
        random_seed: Seed passed to both shuffled splits.
    """
    # First split: separate the test set from everything else
    x_rest, x_test, y_rest, y_test = train_test_split(
        data_x, data_y, test_size=test_per, shuffle=True, random_state=random_seed
    )

    # Second split: the validation share is rescaled because it is now taken
    # from the remaining (1 - test_per) portion of the data
    val_fraction = val_per/(1-test_per)
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest, y_rest, test_size=val_fraction, shuffle=True, random_state=random_seed
    )

    splits = (
        (tr_path, x_train, y_train),
        (val_path, x_val, y_val),
        (te_path, x_test, y_test),
    )
    for out_path, split_x, split_y in splits:
        save_csv(out_path, split_x, split_y)

Defining file paths for train/val/test csv files

In [50]:
# Output locations of the three dataset splits (train, validation, test)
train_path, val_path, test_path = (
    "../data/train_data.csv",
    "../data/val_data.csv",
    "../data/test_data.csv",
)

Splitting data using random seed 42

In [51]:
# 15% validation and 15% test (both as shares of the full dataset), seed 42
train_val_test_split(
    data_x=sms_words,
    data_y=sms_labels,
    val_per=0.15,
    test_per=0.15,
    tr_path=train_path,
    val_path=val_path,
    te_path=test_path,
    random_seed=42,
)

#### Tracking the split csv using dvc

Initialising dvc

In [52]:
# Initialise a DVC repository in the parent of the notebook's working directory
!cd .. && dvc init

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

Add all three csv files to dvc

In [53]:
# Register each split file with DVC.
# NOTE(review): the recorded output of this cell shows `dvc add` rejecting the
# files because Git already tracks them. As the error message advises, untrack
# them first (`git rm -r --cached <file>` followed by a commit), then re-run.
!dvc add ../data/train_data.csv
!dvc add ../data/val_data.csv
!dvc add ../data/test_data.csv

[2K[32m⠋[0m Checking graph                                                   [32m⠋[0m Checking graph
Adding...                                                                       
[31mERROR[39m:  output '../data/train_data.csv' is already tracked by SCM (e.g. Git).
    You can remove it from Git, then add to DVC.
        To stop tracking from Git:
            git rm -r --cached '../data/train_data.csv'
            git commit -m "stop tracking ../data/train_data.csv" 
[?25l                                                                          [32m⠋[0m Checking graph
Adding...                                                                       
[31mERROR[39m:  output '../data/val_data.csv' is already tracked by SCM (e.g. Git).
    You can remove it from Git, then add to DVC.
        To stop tracking from Git:
            git rm -r --cached '../data/val_data.csv'
            git commit -m "stop tracking ../data/val_data.csv" 
[?25l                                      

Adding a Google Drive folder as remote data storage

In [54]:
# !cd .. && dvc remote add --default myremote gdrive://1MypipdcBtjmYnO3OQQmLxKM3SWfwmE2p