In [9]:
# Importing common packages

import os, sys, random
import re, string, contractions
import nltk, sklearn
import csv

from collections import Counter
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

### Loading data

Reading from the file to a single list

In [2]:
# Read every line of the raw SMS file into a list. The context manager closes
# the file handle as soon as the lines are in memory (the bare open() never did).
with open("../data/SMSSpamCollection") as data_file:
    sms_data_list = data_file.readlines()

Splitting the list into a list of [label, data]

In [3]:
# Turn each raw line into [label, message]: the label is the text before the
# first tab, the message is the second tab-separated field without its newline.
sms_labelled_data = []
for raw_line in sms_data_list:
    fields = raw_line.split("\t")
    sms_labelled_data.append([fields[0], fields[1].split("\n")[0]])
print(sms_labelled_data[0])

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']


Defining a function to get words from the sms

In [4]:
def get_words(sent):
    """Split one sms into word tokens after removing URLs, contractions and digits.

    Parameters
    ----------
    sent : str
        Raw text of a single message.

    Returns
    -------
    list of str
        Runs of word characters, in their original order and case.
    """
    # A single pattern handles both http:// and https:// and matches anywhere
    # in the text. The previous pair of patterns was anchored with '^', so a URL
    # was only removed when it started a line (and then the rest of that line
    # went with it); the second pattern's 'http?' also made the 'p' optional
    # instead of adding any coverage.
    sent = re.sub(r'https?://\S+', '', sent)
    sent = contractions.fix(sent, slang=True)  # Replace contractions (and slang) with words
    sent = ''.join(ch for ch in sent if not ch.isdigit())  # Remove digit characters
    # Tokenising on \w+ drops every punctuation mark; contractions were already expanded above.
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sent)

Defining a function that returns the list of cleaned tokens (lowercased, lemmatized, with stopwords and single-character words removed) for a given sms

In [5]:
def get_tokenized_sms(sms):
    """Return the cleaned tokens of one sms.

    The message is split with ``get_words``; tokens of length 1 and English
    stopwords are dropped, and every remaining token is lowercased and
    lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    sms : str
        Raw text of a single message.

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens in their original order.
    """
    words_list = get_words(sms)

    # A set gives constant-time membership tests; the list used previously was
    # scanned from the start for every word.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for word in words_list:
        lowered = word.lower()  # lowercase once and reuse it below
        if len(word) > 1 and lowered not in stopwords:
            tokens.append(lemmatizer.lemmatize(lowered))

    return tokens

Using the defined functions to convert the labelled list of sms into a labelled list of vocabularies

In [6]:
# Build two index-aligned lists: the label of every message and its cleaned tokens.
sms_labels = [label for label, _ in sms_labelled_data]
sms_words = [get_tokenized_sms(text) for _, text in sms_labelled_data]

sms_labels[0], sms_words[0]

('ham',
 ['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'])

Defining a function to save the labels and words as a csv file

In [7]:
def save_csv(filename, x_data, y_data):
    """Write one csv row per sample: the label first, then that sample's words.

    Parameters
    ----------
    filename : str
        Path of the csv file to create (an existing file is overwritten).
    x_data : sequence of iterables of str
        Words of each sample; ``x_data[i]`` belongs to ``y_data[i]``.
    y_data : sequence of str
        Label of each sample. Its length decides how many rows are written.

    Raises
    ------
    IndexError
        If ``x_data`` has fewer entries than ``y_data``.
    """
    # Rows are assembled before the file is opened, so a length mismatch
    # cannot leave a half-written file behind.
    rows = [[label, *x_data[i]] for i, label in enumerate(y_data)]

    # newline='' lets the csv writer control line endings itself; without it
    # every row ends in '\r\r\n' on Windows and reads back with blank rows.
    with open(filename, 'w', newline='') as f:
        csv.writer(f).writerows(rows)

#### Combining all the above preprocessing steps into one function to save a raw_data.csv file from the given text file

In [8]:
def get_csv_from_txt(file_path, out_path="../data/raw_data.csv"):
    """Preprocess a tab-separated sms file and save the result as a csv.

    Every line of the input is split on tabs: the first field is the label and
    the second field (without its trailing newline) is the message, which is
    cleaned with ``get_tokenized_sms``. The labels and token lists are then
    written with ``save_csv``.

    Parameters
    ----------
    file_path : str
        Path of the raw text file, one ``label<TAB>message`` per line.
    out_path : str, optional
        Where the csv is written. Defaults to the location that was previously
        hard-coded, so existing calls behave as before.

    Returns
    -------
    tuple of (list of str, list of list of str)
        The labels and the token list of every message, index-aligned.

    Raises
    ------
    IndexError
        If a line contains no tab character.
    """
    # The context manager closes the input file once its lines are read.
    with open(file_path) as data_file:
        sms_data_list = data_file.readlines()

    sms_labels = []
    sms_words = []

    for line in sms_data_list:
        fields = line.split("\t")
        sms_labels.append(fields[0])
        sms_words.append(get_tokenized_sms(fields[1].split("\n")[0]))

    save_csv(out_path, sms_words, sms_labels)

    return sms_labels, sms_words

sms_labels, sms_words = get_csv_from_txt("../data/SMSSpamCollection")

#### Loading the raw_data into lists

Defining a function to load data from csv to lists

In [7]:
def get_list_from_csv(file_path):
    """Load a csv whose rows are ``label, word, word, ...`` into two parallel lists.

    Parameters
    ----------
    file_path : str
        Path of the csv file to read.

    Returns
    -------
    tuple of (list of str, list of list of str)
        The first column of every row, and the remaining columns of every row.
    """
    with open(file_path, newline='') as f:
        rows = [row for row in csv.reader(f)]

    # Column 0 is the label; everything after it is the word list of that row.
    labels = [row[0] for row in rows]
    words = [row[1:] for row in rows]

    return labels, words

#### Splitting the data into train/validation/test datasets

Defining a function to split data into train/val/test sets and saving as csv

In [10]:
def train_val_test_split(data_x, data_y, val_per, test_per, tr_path, val_path, te_path, random_seed):
    """Shuffle-split the data into train/validation/test parts and save each as a csv.

    Parameters
    ----------
    data_x, data_y : sequences
        Samples and their labels, index-aligned.
    val_per, test_per : float
        Validation and test sizes; ``val_per`` is rescaled by ``1 - test_per``
        before the second split because that split only sees the non-test data.
    tr_path, val_path, te_path : str
        Output csv paths for the train, validation and test parts.
    random_seed : int
        Passed as ``random_state`` to both splits.
    """
    # Step 1: set the test part aside.
    x_rest, x_test, y_rest, y_test = train_test_split(
        data_x, data_y, test_size=test_per, shuffle=True, random_state=random_seed
    )

    # Step 2: divide what is left into train and validation parts.
    val_fraction = val_per/(1-test_per)
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest, y_rest, test_size=val_fraction, shuffle=True, random_state=random_seed
    )

    # Step 3: write the three parts in the same order as before (train, val, test).
    parts = (
        (tr_path, x_train, y_train),
        (val_path, x_val, y_val),
        (te_path, x_test, y_test),
    )
    for out_path, part_x, part_y in parts:
        save_csv(out_path, part_x, part_y)

Defining file paths for train/val/test csv files

In [4]:
# Locations of the three split files (relative to the notebook's directory).
train_path, val_path, test_path = (
    "../data/train_data.csv",
    "../data/val_data.csv",
    "../data/test_data.csv",
)

Splitting data using random seed 42

In [12]:
# First split: validation and test sizes of 0.15 each, random seed 42.
train_val_test_split(
    data_x=sms_words,
    data_y=sms_labels,
    val_per=0.15,
    test_per=0.15,
    tr_path=train_path,
    val_path=val_path,
    te_path=test_path,
    random_seed=42,
)

#### Tracking the split csv using dvc

Initialising dvc

In [13]:
# Initialise a DVC repository in the parent of the notebook's working directory.
!cd .. && dvc init

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

Add all three csv files to dvc

In [14]:
# Put each split file under DVC tracking; per the recorded output, every call
# produces a .dvc file next to the csv that is meant to be committed to git.
!dvc add ../data/train_data.csv
!dvc add ../data/val_data.csv
!dvc add ../data/test_data.csv

[?25l                                                                          [32m⠋[0m Checking graph
Adding...                                                                       
![A
  0% Checking cache in '/mnt/vol_d/VSCode_Workspace/cmi_applied_ml/.dvc/cache'| [A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  7.90file/s][A

To track the changes with git, run:

	git add ../data/train_data.csv.dvc ../data/.gitignore

To enable auto staging, run:

	dvc config core.autostage true
[2K[32m⠋[0m Checking graph                                                   [32m⠋[0m Checking graph
Adding...                                                                       
![A
  0% Checking cache in '/mnt/vol_d/

In [15]:
# Turn on core.autostage, the auto-staging option suggested by the `dvc add` output.
!dvc config core.autostage true

[0m

Adding google drive folder as a remote data storage

In [16]:
# Register a Google Drive folder as the default DVC remote under the name 'myremote'.
# NOTE(review): the folder id is hard-coded here; confirm it is meant to be shared.
!cd .. && dvc remote add --default myremote gdrive://1MypipdcBtjmYnO3OQQmLxKM3SWfwmE2p

Setting 'myremote' as a default remote.
[0m

In [17]:
# Set the gdrive_acknowledge_abuse option on 'myremote'.
# NOTE(review): presumably needed so Drive serves files it has flagged — confirm against the DVC docs.
!dvc remote modify myremote gdrive_acknowledge_abuse true

[0m

Pushing dvc tracked files to remote storage

In [18]:
# Upload the DVC-tracked files to the default remote.
!dvc push

  0% Transferring|                                   |0/3 [00:00<?,     ?file/s]
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_0.00/205k [00:00<?,        ?B/s][A
 33% Transferring|██████████▎                    |1/3 [00:04<00:08,  4.05s/file][A
                                                                                [A
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi0.00/44.6k [00:00<?,        ?B/s][A

![A[A

  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A[A

  0%|          |/mnt/vol_d/VSCode_Workspace/cmi0.00/45.8k [00:00<?,        ?B/s][A[A
 18%|█▊        |/mnt/vol_d/VSCode_Workspac8.00k/44.6k [00:02<00:10,    3.68kB/s][A

 67% Transferring|████████████████████▋          |2/3 [00:07<00:03,  3.85s/file][A[A
100% Transferring|██████████████████████████████

Splitting again using a different random seed (37)

In [26]:
# Second split: same sizes and output paths as before, random seed 37.
train_val_test_split(
    data_x=sms_words,
    data_y=sms_labels,
    val_per=0.15,
    test_per=0.15,
    tr_path=train_path,
    val_path=val_path,
    te_path=test_path,
    random_seed=37,
)

Tracking the changes and committing them in dvc (the commit itself was done in the terminal)

In [27]:
# List the tracked files whose contents no longer match what their .dvc files record.
!dvc status

../data/test_data.csv.dvc:                                            core[39m>
	changed outs:
		modified:           ../data/test_data.csv
../data/train_data.csv.dvc:
	changed outs:
		modified:           ../data/train_data.csv
../data/val_data.csv.dvc:
	changed outs:
		modified:           ../data/val_data.csv
[0m

#### Checkout different Versions

Getting git log

In [31]:
# Show the commit history to find the commit hash of each split.
!git log

[33mcommit 3b04052025e336e41dba9b34c19979ddde504f5c[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m, [m[1;31morigin/main[m[33m)[m
Author: Rohan Dharmadhikari <drohan.1994@gmail.com>
Date:   Sun Feb 26 18:10:18 2023 +0530

    Second Split 37

[33mcommit 7eda936db5773bfd8f14a81c222eba24b13ed5e5[m
Author: Rohan Dharmadhikari <drohan.1994@gmail.com>
Date:   Sun Feb 26 18:09:55 2023 +0530

    seond split

[33mcommit ebb8d9e034a82321295bc3ba7b8044ffe3c8cfb4[m[33m ([m[1;33mtag: v1.0[m[33m)[m
Author: Rohan Dharmadhikari <drohan.1994@gmail.com>
Date:   Sun Feb 26 17:53:30 2023 +0530

    First Split 42

[33mcommit d1ab32da26043d4a4ee492035b17ab39ba746064[m
Author: Rohan Dharmadhikari <drohan.1994@gmail.com>
Date:   Sun Feb 26 17:52:31 2023 +0530

    till first split

[33mcommit 22b95fa3548542decf4a9c302cbbfc08dc0970dc[m
Author: Rohan Dharmadhikari <drohan.1994@gmail.com>
Date:   Sun Feb 26 17:52:01 2023 +0530

    data dvc init

[33mcommit d7f0ba22fcb7c6a8868c4760762a0

Checking out version - "First Split 42"

In [46]:
# Move git to the "First Split 42" commit (this only switches the .dvc pointer files).
!git checkout ebb8d9e034a82321295bc3ba7b8044ffe3c8cfb4

Previous HEAD position was 3b04052 Second Split 37
HEAD is now at ebb8d9e First Split 42


In [47]:
# Bring the csv files in the workspace in line with the .dvc files of the checked-out commit.
!dvc checkout

  0% Checkout|                                       |0/1 [00:00<?,     ?file/s]
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_0.00/205k [00:00<?,        ?B/s][A
100% Checkout|███████████████████████████████████|1/1 [00:00<00:00, 15.01file/s][A
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi0.00/45.8k [00:00<?,        ?B/s][A
  0% Checkout|                                   |2/? [00:00<00:00, 24.22file/s][A
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi0.00/44.6k [00:00<?,        ?B/s][A
[33mM[0m       ..[35m/data/[0m[95mtrain_data.csv[0m                       [A
[33mM[0m       ..[35m/data/[0m[95mtest_data.csv[0m
[33mM[0m       ..[35m/data/[0m[95mval_data.csv[0m
[0m

Getting distribution of sms_labels in split files

In [5]:
def get_label_dist(file_path):
    """Print how many rows of a split csv are labelled "ham" and how many "spam".

    Parameters
    ----------
    file_path : str
        Path of a csv readable by ``get_list_from_csv``.
    """
    # Only the labels are needed; the word lists are ignored.
    labels, _ = get_list_from_csv(file_path)

    print("Ham: {}, Spam: {}".format(labels.count("ham"), labels.count("spam")))

In [49]:
# Report the ham/spam counts of each file of the first split.
print("First Split (42)")
for heading, csv_path in (
    ("Training dataset:", train_path),
    ("Validation dataset:", val_path),
    ("Testing dataset:", test_path),
):
    print(heading)
    get_label_dist(csv_path)

First Split (42)
Training dataset:
Ham: 3403, Spam: 498
Validation dataset:
Ham: 712, Spam: 124
Testing dataset:
Ham: 712, Spam: 125


Checking out version - "Second Split 37"

In [50]:
# Move git to the "Second Split 37" commit (this only switches the .dvc pointer files).
!git checkout 3b04052025e336e41dba9b34c19979ddde504f5c

Previous HEAD position was ebb8d9e First Split 42
HEAD is now at 3b04052 Second Split 37


In [51]:
# Bring the csv files in the workspace in line with the .dvc files of the checked-out commit.
!dvc checkout

  0% Checkout|                                       |0/1 [00:00<?,     ?file/s]
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi0.00/44.0k [00:00<?,        ?B/s][A
100% Checkout|███████████████████████████████████|1/1 [00:00<00:00, 12.54file/s][A
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_0.00/207k [00:00<?,        ?B/s][A
  0% Checkout|                                   |2/? [00:00<00:00, 20.54file/s][A
![A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi_app0.00/? [00:00<?,        ?B/s][A
  0%|          |/mnt/vol_d/VSCode_Workspace/cmi0.00/43.6k [00:00<?,        ?B/s][A
[33mM[0m       ..[35m/data/[0m[95mtest_data.csv[0m                        [A
[33mM[0m       ..[35m/data/[0m[95mtrain_data.csv[0m
[33mM[0m       ..[35m/data/[0m[95mval_data.csv[0m
[0m

In [10]:
# Report the ham/spam counts of each file of the second split.
print("Second Split (37)")
for heading, csv_path in (
    ("Training dataset:", train_path),
    ("Validation dataset:", val_path),
    ("Testing dataset:", test_path),
):
    print(heading)
    get_label_dist(csv_path)

Second Split (37)
Training dataset:
Ham: 3394, Spam: 507
Validation dataset:
Ham: 719, Spam: 117
Testing dataset:
Ham: 714, Spam: 123
