# Importing libraries

In [1]:
# importing necessary libraries
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Useful functions

In [36]:
# function to load the data
def load_data(filepath):
    return pd.read_csv(filepath, sep='\t', quoting=csv.QUOTE_NONE,names = ['label', 'message'])

# set of english stopwords
stop_words = set(stopwords.words('english'))

# function for preprocessing messages
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stopword removal
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Converting all text to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Removing empty strings
    tokens = [token for token in tokens if token != '']
    
    return tokens

# function for encoding ham as 0 and spam as 1
def encode(text):
    if text == 'spam':
        return 1
    elif text == 'ham':
        return 0
    
# function to split into train and test data
def split_data(df, seed = 42, train_size = 0.7, val_size = 0.15, test_size = 0.15):
    train_df, val_test_df = train_test_split(df, test_size=1-train_size, random_state = seed)
    val_df, test_df = train_test_split(val_test_df, test_size=test_size/(1-train_size), random_state = seed)
    return train_df, val_df, test_df

# function to store as a csv file
def store_as_csv(df, name):
    return df.to_csv(name, index=False)

# function to print variable distribution
def print_variable_distribution(data, file_name):
    print("Number of 0s in " + file_name + " - " + str(len(data) - data.label.sum()))
    print("Number of 1s in " + file_name + " - " + str(data.label.sum()))

# Load the data

In [5]:
# load the sms data
messages = load_data('data/raw_data')

In [6]:
messages.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
store_as_csv(messages,name='./data/raw_data.csv')

# Initialize dvc

In [8]:
!dvc init --subdir

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

Adding google drive as remote storage

In [47]:
# !dvc remote add --default myremote gdrive://1OONKIsu54eysQwijR4EaFTWV96e81NLB
# !dvc remote modify myremote gdrive_acknowledge_abuse true
# !dvc push

In [9]:
!git add .dvc/config
# !git commit -m "Adding Gdrive as Remote"

Store raw_data.csv using dvc

In [10]:
!git rm -r --cached 'data/raw_data.csv'
!dvc add data/raw_data.csv 
!git commit -m "Added raw_data.csv"

fatal: pathspec 'data/raw_data.csv' did not match any files


[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/raw_data.csv |0.00 [00:00,     ?fi[A
                                                                                [A
![A
  0% Checking cache in '/home/utpalraj/coursework/AML/AppliedMachineLearning/Ass[A
                                                                                [A
![A
  0%|          |Adding data/raw_data.csv to cache     0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/utpalraj/coursework0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  8.40file/s][A

To track the changes with git, run:

	git add data/raw_data.csv.dvc data/.gitignore

To enable auto staging, run:

	dvc config core.autostage tru

In [11]:
!dvc status

Data and pipelines are up to date.                                              
[0m

# Data Preprocessing

In [12]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/utpalraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/utpalraj/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/utpalraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/utpalraj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing message

In [13]:
messages['message'] = messages['message'].apply(preprocess_text)

In [14]:
messages.head(10)

Unnamed: 0,label,message
0,ham,"[go, jurong, point, ,, crazy, .., available, b..."
1,ham,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor, ..., u, c, already, ..."
4,ham,"[nah, i, n't, think, go, usf, ,, life, around,..."
5,spam,"[freemsg, hey, darling, 's, 3, week, 's, word,..."
6,ham,"[even, brother, like, speak, ., they, treat, l..."
7,ham,"[as, per, request, 'melle, melle, (, oru, minn..."
8,spam,"[winner, !, !, as, valued, network, customer, ..."
9,spam,"[had, mobile, 11, month, ?, u, r, entitled, up..."


## Encode ham as 0 and spam as 1

In [15]:
messages['label'] = messages['label'].apply(encode)

# Splitting into Train, Validation and Test data

In [16]:
SEED1 = 2032
SEED2 = 2001

## using `SEED1`

In [17]:
# Split the data into training, validation, and testing sets
train_df, val_df, test_df = split_data(messages, seed=SEED1)

### Saving Train, Validation and Test Data as csv files

In [18]:
# Save the training, validation, and testing sets to CSV files
store_as_csv(train_df, name = './data/train.csv')
store_as_csv(val_df, name = './data/validation.csv')
store_as_csv(test_df, name = './data/test.csv')

In [19]:
!dvc status

Data and pipelines are up to date.                                              
[0m

In [20]:
!dvc add "./data/train.csv"
!dvc add "./data/validation.csv"
!dvc add "./data/test.csv"
!git add .
!git commit -m "Added train, validation and test data for SEED1"

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/train.csv |0.00 [00:00,     ?file/[A
                                                                                [A
![A
  0% Checking cache in '/home/utpalraj/coursework/AML/AppliedMachineLearning/Ass[A
                                                                                [A
![A
  0%|          |Adding data/train.csv to cache        0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/utpalraj/coursework0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 11.20file/s][A

To track the changes with git, run:

	git add data/.gitignore data/train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


## Using `SEED2`

In [21]:
# Split the data into training, validation, and testing sets
train_df, val_df, test_df = split_data(messages, seed=SEED2)

### Saving Train, Validation and Test Data as csv files

In [22]:
# Save the training, validation, and testing sets to CSV files
store_as_csv(train_df, name = './data/train.csv')
store_as_csv(val_df, name = './data/validation.csv')
store_as_csv(test_df, name = './data/test.csv')

In [23]:
!dvc status

data/train.csv.dvc:                                                             
	changed outs:
		modified:           data/train.csv
data/validation.csv.dvc:
	changed outs:
		modified:           data/validation.csv
data/test.csv.dvc:
	changed outs:
		modified:           data/test.csv
[0m

In [24]:
!dvc add "./data/train.csv"
!dvc add "./data/validation.csv"
!dvc add "./data/test.csv"
!git add .
!git commit -m "Added train, validation and test data for SEED2"

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/train.csv |0.00 [00:00,     ?file/[A
                                                                                [A
![A
  0% Checking cache in '/home/utpalraj/coursework/AML/AppliedMachineLearning/Ass[A
                                                                                [A
![A
  0%|          |Adding data/train.csv to cache        0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/utpalraj/coursework0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 20.17file/s][A

To track the changes with git, run:

	git add data/train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[?25l[32m⠋[0m 

# Target Variable Distribution of the First Version of Data

In [25]:
!git log --oneline

[33m3eb4bc5[m[33m ([m[1;36mHEAD[m[33m)[m Added train, validation and test data for SEED2
[33m2ab030d[m Added train, validation and test data for SEED1
[33mc06e4a6[m Added raw_data.csv
[33m4e5e5e6[m[33m ([m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m Minor correction
[33m4277718[m Corrected some minor errors
[33ma60f4a4[m Minor Change to train.ipynb
[33m57f6a3c[m updated notebooks
[33m497550a[m updated notebooks
[33m8c900f7[m Updated Notebooks
[33mebfba18[m updated
[33m4d1d301[m updated train.ipynb
[33m4969443[m Updated Notebooks
[33ma48d8d4[m Updated prepare.ipynb
[33m52bbc8e[m Modified prepare.ipynb and generated output
[33mf2a81a0[m Downloaded Data and Segmented
[33meb4aabb[m Created prepare and train notebooks
[33me9305a1[m Initial commit


In [None]:
!git checkout e20be57 "./data/train.csv.dvc"
!git checkout e20be57 "./data/validation.csv.dvc"
!git checkout e20be57 "./data/test.csv.dvc"

Updated 0 paths from 11a9884
Updated 1 path from 11a9884
Updated 1 path from 11a9884


In [33]:
!dvc pull

Collecting                                            |0.00 [00:00,    ?entry/s]
Fetching
Building workspace index                              |5.00 [00:00,  331entry/s]
Comparing indexes                                     |6.00 [00:00,  876entry/s]
Applying changes                                      |3.00 [00:00,   130file/s]
[33mM[0m       data/test.csv
[33mM[0m       data/train.csv
[33mM[0m       data/validation.csv
[33mNo remote provided and no default remote set.[0m
3 files modified
[0m

In [34]:
train = pd.read_csv('./data/train.csv')
validate = pd.read_csv('./data/validation.csv')
test = pd.read_csv('./data/test.csv')

In [38]:
print_variable_distribution(train, "train.csv")
print_variable_distribution(validate, "validation.csv")
print_variable_distribution(test, "test.csv")

Number of 0s in train.csv - 3384
Number of 1s in train.csv - 517
Number of 0s in validation.csv - 726
Number of 1s in validation.csv - 110
Number of 0s in test.csv - 717
Number of 1s in test.csv - 120


# Target Variable Distribution of the Updated Version of Data

In [39]:
!git log --oneline

[33m3eb4bc5[m[33m ([m[1;36mHEAD[m[33m)[m Added train, validation and test data for SEED2
[33m2ab030d[m Added train, validation and test data for SEED1
[33mc06e4a6[m Added raw_data.csv
[33m4e5e5e6[m[33m ([m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m Minor correction
[33m4277718[m Corrected some minor errors
[33ma60f4a4[m Minor Change to train.ipynb
[33m57f6a3c[m updated notebooks
[33m497550a[m updated notebooks
[33m8c900f7[m Updated Notebooks
[33mebfba18[m updated
[33m4d1d301[m updated train.ipynb
[33m4969443[m Updated Notebooks
[33ma48d8d4[m Updated prepare.ipynb
[33m52bbc8e[m Modified prepare.ipynb and generated output
[33mf2a81a0[m Downloaded Data and Segmented
[33meb4aabb[m Created prepare and train notebooks
[33me9305a1[m Initial commit


In [40]:
!git checkout 3eb4bc5 "./data/train.csv.dvc"
!git checkout 3eb4bc5 "./data/validation.csv.dvc"
!git checkout 3eb4bc5 "./data/test.csv.dvc"

Updated 1 path from eba9b99
Updated 1 path from eba9b99
Updated 1 path from eba9b99


In [41]:
!dvc pull

Collecting                                            |0.00 [00:00,    ?entry/s]
Fetching
Building workspace index                              |5.00 [00:00,  517entry/s]
Comparing indexes                                     |6.00 [00:00,  868entry/s]
Applying changes                                      |3.00 [00:00,   197file/s]
[33mM[0m       data/train.csv
[33mM[0m       data/validation.csv
[33mM[0m       data/test.csv
[33mNo remote provided and no default remote set.[0m
3 files modified
[0m

In [42]:
train = pd.read_csv('./data/train.csv')
validate = pd.read_csv('./data/validation.csv')
test = pd.read_csv('./data/test.csv')

In [43]:
print_variable_distribution(train, "train.csv")
print_variable_distribution(validate, "validation.csv")
print_variable_distribution(test, "test.csv")

Number of 0s in train.csv - 3379
Number of 1s in train.csv - 522
Number of 0s in validation.csv - 725
Number of 1s in validation.csv - 111
Number of 0s in test.csv - 723
Number of 1s in test.csv - 114
