# Importing libraries

In [1]:
# importing necessary libraries
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize dvc

In [2]:
!dvc init --subdir

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

# Adding google drive as remote storage

In [3]:
!dvc remote add --default drive gdrive://1OONKIsu54eysQwijR4EaFTWV96e81NLB

Setting 'drive' as a default remote.
[0m

# Load and see data

In [4]:
# see the number of messages
messages = [line.rstrip() for line in open('./data/raw_data')]
print(len(messages))

5574


In [5]:
# see first 10 messages
for message_no, message in enumerate(messages[:10]):
    print(message_no, message)

0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
1 ham	Ok lar... Joking wif u oni...
2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3 ham	U dun say so early hor... U c already then say...
4 ham	Nah I don't think he goes to usf, he lives around here though
5 spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
6 ham	Even my brother is not like to speak with me. They treat me like aids patent.
7 ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8 spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 

# Useful functions

In [6]:
# function to load the data
def load_data(filepath):
    return pd.read_csv(filepath, sep='\t', quoting=csv.QUOTE_NONE,names = ['label', 'message'])

# set of english stopwords
stop_words = set(stopwords.words('english'))

# function for preprocessing messages
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stopword removal
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Converting all text to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Removing empty strings
    tokens = [token for token in tokens if token != '']
    
    return tokens

# function for encoding ham as 0 and spam as 1
def encode(text):
    if text == 'spam':
        return 1
    elif text == 'ham':
        return 0

# function to split into train and test data
def split_data(df, seed = 42, train_size = 0.7, test_size = 0.15):
    train_df, val_test_df = train_test_split(df, test_size=1-train_size, random_state = seed)
    val_df, test_df = train_test_split(val_test_df, test_size=test_size/(1-train_size), random_state = seed)
    return train_df, val_df, test_df

# function to store as a csv file
def store_as_csv(df, name):
    return df.to_csv(name, index=False)

In [7]:
# load the sms data
messages = load_data('data/raw_data')

In [8]:
messages.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# Save the raw data as a csv file

In [9]:
store_as_csv(messages, name = './data/raw_data.csv')

Add raw_data.csv to dvc

In [10]:
!dvc add "./data/raw_data.csv"

[?25l                                                                core[39m>[32m⠋[0m Checking graph
Adding...                                                                       
![A
Collecting files and computing hashes in data/raw_data.csv |0.00 [00:00,     ?fi[A
                                                                                [A
![A
  0% Checking cache in '/home/utpalraj/coursework/AML/AppliedMachineLearning/Ass[A
                                                                                [A
![A
  0%|          |Adding data/raw_data.csv to cache     0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/utpalraj/coursework0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 10.29file/s][A

To track the changes with git, run:

	git add data/.gitignore data/raw_data.csv.dvc

To enable auto staging, run:

	dvc 

In [11]:
!git add .
!git commit -m "added raw_csv"

[main 34c544b] added raw_csv
 6 files changed, 9880 insertions(+)
 create mode 100644 Assignment2/data/.gitignore
 create mode 100644 Assignment2/data/raw_data
 create mode 100644 Assignment2/data/raw_data.csv.dvc
 create mode 100644 Assignment2/prepare.ipynb
 create mode 100644 Assignment2/train.ipynb


# Data Preprocessing

In [12]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/utpalraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/utpalraj/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/utpalraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/utpalraj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing message

In [13]:
messages['message'] = messages['message'].apply(preprocess_text)

In [14]:
messages.head(10)

Unnamed: 0,label,message
0,ham,"[go, jurong, point, ,, crazy, .., available, b..."
1,ham,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor, ..., u, c, already, ..."
4,ham,"[nah, i, n't, think, go, usf, ,, life, around,..."
5,spam,"[freemsg, hey, darling, 's, 3, week, 's, word,..."
6,ham,"[even, brother, like, speak, ., they, treat, l..."
7,ham,"[as, per, request, 'melle, melle, (, oru, minn..."
8,spam,"[winner, !, !, as, valued, network, customer, ..."
9,spam,"[had, mobile, 11, month, ?, u, r, entitled, up..."


## Encode ham as 0 and spam as 1

In [15]:
messages['label'] = messages['label'].apply(encode)

# Splitting into Train, Validation and Test data

In [16]:
SEED1 = 2032
SEED2 = 202352

In [17]:
# Split the data into training, validation, and testing sets
train_df, val_df, test_df = split_data(messages, seed=SEED1)

# Saving Train, Validation and Test Data as csv files

### for `SEED1`

In [18]:
# Save the training, validation, and testing sets to CSV files
store_as_csv(train_df, name = './data/train.csv')
store_as_csv(val_df, name = './data/validation.csv')
store_as_csv(test_df, name = './data/test.csv')

Adding the train, validation and test data via dvc

In [19]:
!dvc add data/train.csv data/validation.csv data/test.csv -q
!git add .
!git commit -m "added data for random seed 2032" 

[?25l[32m⠋[0m Checking graph
[1A[2K[0m[main e20be57] added data for random seed 2032
 5 files changed, 45 insertions(+), 12 deletions(-)
 create mode 100644 Assignment2/data/test.csv.dvc
 create mode 100644 Assignment2/data/train.csv.dvc
 create mode 100644 Assignment2/data/validation.csv.dvc


### for `SEED2`

In [20]:
# Split the data into training, validation, and testing sets
train_df, val_df, test_df = split_data(messages, seed=SEED2)

In [21]:
# Save the training, validation, and testing sets to CSV files
store_as_csv(train_df, name = './data/train.csv')
store_as_csv(val_df, name = './data/validation.csv')
store_as_csv(test_df, name = './data/test.csv')

Adding train, validation and test data via dvc for different seed

In [None]:
!dvc add data/train.csv data/validation.csv data/test.csv -q
!git add .
!git commit -m "added data for random seed 202352" 

[?25l[32m⠋[0m Checking graph
[1A[2K[0mOn branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   .dvc/config[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mdata/[m
	[31mprepare.ipynb[m
	[31mtrain.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")
