* track the versions of data using dvc
* load the raw data into raw_data.csv and save the split data into train.csv/validation.csv/test.csv
* update the train/validation/test split by choosing a different random seed
* check out the first version (before the update) using dvc and print the distribution of the target variable (number of 0s and number of 1s) in train.csv, validation.csv, and test.csv


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the raw SMS dataset, relative to the notebook's working directory.
FILE_PATH = "sms_spam_collection/SMSSpamCollection"

In [3]:
import csv

# The raw file has no header row and is tab-separated: <label>\t<message>.
# QUOTE_NONE makes pandas treat double quotes inside a message as literal text;
# with the default quoting, an unbalanced '"' can merge the following lines into
# one field and silently drop rows.
df = pd.read_csv(
    FILE_PATH,
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [5]:
# Fetch the NLTK resources needed by the text preprocessing in this notebook.
# Tokenizer data for word_tokenize; presumably both 'punkt' and 'punkt_tab' are
# fetched because the required one depends on the installed NLTK version — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
# Word list read through stopwords.words('english').
nltk.download('stopwords')
# Lexical database used by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/turing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/turing/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/turing/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/turing/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Lazily-built NLTK resources shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the cached English stop-word set and lemmatizer, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        # stopwords.words() hands back a fresh list on each call; the original code
        # called it once per token. Load it once and keep it as a set so the
        # membership test below is O(1) instead of a list scan.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalize one raw message into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, remove runs of digits, remove ASCII punctuation
    (``string.punctuation``), tokenize with NLTK, drop English stop words, and
    lemmatize each remaining token with WordNet.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none remain).
    """
    resources = _preprocess_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [7]:
# Clean every raw message and store the result alongside the original text.
cleaned_messages = df['message'].apply(preprocess_text)
df['preprocessed_message'] = cleaned_messages

In [8]:
# Preview ten random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(10, random_state=42)

Unnamed: 0,label,message,preprocessed_message
2447,ham,"Sorry, I'll call later",sorry ill call later
2863,spam,Adult 18 Content Your video will be with you s...,adult content video shortly
57,ham,"Sorry, I'll call later in meeting.",sorry ill call later meeting
2664,spam,8007 FREE for 1st week! No1 Nokia tone 4 ur mo...,free st week nokia tone ur mob every week txt ...
1463,spam,09066362231 URGENT! Your mobile No 07xxxxxxxxx...,urgent mobile xxxxxxxxx £ bonus caller prize n...
5388,ham,NOT MUCH NO FIGHTS. IT WAS A GOOD NITE!!,much fight good nite
519,ham,That way transport is less problematic than on...,way transport less problematic sat night way u...
753,ham,Dont gimme that lip caveboy,dont gim lip caveboy
1268,ham,SERIOUSLY. TELL HER THOSE EXACT WORDS RIGHT NOW.,seriously tell exact word right
4265,ham,She just broke down a list of reasons why nobo...,broke list reason nobody town cant tell shes s...


In [9]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Guarded so that re-running this cell is a no-op: mapping an already-encoded
# column again would turn every 0/1 into NaN, because those keys are not in the mapping.
label_to_id = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_to_id)

## Version 1

In [None]:
# split the data into train/validation/test
from sklearn.model_selection import train_test_split

# First hold out 20% as the test set, then take 25% of the remaining 80% as
# validation, i.e. 60/20/20 overall. Seed 42 defines "version 1" of the split.
train_val_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.25)

In [17]:
import os

# Output folder for the version 1 splits; exist_ok avoids an error on re-run.
version_1_dir = 'processed_data/version_1'
os.makedirs(version_1_dir, exist_ok=True)

In [None]:
# print percentage of each label
print("Train : ", train_df['label'].value_counts() / len(train_df))
print("Validation : ", val_df['label'].value_counts() / len(val_df))
print("Test : ", test_df['label'].value_counts() / len(test_df))

In [None]:
# store the splits at train.csv/validation.csv/test.csv
train_df.to_csv('processed_data/version_1/train.csv', index=False, sep='\t')
val_df.to_csv('processed_data/version_1/validation.csv', index=False, sep='\t')
test_df.to_csv('processed_data/version_1/test.csv', index=False, sep='\t')

In [None]:
# NOTE(review): `repo` is never created in the visible cells; this cell assumes it is a
# dvc.repo.Repo opened on this project (it is used via .add / .scm) — confirm it is
# defined before this cell runs, otherwise Restart & Run All fails here.
version_1_dir = 'processed_data/version_1'
split_files = ['train.csv', 'validation.csv', 'test.csv']

# Put each split under DVC control.
for split_file in split_files:
    repo.add(f'{version_1_dir}/{split_file}')

# Commit the small DVC pointer files (<name>.csv.dvc), not the CSVs themselves:
# per the DVC docs, `dvc add` git-ignores the data via a .gitignore written next to it.
# Paths are listed explicitly rather than with a glob, since the SCM wrapper is not
# guaranteed to expand wildcard patterns.
repo.scm.add(
    [f'{version_1_dir}/{split_file}.dvc' for split_file in split_files]
    + [f'{version_1_dir}/.gitignore', '.gitignore']
)
repo.scm.commit("version 1")
repo.scm.tag('v1')

## Version 2

In [None]:
# Same 60/20/20 scheme as before, but seed 43 produces a different assignment of rows.
train_val_df, test_df = train_test_split(df, random_state=43, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=43, test_size=0.25)

# Fraction of each label in every split.
for split_title, split_frame in (
    ("Train : ", train_df),
    ("Validation : ", val_df),
    ("Test : ", test_df),
):
    label_counts = split_frame['label'].value_counts()
    print(split_title, label_counts / len(split_frame))

In [40]:
import os

# Output folder for the version 2 splits; exist_ok avoids an error on re-run.
version_2_dir = 'processed_data/version_2'
os.makedirs(version_2_dir, exist_ok=True)

In [41]:
# Write the version 2 splits as tab-separated files without the index column.
version_2_outputs = {
    'processed_data/version_2/train.csv': train_df,
    'processed_data/version_2/validation.csv': val_df,
    'processed_data/version_2/test.csv': test_df,
}
for csv_path, split_frame in version_2_outputs.items():
    split_frame.to_csv(csv_path, sep='\t', index=False)

In [None]:
# NOTE(review): `repo` is never created in the visible cells; this cell assumes it is a
# dvc.repo.Repo opened on this project (it is used via .add / .scm) — confirm it is
# defined before this cell runs.
version_2_dir = 'processed_data/version_2'
split_files = ['train.csv', 'validation.csv', 'test.csv']

# Put each split under DVC control.
for split_file in split_files:
    repo.add(f'{version_2_dir}/{split_file}')

# Commit the DVC pointer files (<name>.csv.dvc) plus the .gitignore that, per the DVC
# docs, `dvc add` writes next to the data. Paths are listed explicitly rather than
# with a glob, since the SCM wrapper is not guaranteed to expand wildcard patterns.
repo.scm.add(
    [f'{version_2_dir}/{split_file}.dvc' for split_file in split_files]
    + [f'{version_2_dir}/.gitignore', '.gitignore']
)
repo.scm.commit("version 2")
repo.scm.tag('v2')

## Load versions and check class distribution

In [None]:
# Inspection only: show the source-control handle attached to `repo`
# (presumably the Git wrapper of the DVC repo object — confirm). No state is changed.
repo.scm

### Version 1

In [None]:
# NOTE(review): `repo` is assumed to be the dvc.repo.Repo used in the cells above —
# it is not created anywhere in the visible notebook; confirm.
# Restore version 1: Git brings back the pointer files committed under tag 'v1',
# then DVC restores the data files those pointers reference.
repo.scm.checkout('v1')
repo.checkout()

# The version 1 splits were written under processed_data/version_1/.
train_data = pd.read_csv('processed_data/version_1/train.csv', sep='\t')
val_data = pd.read_csv('processed_data/version_1/validation.csv', sep='\t')
test_data = pd.read_csv('processed_data/version_1/test.csv', sep='\t')

# Target distribution per split: absolute number of 0s and 1s, then the same as fractions.
for split_title, split_frame in (
    ("Train : ", train_data),
    ("Validation : ", val_data),
    ("Test : ", test_data),
):
    label_counts = split_frame['label'].value_counts()
    print(split_title, label_counts)
    print(split_title, label_counts / len(split_frame))


### Version 2

In [None]:
# NOTE(review): `repo` is assumed to be the dvc.repo.Repo used in the cells above —
# it is not created anywhere in the visible notebook; confirm.
# Restore version 2. The original passed 'Version 2' to repo.checkout(), where it is
# treated as a DVC target, not a Git revision; the revision was tagged 'v2', so check
# that tag out with Git first and then let DVC restore the matching data files.
repo.scm.checkout('v2')
repo.checkout()

# The version 2 splits were written under processed_data/version_2/.
train_data = pd.read_csv('processed_data/version_2/train.csv', sep='\t')
val_data = pd.read_csv('processed_data/version_2/validation.csv', sep='\t')
test_data = pd.read_csv('processed_data/version_2/test.csv', sep='\t')

# Target distribution per split: absolute number of 0s and 1s, then the same as fractions.
for split_title, split_frame in (
    ("Train : ", train_data),
    ("Validation : ", val_data),
    ("Test : ", test_data),
):
    label_counts = split_frame['label'].value_counts()
    print(split_title, label_counts)
    print(split_title, label_counts / len(split_frame))