* track the versions of data using dvc
* load the raw data into raw_data.csv and save the split data into train.csv/validation.csv/test.csv
* update train/validation/test split by choosing different random seed
* checkout the first version (before update) using dvc and print the distribution of target variable (number of 0s and number of 1s) in train.csv, validation.csv, and test.csv


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw SMS spam collection file, relative to the working directory.
FILE_PATH = 'sms_spam_collection/SMSSpamCollection'

In [None]:
# Load the raw file as tab-separated text, naming the two columns explicitly
# ('label' and 'message').
# NOTE(review): pandas' default quote handling may merge rows when a message
# contains an unbalanced double quote -- consider quoting=csv.QUOTE_NONE and
# confirm against the expected row count.
df = pd.read_csv(FILE_PATH, sep='\t', names=['label', 'message'])

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
# Fetch the NLTK resources requested by this notebook, in the same order as
# before: tokenizer data, the stop-word corpus and WordNet.
for resource_name in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

In [None]:
def preprocess_text(text):
    """Normalise a raw message into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, remove every run of digits, strip
    all ASCII punctuation characters, tokenize with ``word_tokenize``, drop
    English stop words, and lemmatize each remaining token with
    ``WordNetLemmatizer`` using its default settings.

    Args:
        text: The message to clean (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)

    # Build the stop-word lookup once per call instead of re-reading the
    # corpus for every token; a set also makes each membership test O(1).
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [None]:
# Add a cleaned-text column by running preprocess_text on every raw message.
df['preprocessed_message'] = df['message'].apply(preprocess_text)

In [None]:
# Eyeball 10 randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(10)

In [None]:
# Encode the label column as integers: ham -> 0, spam -> 1.
# Values not present in the mapping become NaN, so re-running this cell after
# the column has already been encoded turns every label into NaN.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [None]:
import dvc.api
from dvc.repo import Repo

In [None]:
# Initialise a DVC repository with Repo.init's default arguments and keep the
# handle for the add/commit/checkout calls below.
# NOTE(review): re-running this cell in an already-initialised directory
# presumably fails unless force=True is passed -- confirm.
repo = Repo.init()

## Version 1

In [None]:
# Version 1 split (seed 42): hold out 20% of the rows for test, then take 25%
# of the remainder for validation, i.e. roughly 60/20/20 overall.
from sklearn.model_selection import train_test_split

train_val_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(
    train_val_df,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Report, for every split, the fraction of rows carrying each label value.
for split_name, split_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    label_share = split_df['label'].value_counts() / len(split_df)
    print(f"{split_name} : ", label_share)

In [None]:
# store the splits at train.csv/validation.csv/test.csv (tab-separated, no index)
# DataFrame.to_csv does not create missing parent directories, so make sure the
# version_1 output directory exists before writing into it.
import os

os.makedirs('processed_data/version_1', exist_ok=True)

train_df.to_csv('processed_data/version_1/train.csv', index=False, sep='\t')
val_df.to_csv('processed_data/version_1/validation.csv', index=False, sep='\t')
test_df.to_csv('processed_data/version_1/test.csv', index=False, sep='\t')

In [None]:
# Register the three version-1 split files with DVC, one add call per file.
for tracked_path in (
    'processed_data/version_1/train.csv',
    'processed_data/version_1/validation.csv',
    'processed_data/version_1/test.csv',
):
    repo.add(tracked_path)

# NOTE(review): 'Version 1' looks like it is meant as a commit label; confirm
# that Repo.commit accepts a message here rather than a target path.
repo.commit('Version 1')

## Version 2

In [None]:
# Version 2 split: same 20% test / 25%-of-remainder validation proportions as
# before, but drawn with seed 43.
train_val_df, test_df = train_test_split(df, random_state=43, test_size=0.2)
train_df, val_df = train_test_split(
    train_val_df,
    random_state=43,
    test_size=0.25,
)

# Report, for every split, the fraction of rows carrying each label value.
for split_name, split_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    label_share = split_df['label'].value_counts() / len(split_df)
    print(f"{split_name} : ", label_share)

In [None]:
# Store the version-2 splits as tab-separated files without the index column.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# version_2 output directory exists before writing into it.
import os

os.makedirs('processed_data/version_2', exist_ok=True)

train_df.to_csv('processed_data/version_2/train.csv', index=False, sep='\t')
val_df.to_csv('processed_data/version_2/validation.csv', index=False, sep='\t')
test_df.to_csv('processed_data/version_2/test.csv', index=False, sep='\t')

In [None]:
# Register the three version-2 split files with DVC, one add call per file.
for tracked_path in (
    'processed_data/version_2/train.csv',
    'processed_data/version_2/validation.csv',
    'processed_data/version_2/test.csv',
):
    repo.add(tracked_path)

# NOTE(review): 'Version 2' looks like it is meant as a commit label; confirm
# that Repo.commit accepts a message here rather than a target path.
repo.commit('Version 2')

## Load versions and check class distribution

### Version 1

In [None]:
# Ask DVC to check out 'Version 1' before inspecting the splits.
# NOTE(review): 'Version 1' is the same string given to repo.commit earlier;
# confirm that Repo.checkout treats it as a version label and not as a target
# path to restore.
repo.checkout('Version 1')

In [None]:
# Read back the version-1 splits. They were written to
# processed_data/version_1/ (no cell shown here writes files directly under
# processed_data/), so load them from that directory.
train_data = pd.read_csv('processed_data/version_1/train.csv', sep='\t')
val_data = pd.read_csv('processed_data/version_1/validation.csv', sep='\t')
test_data = pd.read_csv('processed_data/version_1/test.csv', sep='\t')

# Fraction of rows carrying each label value, per split.
print("Train : ", train_data['label'].value_counts() / len(train_data))
print("Validation : ", val_data['label'].value_counts() / len(val_data))
print("Test : ", test_data['label'].value_counts() / len(test_data))


### Version 2

In [None]:
# NOTE(review): confirm that Repo.checkout treats 'Version 2' as a version
# label and not as a target path to restore.
repo.checkout('Version 2')

# Read back the version-2 splits. They were written to
# processed_data/version_2/ (no cell shown here writes files directly under
# processed_data/), so load them from that directory.
train_data = pd.read_csv('processed_data/version_2/train.csv', sep='\t')
val_data = pd.read_csv('processed_data/version_2/validation.csv', sep='\t')
test_data = pd.read_csv('processed_data/version_2/test.csv', sep='\t')

# Fraction of rows carrying each label value, per split.
print("Train : ", train_data['label'].value_counts() / len(train_data))
print("Validation : ", val_data['label'].value_counts() / len(val_data))
print("Test : ", test_data['label'].value_counts() / len(test_data))