### Imports

In [5]:
import os
import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### Load, preprocess, and split the data (first version, seed 42)

In [6]:

# Make sure the output directory for the CSV files written below exists;
# exist_ok=True lets this cell be re-run without raising if it already does.
os.makedirs('data', exist_ok=True)

# Lazily-built resources shared by every preprocess_text() call, so the
# stopword list is loaded once instead of once per token.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on first use."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests; the original
        # list was rebuilt and scanned linearly for every single token.
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Tokenize ``text``, drop English stopwords, and lemmatize what remains.

    Args:
        text: The raw message as a string.

    Returns:
        The surviving tokens, lemmatized and joined by single spaces. An
        input with no surviving tokens yields an empty string.

    Notes:
        The stopword comparison is case-insensitive (each token is lowercased
        for the check only); the tokens themselves keep their original case.
        ``lemmatize`` is called with its default arguments.
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)  # Lemmatization

file_path = "sms+spam+collection/SMSSpamCollection"

# Read every line up front so the progress bar below knows its total.
with open(file_path, 'r', encoding='utf-8') as handle:
    lines = handle.readlines()

ham_messages, spam_messages = [], []

# The leading label picks the bucket; the message text starts one character
# after the label (i.e. past the separator that follows it).
label_buckets = (("ham", ham_messages), ("spam", spam_messages))
for line in tqdm(lines, desc="reading file", unit="line"):
    for prefix, bucket in label_buckets:
        if line.startswith(prefix):
            bucket.append(preprocess_text(line[len(prefix) + 1:].strip()))
            break

# All ham rows come first, followed by all spam rows.
texts = ham_messages + spam_messages
labels = [0] * len(ham_messages) + [1] * len(spam_messages)  # 1 for spam, 0 for ham
data = pd.DataFrame({"text": texts, "label": labels})

# Save raw data
data.to_csv("data/raw_data.csv", index=False)
print("Raw data saved to data/raw_data.csv")

# Half of the rows go to train; the other half is split evenly into
# validation and test.
train, temp_df = train_test_split(data, test_size=0.5, random_state=42)
validation, test = train_test_split(temp_df, test_size=0.5, random_state=42)

for csv_path, frame in (
    ("data/train.csv", train),
    ("data/validation.csv", validation),
    ("data/test.csv", test),
):
    frame.to_csv(csv_path, index=False)

print("First version splits saved with random_state=42")
print(f"Train size: {len(train)}, Validation size: {len(validation)}, Test size: {len(test)}")

print("\nFirst Version - Distribution of Target Variable:")
for heading, frame in (("Train:", train), ("Validation:", validation), ("Test:", test)):
    print(heading, frame['label'].value_counts().to_dict())




reading file:   0%|          | 0/5574 [00:00<?, ?line/s]

reading file: 100%|██████████| 5574/5574 [00:04<00:00, 1349.25line/s]

Raw data saved to data/raw_data.csv
First version splits saved with random_state=42
Train size: 2787, Validation size: 1393, Test size: 1394

First Version - Distribution of Target Variable:
Train: {0: 2404, 1: 383}
Validation: {0: 1212, 1: 181}
Test: {0: 1211, 1: 183}





### Initialize DVC and commit the first version

In [8]:
# Initialize DVC in the current directory.
# NOTE(review): --subdir is presumably needed because this notebook lives below
# the Git repository root, and -f re-initializes over any existing .dvc/ —
# confirm against the `dvc init` documentation.
!dvc init --subdir -f

# Put the raw data and the three seed-42 split files under DVC tracking.
!dvc add data/raw_data.csv
!dvc add data/train.csv
!dvc add data/validation.csv
!dvc add data/test.csv

# Stage the DVC pointer files (data/*.dvc), data/.gitignore and the .dvc
# directory in Git, then commit them as the first data version.
!git add data/.gitignore data/*.dvc .dvc
!git commit -m "First data split with seed 42"

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[?25l[32m⠋[0m Checking graph                                       co

### Record the commit hash of the first version

In [9]:
import subprocess

# Ask Git for the full hash of HEAD; check=True raises if the command fails.
rev_parse = subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE, check=True)
commit_hash = rev_parse.stdout.decode('utf-8').strip()

# Persist the hash so that this version can be checked out again later.
with open('first_version_commit.txt', 'w') as f:
    f.write(commit_hash)
print(f"First version commit hash: {commit_hash}")

First version commit hash: bfa20f525a1c866dd8ff7daaac1ccd228fbd9ca1


### Create the second version of the splits (seed 123)

In [10]:
# Same 50/25/25 split procedure as before, but seeded with 123; the result
# overwrites the split CSVs on disk.
train, temp_df = train_test_split(data, test_size=0.5, random_state=123)
validation, test = train_test_split(temp_df, test_size=0.5, random_state=123)

for csv_path, frame in (
    ("data/train.csv", train),
    ("data/validation.csv", validation),
    ("data/test.csv", test),
):
    frame.to_csv(csv_path, index=False)

print("\nSecond version splits saved with random_state=123")
print(f"Train size: {len(train)}, Validation size: {len(validation)}, Test size: {len(test)}")

print("\nSecond Version - Distribution of Target Variable:")
for heading, frame in (("Train:", train), ("Validation:", validation), ("Test:", test)):
    print(heading, frame['label'].value_counts().to_dict())


Second version splits saved with random_state=123
Train size: 2787, Validation size: 1393, Test size: 1394

Second Version - Distribution of Target Variable:
Train: {0: 2413, 1: 374}
Validation: {0: 1208, 1: 185}
Test: {0: 1206, 1: 188}


### Track the second version with DVC and commit it

In [11]:
# Re-add the three split files so DVC records their new contents.
# raw_data.csv is not re-added here.
!dvc add data/train.csv
!dvc add data/validation.csv
!dvc add data/test.csv

# Stage the DVC pointer files in Git and commit them as the second data version.
!git add data/*.dvc
!git commit -m "Updated data split with seed 123"

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/train.csv |0.00 [00:00,     ?file/[A
                                                                                [A
![A
  0% Checking cache in '/Users/shoru/Desktop/AML_git/Applied-Machine-Learning/As[A
                                                                                [A
![A
  0%|          |Adding data/train.csv to cache        0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /Users/shoru/Desktop/AML_0/1 [00:00<?,    ?files/s][A
100% Adding...|███████████████████████████████████████|1/1 [00:00, 106.80file/s][A

To track the changes with git, run:

	git add data/train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[?25l[32m⠋[0m 

### Record the second commit hash and check out the first version

In [12]:
commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip()
with open('second_version_commit.txt', 'w') as f:
    f.write(commit_hash)
print(f"Second version commit hash: {commit_hash}")

print("\n=== Checking out first version ===")
!git checkout $(cat first_version_commit.txt) -- data/*.dvc
!dvc checkout

train = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')


print("\nDistribution for first version (seed 42):")
print("Train:", train['label'].value_counts().to_dict())
print("Validation:", validation['label'].value_counts().to_dict())
print("Test:", test['label'].value_counts().to_dict())

Second version commit hash: 19353c399dc18a3f7fd271273063de1c17680d4b

=== Checking out first version ===
Building workspace index                              |5.00 [00:00,  956entry/s]
Comparing indexes                                    |6.00 [00:00, 6.98kentry/s]
Applying changes                                      |3.00 [00:00, 3.71kfile/s]
[33mM[0m       data/test.csv
[33mM[0m       data/train.csv
[33mM[0m       data/validation.csv
[0m
Distribution for first version (seed 42):
Train: {0: 2404, 1: 383}
Validation: {0: 1212, 1: 181}
Test: {0: 1211, 1: 183}


### Switch back to the second version and fit the TF-IDF vectorizer

In [13]:
print("\n=== Checking out second version ===")
!git checkout $(cat second_version_commit.txt) -- data/*.dvc
!dvc checkout

train = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')

print("\nDistribution for second version (seed 123):")
print("Train:", train['label'].value_counts().to_dict())
print("Validation:", validation['label'].value_counts().to_dict())
print("Test:", test['label'].value_counts().to_dict())

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(data['text'])
import pickle
with open('data/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


=== Checking out second version ===
Building workspace index                             |5.00 [00:00, 1.35kentry/s]
Comparing indexes                                    |6.00 [00:00, 7.09kentry/s]
Applying changes                                      |3.00 [00:00, 3.22kfile/s]
[33mM[0m       data/test.csv
[33mM[0m       data/train.csv
[33mM[0m       data/validation.csv
[0m
Distribution for second version (seed 123):
Train: {0: 2413, 1: 374}
Validation: {0: 1208, 1: 185}
Test: {0: 1206, 1: 188}
