In [None]:
import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\X
[nltk_data]     Warrior\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\X
[nltk_data]     Warrior\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def preprocess_data(df):
    # Lowercase all text
    df['message'] = df['message'].str.lower()

    # Remove punctuation
    df['message'] = df['message'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

    # Remove numbers
    df['message'] = df['message'].apply(lambda x: re.sub(r'\d+', '', x))

    # Tokenize the text
    df['message'] = df['message'].apply(lambda x: x.split())

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    df['message'] = df['message'].apply(lambda x: [word for word in x if word not in stop_words])

    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    df['message'] = df['message'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

    # Join tokens back into a single string
    df['message'] = df['message'].apply(lambda x: ' '.join(x))

    # Encode labels 
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

    # Drop duplicates
    df = df.drop_duplicates()

    return df


def store_data(df, seed=42):
    train, test = train_test_split(df, test_size=0.2, random_state=seed)
    train, val = train_test_split(train, test_size=0.1, random_state=seed)
    
    train.to_csv('train.csv', index=False)
    val.to_csv('validation.csv', index=False)
    test.to_csv('test.csv', index=False)


In [3]:
data = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['label', 'message'])
data = preprocess_data(data)
data.to_csv("raw_data.csv", index=False)
store_data(data)

In [4]:
!dvc add raw_data.csv train.csv validation.csv test.csv
!git add raw_data.csv.dvc train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore
!git commit -m "Added raw and split datasets"


To track the changes with git, run:

	git add train.csv.dvc test.csv.dvc raw_data.csv.dvc .gitignore validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



[main 2a22d68] Added raw and split datasets
 8 files changed, 30 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 Assignment_2/.gitignore
 create mode 100644 Assignment_2/raw_data.csv.dvc
 create mode 100644 Assignment_2/test.csv.dvc
 create mode 100644 Assignment_2/train.csv.dvc
 create mode 100644 Assignment_2/validation.csv.dvc


In [5]:
store_data(data, seed=57)

In [6]:
!dvc add train.csv validation.csv test.csv
!git commit -am "Updated train/validation/test split with new random seed"


To track the changes with git, run:

	git add test.csv.dvc validation.csv.dvc train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



[main 3e84669] Updated train/validation/test split with new random seed
 4 files changed, 6 insertions(+), 5580 deletions(-)
 delete mode 100644 Assignment 1/SMSSpamCollection.txt


In [7]:
!git checkout HEAD~1  
!dvc checkout

Note: switching to 'HEAD~1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 2a22d68 Added raw and split datasets


M       validation.csv
M       test.csv
M       train.csv


In [8]:
for filename in ["train.csv", "validation.csv", "test.csv"]:
    df = pd.read_csv(filename)
    print(f"Distribution in {filename}:")
    print(df["label"].value_counts(), "\n")


Distribution in train.csv:
label
0    3228
1     426
Name: count, dtype: int64 

Distribution in validation.csv:
label
0    357
1     49
Name: count, dtype: int64 

Distribution in test.csv:
label
0    898
1    117
Name: count, dtype: int64 



In [9]:
!git checkout main  
!dvc checkout

Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)


Previous HEAD position was 2a22d68 Added raw and split datasets
Switched to branch 'main'


M       validation.csv
M       train.csv
M       test.csv


In [10]:
for filename in ["train.csv", "validation.csv", "test.csv"]:
    df = pd.read_csv(filename)
    print(f"Distribution in {filename}:")
    print(df["label"].value_counts(), "\n")


Distribution in train.csv:
label
0    3233
1     421
Name: count, dtype: int64 

Distribution in validation.csv:
label
0    357
1     49
Name: count, dtype: int64 

Distribution in test.csv:
label
0    893
1    122
Name: count, dtype: int64 

