In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string

In [15]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# column names are supplied explicitly.
# NOTE(review): read_csv's default quoting treats '"' inside a message as a
# quote character -- confirm the row count matches the source file, or pass
# quoting=csv.QUOTE_NONE if rows turn out to be merged.
data = pd.read_csv("SMSSpamCollection", sep='\t', names=['category', 'text'])

print("\nDataset Overview:")
print(data.head())

# Convert labels to lowercase and validate
data['category'] = data['category'].str.lower()
if not set(data['category']).issubset({'ham', 'spam'}):
    raise ValueError("Labels must be 'ham' or 'spam'")

# Encode labels: 'spam' -> 1, 'ham' -> 0
data['category'] = (data['category'] == 'spam').astype(int)

# Handle missing values. 'category' is an int column by now, so 'text' is the
# only column that can still hold nulls. The filled Series is assigned back
# instead of using fillna(..., inplace=True) on the column selection: that
# chained in-place form is deprecated and does not update the frame when
# pandas Copy-on-Write is enabled.
if data['text'].isnull().any():
    print("\nMissing values detected. Replacing with empty strings.")
    data['text'] = data['text'].fillna('')

print("\nDataset Statistics:")
print(data['category'].value_counts(normalize=True))


Dataset Overview:
  category                                               text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

Dataset Statistics:
category
0    0.865937
1    0.134063
Name: proportion, dtype: float64


In [16]:
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return NLTK's stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        # stopwords.words() builds a fresh list on every call, so load it once
        # and keep it as a set for constant-time membership tests.
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Normalise one message into a space-separated string of content tokens.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``. A token
    is kept only if it is purely alphanumeric and is not an English stop word.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when no
        token survives.

    Raises
    ------
    LookupError
        Raised by NLTK when the tokenizer or stop-word resources have not been
        downloaded.
    """
    stop_words = _stopword_set('english')
    tokens = nltk.word_tokenize(text.lower())
    # isalnum() already rejects every punctuation-only token, so no separate
    # string.punctuation check is needed.
    return " ".join(tok for tok in tokens if tok.isalnum() and tok not in stop_words)

# Add a cleaned copy of every message next to the original text column.
data["transformed_text"] = data["text"].apply(preprocess_text)

In [17]:
# Two-stage hold-out: 30% of the rows are set aside first, then that part is
# halved into validation and test. Both stages use the same seed.
split_seed = 2
data_train, data_temp = train_test_split(
    data, test_size=0.3, random_state=split_seed
)
data_val, data_test = train_test_split(
    data_temp, test_size=0.5, random_state=split_seed
)

In [27]:
# Write the full frame and each partition to CSV without the index column.
# Note that `data` already carries the encoded label and the transformed_text
# column when it is saved as raw_data.csv.
for csv_name, frame in (
    ("raw_data.csv", data),
    ("train.csv", data_train),
    ("test.csv", data_test),
    ("validation.csv", data_val),
):
    frame.to_csv(csv_name, index=False)

In [19]:
# Initialise a Git repository in the working directory so the DVC pointer
# files can be versioned. On an existing repository this only re-initialises it.
! git init

Reinitialized existing Git repository in C:/Users/chaud/OneDrive/Desktop/Applied ML/Assignment 2/.git/


In [20]:
# Initialise DVC inside the Git repository. -f forces re-initialisation, which
# replaces any existing .dvc/ directory, so local DVC configuration is reset
# every time this cell runs.
! dvc init -f

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


In [28]:
# Put the four CSVs under DVC control. Each one gets a small <name>.dvc pointer
# file, and .gitignore is updated alongside them.
!dvc add raw_data.csv train.csv validation.csv test.csv
# Version only the pointer files and .gitignore in Git; the CSVs themselves
# are not staged here.
!git add raw_data.csv.dvc train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore
!git commit -m "Added raw and split datasets with seed 2"


To track the changes with git, run:

	git add .gitignore raw_data.csv.dvc train.csv.dvc test.csv.dvc validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


\u280b Checking graph



[detached HEAD 035afee] Added raw and split datasets with seed 2
 5 files changed, 24 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 raw_data.csv.dvc
 create mode 100644 test.csv.dvc
 create mode 100644 train.csv.dvc
 create mode 100644 validation.csv.dvc


In [22]:
# Show the commit history to confirm the seed-2 data commit was recorded.
!git log

commit 45b8f81ad6345a4c84809b136c9eb9e71315d61b
Author: Vikas9758 <chaudharyvikasjazz111@gmail.com>
Date:   Tue Mar 4 23:02:45 2025 +0530

    Added raw and split datasets with seed 2

commit 1a8d2fc2f4f5f5396b95120f6aea6b915dd624d7
Author: Vikas9758 <chaudharyvikasjazz111@gmail.com>
Date:   Tue Mar 4 21:51:10 2025 +0530

    initialised dvc


In [23]:
# Redraw the same two-stage partition (30% held out, then halved) with a
# different seed, producing a second version of the splits.
resplit_seed = 55
data_train, data_temp = train_test_split(
    data, test_size=0.3, random_state=resplit_seed
)
data_val, data_test = train_test_split(
    data_temp, test_size=0.5, random_state=resplit_seed
)

# Overwrite the CSVs on disk with the new partition; the full frame is
# rewritten as well.
for csv_name, frame in (
    ("raw_data.csv", data),
    ("train.csv", data_train),
    ("test.csv", data_test),
    ("validation.csv", data_val),
):
    frame.to_csv(csv_name, index=False)

In [24]:
# Re-add the three regenerated splits so their .dvc pointer files are
# rewritten. raw_data.csv is not re-added in this cell.
!dvc add train.csv validation.csv test.csv
# -a stages every modified tracked file, which picks up the updated .dvc
# pointers without a separate `git add`.
!git commit -am "Updated train/validation/test split with seed 55"


To track the changes with git, run:

	git add validation.csv.dvc test.csv.dvc train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


\u280b Checking graph



[detached HEAD 9614bd3] Updated train/validation/test split with seed 55
 3 files changed, 6 insertions(+), 6 deletions(-)


In [25]:
# Show the commit history again; the seed-55 commit should now be on top.
!git log

commit 9614bd3fd628534f48217cd49b77dc5333653948
Author: Vikas9758 <chaudharyvikasjazz111@gmail.com>
Date:   Tue Mar 4 23:02:46 2025 +0530

    Updated train/validation/test split with seed 55

commit 45b8f81ad6345a4c84809b136c9eb9e71315d61b
Author: Vikas9758 <chaudharyvikasjazz111@gmail.com>
Date:   Tue Mar 4 23:02:45 2025 +0530

    Added raw and split datasets with seed 2

commit 1a8d2fc2f4f5f5396b95120f6aea6b915dd624d7
Author: Vikas9758 <chaudharyvikasjazz111@gmail.com>
Date:   Tue Mar 4 21:51:10 2025 +0530

    initialised dvc


In [26]:
!git checkout 1a8d2fc2f4f5f5396b95120f6aea6b915dd624d7
!dvc checkout

any of your branches:

  9614bd3 Updated train/validation/test split with seed 55
  45b8f81 Added raw and split datasets with seed 2

If you want to keep them by creating a new branch, this may be a good time
to do so with:

 git branch <new-branch-name> 9614bd3

HEAD is now at 1a8d2fc initialised dvc


D       train.csv
D       validation.csv
D       test.csv
