### **Imports and Dependencies**

In [69]:
# Import Libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords


In [70]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sowmy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **Loading the Dataset**

In [71]:
file_path = "SMSSpamCollection"

df = pd.read_csv(file_path, sep="\t", header=None, names=["Label", "Message"])

df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [72]:
# Duplicate Values
df.duplicated().sum()

np.int64(403)

In [73]:
# Remove duplicate values
df = df.drop_duplicates(keep = 'first')

In [74]:
# Duplicate Values
df.duplicated().sum()

np.int64(0)

### **Encoding the Labels**

In [75]:
# Convert Labels(ham -> 0 and spam -> 1)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

df['Label'] = encoder.fit_transform(df['Label'])
df.head()

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### **Data Cleaning**

In [76]:
import nltk
nltk.download('punkt_tab')  
nltk.download('punkt')      

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sowmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sowmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [77]:
# Lower Case

def lower_text(text):
    if pd.isna(text):
        return ""
    return text.lower()


In [78]:
# Tokenization 

def tokennize_text(text):
    if text == "":
        return []
    return nltk.word_tokenize(text)


In [79]:
# Removing Special Characters
def remove_spec_char(text):
    y = []
    for txt in text:
        if txt.isalnum():
            y.append(txt)
    return y

In [80]:
# Removing Stopwords and Punctuation 
from nltk.corpus import stopwords
import string

stop_words = set(stopwords.words("english"))
punctuations = set(string.punctuation)

def remove_stop_pun(text):
    y = []
    for txt in text:
        if txt not in stop_words and txt not in punctuations:
            y.append(txt)
    return y


In [81]:
# Stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stemming(text):
    if not text:
        return ""
    return " ".join(ps.stem(txt) for txt in text)


In [82]:
# Building the preprocessing Pipeline
pipeline = [
    lower_text,
    tokennize_text,
    remove_spec_char,
    remove_stop_pun,
    stemming
]

# Function to run all steps
def preprocess_text(text):
    for func in pipeline:
        text = func(text)
    return text


In [83]:
df["Processed_Message"] = df["Message"].apply(preprocess_text)

In [84]:
df = df[["Label", "Processed_Message"]].rename(columns={"Processed_Message": "Message"})

df.head()

Unnamed: 0,Label,Message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


### **Initialize Git & DVC**

In [85]:
# Initialize Git
!git init

Initialized empty Git repository in C:/Users/sowmy/Downloads/SEM4/AML/Assignment2/.git/


In [86]:
# Initialize DVC
!dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/treeverse/dvc>


### **Version 1(seed 21)**


In [87]:
train, test = train_test_split(df, test_size=0.2, random_state=21)
train, val = train_test_split(train, test_size=0.25, random_state=21)

# Save files
df.to_csv("raw_data.csv", index=False)
train.to_csv("train.csv", index=False)
val.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)


In [88]:
# Function to print distribution
def print_data_distribution(name, data):
    print(f"\n{name} Data Distribution:")
    print(data["Label"].value_counts())
    print("Total samples:", len(data))


In [89]:
print_data_distribution("Train", train)
print_data_distribution("Validation", val)
print_data_distribution("Test", test)


Train Data Distribution:
Label
0    2698
1     403
Name: count, dtype: int64
Total samples: 3101

Validation Data Distribution:
Label
0    915
1    119
Name: count, dtype: int64
Total samples: 1034

Test Data Distribution:
Label
0    903
1    131
Name: count, dtype: int64
Total samples: 1034


### **Track with DVC**

In [90]:
!dvc add raw_data.csv train.csv validation.csv test.csv


To track the changes with git, run:

	git add validation.csv.dvc raw_data.csv.dvc train.csv.dvc test.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [91]:
!git add raw_data.csv.dvc train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore
!git commit -m "Version 1: Split with seed 21"

[main (root-commit) 71366ff] Version 1: Split with seed 21
 8 files changed, 30 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 raw_data.csv.dvc
 create mode 100644 test.csv.dvc
 create mode 100644 train.csv.dvc
 create mode 100644 validation.csv.dvc


In [92]:
!git log

commit 71366ff8b3229631b2074d697828bc02e3cb8a9a
Author: Sowmya <sowmyaboda101@gmail.com>
Date:   Sun Feb 15 14:28:01 2026 +0530

    Version 1: Split with seed 21


### **Version 2(seed =77)**


In [93]:
train, test = train_test_split(df, test_size=0.2, random_state=77)
train, val = train_test_split(train, test_size=0.25, random_state=77)

train.to_csv("train.csv", index=False)
val.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)

In [94]:
print_data_distribution("Train", train)
print_data_distribution("Validation", val)
print_data_distribution("Test", test)


Train Data Distribution:
Label
0    2729
1     372
Name: count, dtype: int64
Total samples: 3101

Validation Data Distribution:
Label
0    884
1    150
Name: count, dtype: int64
Total samples: 1034

Test Data Distribution:
Label
0    903
1    131
Name: count, dtype: int64
Total samples: 1034


### **Track updated version**

In [95]:
!dvc add train.csv validation.csv test.csv


To track the changes with git, run:

	git add validation.csv.dvc train.csv.dvc test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [96]:
!git add train.csv.dvc validation.csv.dvc test.csv.dvc
!git commit -m "Version 2: Updated split with seed 77"

[main b207ee4] Version 2: Updated split with seed 77
 3 files changed, 6 insertions(+), 6 deletions(-)


### **Checkout OLD Version**

In [97]:
!git log --oneline

b207ee4 Version 2: Updated split with seed 77
71366ff Version 1: Split with seed 21


In [98]:
!git log

commit b207ee4ab8f99a46c7247b6d9fffcbadca0c4f40
Author: Sowmya <sowmyaboda101@gmail.com>
Date:   Sun Feb 15 14:28:18 2026 +0530

    Version 2: Updated split with seed 77

commit 71366ff8b3229631b2074d697828bc02e3cb8a9a
Author: Sowmya <sowmyaboda101@gmail.com>
Date:   Sun Feb 15 14:28:01 2026 +0530

    Version 1: Split with seed 21


### **Checkout First Version**

In [99]:
!git checkout 71366ff8b3229631b2074d697828bc02e3cb8a9a
!dvc checkout


Note: switching to '71366ff8b3229631b2074d697828bc02e3cb8a9a'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 71366ff Version 1: Split with seed 21


M       test.csv
M       train.csv
M       validation.csv


In [100]:
# Reload Files and Print Distribution
train = pd.read_csv("train.csv")
val = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")

print("OLD VERSION (Seed 21)")
print_data_distribution("Train", train)
print_data_distribution("Validation", val)
print_data_distribution("Test", test)


OLD VERSION (Seed 21)

Train Data Distribution:
Label
0    2698
1     403
Name: count, dtype: int64
Total samples: 3101

Validation Data Distribution:
Label
0    915
1    119
Name: count, dtype: int64
Total samples: 1034

Test Data Distribution:
Label
0    903
1    131
Name: count, dtype: int64
Total samples: 1034


### **Checkout Updated Version**

In [101]:
!git checkout b207ee4ab8f99a46c7247b6d9fffcbadca0c4f40
!dvc checkout

Previous HEAD position was 71366ff Version 1: Split with seed 21
HEAD is now at b207ee4 Version 2: Updated split with seed 77


M       test.csv
M       train.csv
M       validation.csv


In [102]:
!git checkout main
!dvc checkout

Switched to branch 'main'


In [103]:
# Reload and Print Distribution
train = pd.read_csv("train.csv")
val = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")

print("UPDATED VERSION (Seed 77)")
print_data_distribution("Train", train)
print_data_distribution("Validation", val)
print_data_distribution("Test", test)


UPDATED VERSION (Seed 77)

Train Data Distribution:
Label
0    2729
1     372
Name: count, dtype: int64
Total samples: 3101

Validation Data Distribution:
Label
0    884
1    150
Name: count, dtype: int64
Total samples: 1034

Test Data Distribution:
Label
0    903
1    131
Name: count, dtype: int64
Total samples: 1034
