Loading the data

In [1]:
# Loading the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Loading the data
# The notebook is executed from the assignment folder (the later `dvc add emails.csv`
# cell relies on the same working directory), so a relative path is used. This keeps
# the notebook portable and avoids the invalid "\C", "\A" and "\e" escape sequences
# that the previous non-raw absolute Windows path contained.
data = pd.read_csv("emails.csv")
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
# Seeing the data distribution

# Count the emails in each class and give the classes readable names
class_counts = data['spam'].value_counts().rename({1: 'Spam', 0: 'Not Spam'})
total_emails = len(data)

# One label per slice: class name, absolute count and share of all emails
labels = [
    f'{class_name}\n{count} ({count/total_emails*100:.1f}%)'
    for class_name, count in class_counts.items()
]

# Plotting the pie chart (labels are drawn inside the slices via labeldistance < 1)
plt.figure(figsize=(6, 6))
plt.pie(
    class_counts,
    labels=labels,
    startangle=80,
    labeldistance=0.4,
    textprops={'fontsize': 12},
)
plt.title('Class Distribution of Emails')
plt.axis('equal')

# Show the plot
plt.show()

Pre-processing the data

In [None]:
# Removing the phrase 'subject'

# Define a function to remove the leading "Subject: " prefix
def remove_subject_prefix(subject):
    """Return ``subject`` without a leading ``'Subject: '`` marker.

    Only a prefix at the very start of the string is removed. Later
    occurrences of ``'Subject: '`` inside the email body (for example in
    quoted or forwarded messages) are left untouched, whereas a plain
    ``str.replace`` would silently delete all of them.

    Parameters
    ----------
    subject : str
        Raw email text.

    Returns
    -------
    str
        The text with the leading prefix stripped, or the unchanged text
        when it does not start with the prefix.
    """
    prefix = 'Subject: '
    if subject.startswith(prefix):
        return subject[len(prefix):]
    return subject

# Strip the prefix from every email in the 'text' column (element-wise)
data['text'] = data['text'].map(remove_subject_prefix)

In [None]:
# Removing emails, URLs, punctuation, numbers, special characters and converting to lowercase

def preprocess_text(text):
    """Normalise a raw email string to lowercase alphabetic words.

    Steps, in order:
      1. lowercase the text;
      2. drop email addresses (any whitespace-delimited token containing '@');
      3. drop URLs (any token starting at 'http');
      4. drop punctuation, digits and any remaining non-letter characters.

    Emails and URLs are removed *before* punctuation on purpose: they are
    recognised through characters such as '@', ':' and '/', so stripping
    punctuation first would make the email pattern impossible to match.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    str
        Cleaned text with leading/trailing whitespace removed.
    """
    # Lowercasing
    text = text.lower()

    # Remove emails (must run while '@' is still present)
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters (anything left that is not a letter or whitespace,
    # e.g. underscores, which \w lets through)
    text = re.sub(r'[^A-Za-z\s]', '', text)

    return text.strip()

# Clean every email in the 'text' column with preprocess_text (element-wise)
data['text'] = data['text'].map(preprocess_text)

In [None]:
# Tokenization: turn each cleaned email into a list of word tokens
tokenized = data['text'].map(word_tokenize)

# Stopword Removal: keep only tokens that are not English stopwords
stop_words = set(stopwords.words('english'))
without_stopwords = tokenized.map(
    lambda tokens: [token for token in tokens if token.lower() not in stop_words]
)

# WordNet Lemmatizer: replace each remaining token by its lemma
lemmatizer = WordNetLemmatizer()
lemmatized = without_stopwords.map(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

# Normalization: strip any character that is not a letter, digit or whitespace,
# then store the token lists back in the 'text' column
data['text'] = lemmatized.map(
    lambda tokens: [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in tokens]
)

In [None]:
# Collapse each token list back into a single space-separated string
data['text'] = data['text'].map(' '.join)

In [3]:
# Initialise a Git repository in the notebook's working directory
! git init

Initialized empty Git repository in D:/CMI/CMI_sem4/Applied machine learning/Assignments/Assignment2_updated/.git/


In [4]:
# Initialise DVC in the same directory; -f forces initialisation even if DVC
# metadata from a previous run is already present
!dvc init -f

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


Splitting the data

In [5]:
# Features are the cleaned email text, labels are the spam flag
X = data['text']
Y = data['spam']

# Hold out 20% for testing, then take 25% of the remaining 80% for validation
# (0.25 * 0.8 = 0.2 of the full data), i.e. a 60/20/20 train/validation/test split.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.2, random_state=100, shuffle=True
)
train_X, val_X, train_Y, val_Y = train_test_split(
    train_X, train_Y, test_size=0.25, random_state=100, shuffle=True
)

In [6]:
# Put each split's text and label side by side as two columns
train_data = pd.concat([train_X, train_Y], axis=1)
test_data = pd.concat([test_X, test_Y], axis=1)
val_data = pd.concat([val_X, val_Y], axis=1)

# Write the three splits to disk; index=False keeps the row index out of the files
for split_frame, csv_name in (
    (train_data, 'train.csv'),
    (test_data, 'test.csv'),
    (val_data, 'validation.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [7]:
# Put the raw dataset and the three split files under DVC control; DVC writes one
# .dvc pointer file per CSV and updates .gitignore (see the hint printed below)
!dvc add emails.csv train.csv validation.csv test.csv


To track the changes with git, run:

	git add test.csv.dvc train.csv.dvc validation.csv.dvc emails.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [8]:
# Stage the DVC configuration file for the first commit
!git add .dvc/config

In [9]:
# Record the DVC set-up as the root commit of the repository
!git commit -m "init"

[master (root-commit) 3d3605b] init
 3 files changed, 6 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore


In [10]:
# Stage the .dvc pointer files and .gitignore; the CSV contents themselves stay
# outside Git and are tracked by DVC
!git add emails.csv.dvc train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore


In [11]:
# Commit the pointers to the first version of the splits (random_state=100)
!git commit -m "original data"

[master b763960] original data
 5 files changed, 24 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 emails.csv.dvc
 create mode 100644 test.csv.dvc
 create mode 100644 train.csv.dvc
 create mode 100644 validation.csv.dvc


Updating the data

In [12]:
# Features are the cleaned email text, labels are the spam flag
X = data['text']
Y = data['spam']

# Same 60/20/20 procedure (20% test, then 25% of the remainder for validation),
# but with random_state=101 so the rows are distributed differently across splits.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.2, random_state=101, shuffle=True
)
train_X, val_X, train_Y, val_Y = train_test_split(
    train_X, train_Y, test_size=0.25, random_state=101, shuffle=True
)

In [13]:
# Put each split's text and label side by side as two columns
train_data = pd.concat([train_X, train_Y], axis=1)
test_data = pd.concat([test_X, test_Y], axis=1)
val_data = pd.concat([val_X, val_Y], axis=1)

# Overwrite the CSVs on disk with the new splits; index=False keeps the row index
# out of the files
for split_frame, csv_name in (
    (train_data, 'train.csv'),
    (test_data, 'test.csv'),
    (val_data, 'validation.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [14]:
# Re-register the overwritten split files with DVC so their .dvc pointer files
# reflect the new contents (emails.csv is unchanged and therefore not re-added)
!dvc add train.csv validation.csv test.csv

⠋ Checking graph




To track the changes with git, run:

	git add train.csv.dvc validation.csv.dvc test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


In [15]:
# Stage the three updated .dvc pointer files
!git add train.csv.dvc validation.csv.dvc test.csv.dvc

In [16]:
# Commit the pointers to the second version of the splits (random_state=101)
!git commit -m "Updated data"

[master b9187f4] Updated data
 3 files changed, 6 insertions(+), 6 deletions(-)


In [17]:
# List the commit history (newest first) to see the commits for both data versions
!git log

commit b9187f450dac6e4d565d01916cd88b21ecc998cb
Author: Swastika <swastikamohapatra2@gmail.com>
Date:   Wed Feb 21 13:45:33 2024 +0530

    Updated data

commit b76396094f8b856762a8b50b98c7a993da0f785f
Author: Swastika <swastikamohapatra2@gmail.com>
Date:   Wed Feb 21 13:45:08 2024 +0530

    original data

commit 3d3605bd8e332d08756e20e4370c86c892386af2
Author: Swastika <swastikamohapatra2@gmail.com>
Date:   Wed Feb 21 13:45:02 2024 +0530

    init


Distribution of the original data 

In [18]:
# Restore the split pointer files as they were in the "original data" commit.
# That commit is the parent of the latest one, so it is addressed as HEAD~1 rather
# than by a hard-coded hash: hashes differ every time the notebook is re-run in a
# fresh repository, which made this cell fail on Restart & Run All.
!git checkout HEAD~1 train.csv.dvc validation.csv.dvc test.csv.dvc

Updated 3 paths from 99d16bf


In [19]:
# Bring the CSV files in the workspace in line with the .dvc pointer files that
# were just checked out (the output lists the files that were modified)
!dvc checkout


M       train.csv
M       test.csv
M       validation.csv


In [20]:
# Original data version: number of emails per class in the training split
train_df = pd.read_csv('train.csv')
train_df.loc[:, 'spam'].value_counts()

spam
0    2643
1     793
Name: count, dtype: int64

In [21]:
# Original data version: number of emails per class in the test split
test_df = pd.read_csv('test.csv')
test_df.loc[:, 'spam'].value_counts()

spam
0    845
1    301
Name: count, dtype: int64

In [22]:
# Original data version: number of emails per class in the validation split
val_df = pd.read_csv('validation.csv')
val_df.loc[:, 'spam'].value_counts()

spam
0    872
1    274
Name: count, dtype: int64

Distribution of the updated data

In [23]:
# Restore the split pointer files from the latest commit ("Updated data").
# HEAD is used instead of a hard-coded hash so the cell keeps working when the
# notebook is re-run and the commits receive new hashes; checking out individual
# files earlier did not move HEAD, so it still points at the latest commit.
!git checkout HEAD train.csv.dvc validation.csv.dvc test.csv.dvc

Updated 3 paths from 4bcd5a3


In [24]:
# Bring the CSV files in the workspace in line with the restored .dvc pointer
# files (the output lists the files that were modified)
!dvc checkout

M       test.csv
M       train.csv
M       validation.csv


In [25]:
# Updated data version: number of emails per class in the training split
train_df = pd.read_csv('train.csv')
train_df.loc[:, 'spam'].value_counts()

spam
0    2635
1     801
Name: count, dtype: int64

In [26]:
# Updated data version: number of emails per class in the test split
test_df = pd.read_csv('test.csv')
test_df.loc[:, 'spam'].value_counts()

spam
0    875
1    271
Name: count, dtype: int64

In [27]:
# Updated data version: number of emails per class in the validation split
val_df = pd.read_csv('validation.csv')
val_df.loc[:, 'spam'].value_counts()

spam
0    850
1    296
Name: count, dtype: int64