In [None]:
# importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# function to load the data
def load_data(filepath):
    return pd.read_csv(filepath, sep='\t', names = ['label', 'message'])

In [3]:
df = load_data('data/SMSSpamCollection')

In [4]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.isna().mean()

label      0.0
message    0.0
dtype: float64

In [6]:
# replace spam with 1 and ham with 0
def map(label):
    if label == 'spam':
        return 1
    else:
        return 0

In [7]:
df['label'] = df['label'].apply(map)

In [28]:
df.head()

Unnamed: 0,label,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts may...
3,0,u dun say early hor u c already say
4,0,nah i think go usf life around though


In [None]:
# downloading necessary libraries
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/utpalraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/utpalraj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/utpalraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/utpalraj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# function for preprocessing messages
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Removing special characters and punctuation
    tokens = [token for token in tokens if token.isalpha()]
    
    # Removing numbers
    tokens = [token for token in tokens if not token.isdigit()]
    
    # Removing URLs
    tokens = [token for token in tokens if not token.startswith('http')]
    
    # Removing HTML tags
    tokens = [token for token in tokens if not token.startswith('<')]
    
    # Converting all text to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Removing empty strings
    tokens = [token for token in tokens if token != '']
    
    return ' '.join(tokens)

In [14]:
df['message'] = df['message'].apply(preprocess_text)

In [15]:
df.head()

Unnamed: 0,label,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts may...
3,0,u dun say early hor u c already say
4,0,nah i think go usf life around though


In [21]:
def split_data(df, seed = 42):
    train_df, val_test_df = train_test_split(df, test_size=0.4, random_state = seed)
    val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state = seed)
    return train_df, val_df, test_df


In [22]:
# Split the data into training, validation, and testing sets
train_df, val_df, test_df = split_data(df)

In [23]:
train_df.head()

Unnamed: 0,label,message
1207,1,as sim subscriber selected receive bonus get d...
3143,0,not planned yet going join company jan know ha...
2633,0,i will cal you sir in meeting
1510,0,when u love someone dont make love u much u bu...
4377,1,if prize go another customer t c polo ltd suit...


In [24]:
val_df.head()

Unnamed: 0,label,message
238,0,where wil reach
3921,1,free ringtone reply real poly eg pushbutton do...
1121,0,cancel cheyyamo get money back
4772,0,hi got money da
648,1,private your account statement show point call...


In [25]:
test_df.head()

Unnamed: 0,label,message
3315,0,oh gei that happend tron maybe ill dl
4879,1,no polyphonic tone ur mob every week just txt ...
4463,0,sorry i flaked last night shit seriously goin ...
1199,0,al moan n e thin go wrong fault al de argument...
3632,0,thank princess you sexy


In [26]:
# function to store as a csv file
def store_as_csv(df, name):
    return df.to_csv(name, index=False)

In [27]:
# Save the training, validation, and testing sets to CSV files
store_as_csv(train_df, name = 'train.csv')
store_as_csv(val_df, name = 'validation.csv')
store_as_csv(test_df, name = 'test.csv')