In [None]:
import os
import string

import nltk
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Load the data from a given file path

In [None]:
def load_data(file_path):
    """Read a header-less, tab-separated file into a two-column DataFrame.

    Parameters
    ----------
    file_path : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        Frame whose columns are named ``label`` and ``message``.
    """
    # The file carries no header row, so the column names are supplied here.
    # NOTE(review): read_csv's default quote handling is in effect; messages
    # containing unbalanced double quotes may be merged - confirm the row count.
    column_names = ['label', 'message']
    return pd.read_csv(file_path, sep='\t', names=column_names, header=None)


In [None]:
# Absolute path to the raw SMS Spam Collection file; adjust it when running
# outside the environment this notebook was authored in.
file_path = '/content/SMSSpamCollection'
# Go through load_data so the read_csv arguments live in exactly one place.
data = load_data(file_path)


In [None]:
# Preview the first rows to confirm the two columns were parsed as expected.
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocess the data

In [None]:
# Per-column count of null entries; all zeros means no rows need to be
# dropped or imputed before preprocessing.
missing_values = data.isna().sum()
print(missing_values)

label      0
message    0
dtype: int64


In [None]:
def preprocess_data(data):
    """Clean the raw SMS frame and return a new DataFrame ready for vectorizing.

    Stages, in order: map ``label`` from 'ham'/'spam' to 0/1, cast ``message``
    to ``str``, lowercase it, delete every character in ``string.punctuation``,
    tokenize with NLTK ``word_tokenize``, drop English stop words, lemmatize
    each token with ``WordNetLemmatizer`` and join the tokens back into a
    single space-separated string. A ``head()`` preview is printed after each
    stage.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``label`` column holding 'ham'/'spam' and a ``message``
        column. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with binary labels and cleaned message strings.
        Any label other than 'ham' or 'spam' becomes NaN.
    """
    # Work on a copy so the caller's frame keeps its raw labels and text.
    # Mutating it in place made a second run of this function map the
    # already-numeric labels to NaN.
    data = data.copy()

    # Check initial data state
    print("Initial data check: \n", data.head())

    # Convert labels to binary format
    data['label'] = data['label'].map({'ham': 0, 'spam': 1})

    # Ensure all message data is string type
    data['message'] = data['message'].astype(str)
    print("After conversion to string:\n", data.head())

    # Convert to lowercase
    data['message'] = data['message'].str.lower()

    # Removing punctuation & special characters.
    # A translation table deletes every punctuation character literally. The
    # previous unescaped regex character class read "\]" as an escaped bracket,
    # so backslashes were never removed.
    punctuation_table = str.maketrans('', '', string.punctuation)
    data['message'] = data['message'].str.translate(punctuation_table)
    print("After converting to lower case & punctuation removal \n", data.head())

    # Tokenization
    data['message'] = data['message'].apply(word_tokenize)
    print("After tokenization \n", data.head())

    # Removing Stop-words (a set gives constant-time membership checks)
    stop_words = set(stopwords.words('english'))
    data['message'] = data['message'].apply(
        lambda tokens: [word for word in tokens if word not in stop_words]
    )
    print("After removing stop words \n", data.head())

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    data['message'] = data['message'].apply(
        lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
    )
    print("After lemmatization \n", data.head())

    # Join tokens back to string format
    data['message'] = data['message'].apply(' '.join)
    print("After joining tokens \n", data.head())

    print("Final processed data: \n", data.head())
    return data


In [None]:
# Run the cleaning pipeline on the loaded messages; the function prints a
# preview after each stage and its return value feeds the split step below.
processed_data = preprocess_data(data)

Initial data check: 
   label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
After conversion to string:
    label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
After converting to lower case & punctuation removal 
    label                                            message
0      0  go until jurong point crazy available only in ...
1      0                            ok lar joking wif u oni


# Split the data into train/validation/test

In [None]:
from sklearn.model_selection import train_test_split

def split_data(processed_data, test_size=0.2, val_size=0.25, random_state=47, stratify=False):
    """Split the processed frame into train, validation and test DataFrames.

    The data is first split into a held-out test set and a remainder; the
    remainder is then split into train and validation sets. With the default
    fractions the result is 60% train / 20% validation / 20% test
    (validation = 0.25 x 0.8 = 0.2 of the full data).

    Parameters
    ----------
    processed_data : pandas.DataFrame
        Frame with ``message`` and ``label`` columns.
    test_size : float, default 0.2
        Fraction of the full data held out as the test set.
    val_size : float, default 0.25
        Fraction of the remaining (non-test) data used for validation.
    random_state : int, default 47
        Seed passed to both ``train_test_split`` calls for reproducibility.
    stratify : bool, default False
        When True, both splits preserve the label proportions, which is
        useful for imbalanced labels. The default keeps the original,
        unstratified behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``, each with ``message`` and ``label``
        columns and the original row index.
    """
    messages = processed_data['message']
    labels = processed_data['label']

    # Carve off the test set first so it is never touched again.
    X_rest, X_test, y_rest, y_test = train_test_split(
        messages,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels if stratify else None,
    )

    # Split what is left into train and validation data.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest,
        y_rest,
        test_size=val_size,
        random_state=random_state,
        stratify=y_rest if stratify else None,
    )

    # Create DataFrames from splits (Series share an index, so rows stay aligned)
    train_df = pd.DataFrame({'message': X_train, 'label': y_train})
    val_df = pd.DataFrame({'message': X_val, 'label': y_val})
    test_df = pd.DataFrame({'message': X_test, 'label': y_test})

    return train_df, val_df, test_df

In [None]:
# Produce the three splits from the cleaned data; they are written to disk below.
train_df, val_df, test_df = split_data(processed_data)

# Save the splits as train.csv, validation.csv, and test.csv

In [None]:
def save_splits(train, validation, test, directory='./'):
    """Write the three splits to ``train.csv``, ``validation.csv`` and ``test.csv``.

    Parameters
    ----------
    train, validation, test : pandas.DataFrame
        The splits to persist; each is written without its index.
    directory : str or path-like, default './'
        Target directory. It is created if it does not exist, and it may be
        given with or without a trailing path separator.
    """
    # An empty string means "current directory"; os.makedirs('') would raise.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # os.path.join inserts the separator when needed, so 'out' and 'out/'
    # both write to out/train.csv rather than 'outtrain.csv'.
    for frame, file_name in (
        (train, 'train.csv'),
        (validation, 'validation.csv'),
        (test, 'test.csv'),
    ):
        frame.to_csv(os.path.join(directory, file_name), index=False)


In [None]:
# Persist the splits using the function's default target directory.
save_splits(train_df, val_df, test_df)