In [1]:
# Import necessary libraries
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame.

    ``file_path`` is handed straight to ``pd.read_csv``.
    """
    return pd.read_csv(file_path)

In [3]:
from nltk.corpus import stopwords

def preprocess_data(data, stop_words=None):
    """Lowercase the ``text`` column and remove stop words from it.

    Args:
        data: DataFrame with a ``text`` column. It is not modified; the
            cleaning is applied to a copy so the cell can be re-run safely.
        stop_words: Optional iterable of (lowercase) words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        A new DataFrame identical to ``data`` except for the cleaned
        ``text`` column. Missing text values are left as they are.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks in the per-word filter.
    stop_words = set(stop_words)

    def _strip_stop_words(text):
        # Missing values (NaN/None) are not strings and have no .split();
        # pass them through instead of raising.
        if not isinstance(text, str):
            return text
        return ' '.join(word for word in text.split() if word not in stop_words)

    cleaned = data.copy()
    cleaned['text'] = cleaned['text'].str.lower().apply(_strip_stop_words)
    return cleaned

In [4]:
def split_data(data, holdout_size=0.2, test_share=0.5, random_state=42):
    """Split ``data`` into train, validation, and test subsets.

    The split runs in two passes of ``train_test_split``: the first carves a
    holdout off the training data, the second divides that holdout into
    validation and test. The defaults give roughly 80% / 10% / 10%.

    Args:
        data: Dataset to split; passed directly to ``train_test_split``.
        holdout_size: ``test_size`` of the first pass, i.e. the portion of
            ``data`` withheld from training.
        test_share: ``test_size`` of the second pass, i.e. the portion of the
            holdout that becomes the test set (the rest is validation).
        random_state: Seed used for both passes so the split is reproducible.

    Returns:
        Tuple ``(train_data, validation_data, test_data)``.
    """
    train_data, holdout_data = train_test_split(
        data, test_size=holdout_size, random_state=random_state
    )
    validation_data, test_data = train_test_split(
        holdout_data, test_size=test_share, random_state=random_state
    )
    return train_data, validation_data, test_data

In [5]:
def store_splits(train_data, validation_data, test_data, output_dir='.'):
    """Write the three splits to ``train.csv``, ``validation.csv``, ``test.csv``.

    Args:
        train_data: Training split; anything ``pd.DataFrame`` accepts.
        validation_data: Validation split; anything ``pd.DataFrame`` accepts.
        test_data: Test split; anything ``pd.DataFrame`` accepts.
        output_dir: Directory to write the CSV files into. Defaults to the
            current working directory; created if it does not exist.

    The index is not written to the files.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    splits = {
        'train.csv': train_data,
        'validation.csv': validation_data,
        'test.csv': test_data,
    }
    for file_name, split in splits.items():
        # Wrapping in DataFrame keeps non-DataFrame inputs (e.g. records) working.
        pd.DataFrame(split).to_csv(target_dir / file_name, index=False)

In [6]:
# Run the full pipeline: load the raw CSV, clean its text column, split the
# rows into train/validation/test, and persist each split as a CSV file.
# NOTE(review): the input path is hard-coded to a Kaggle dataset mount —
# adjust it when running outside that environment.
file_path = r'/kaggle/input/spam-email-dataset/emails.csv'
data = load_data(file_path)
# Lowercases the 'text' column and removes English stop words.
preprocessed_data = preprocess_data(data)
train_data, validation_data, test_data = split_data(preprocessed_data)
# Writes train.csv, validation.csv and test.csv to the working directory.
store_splits(train_data, validation_data, test_data)