In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def load_dataset(file_path):
    """
    Read the CSV at ``file_path`` and return it as a two-column DataFrame.

    The columns are renamed to ``Label`` and ``Message`` (in that order) and
    every label is lower-cased so differently-cased spellings compare equal.
    """
    raw = pd.read_csv(file_path)
    # Relabel the columns positionally; a column-count mismatch is an error.
    renamed = raw.set_axis(["Label", "Message"], axis=1)
    # Normalise label casing for consistency.
    return renamed.assign(Label=renamed["Label"].str.lower())

def split_dataset(df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    """
    Splits the dataset into train, validation, and test sets.

    The split is done in two stages: first ``df`` is divided into a training
    part and a held-out remainder, then the remainder is divided into
    validation and test parts. Both stages are stratified on the "Label"
    column and use the same ``random_state``.

    Args:
        df: DataFrame containing a "Label" column.
        train_size: Fraction of rows for the training set.
        val_size: Fraction of rows for the validation set.
        test_size: Fraction of rows for the test set.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.

    Raises:
        ValueError: If any fraction is not strictly between 0 and 1, or if the
            three fractions do not sum to 1.
    """
    # Validate up front: inconsistent fractions would otherwise be silently
    # rescaled by the two-stage split, and val_size + test_size == 0 would
    # surface as a ZeroDivisionError halfway through.
    fractions = (("train_size", train_size), ("val_size", val_size), ("test_size", test_size))
    for name, value in fractions:
        if not 0 < value < 1:
            raise ValueError(
                "{} must be strictly between 0 and 1, got {!r}".format(name, value)
            )
    total = train_size + val_size + test_size
    if abs(total - 1.0) > 1e-9:  # tolerance absorbs float rounding such as 0.8 + 0.1 + 0.1
        raise ValueError(
            "train_size + val_size + test_size must equal 1, got {!r}".format(total)
        )

    # Stage 1: carve off everything that is not training data.
    train_df, temp_df = train_test_split(df, test_size=(1 - train_size), random_state=random_state, stratify=df["Label"])
    # Stage 2: divide the remainder; test_size is re-expressed relative to it.
    validation_df, test_df = train_test_split(temp_df, test_size=(test_size / (test_size + val_size)),
                                              random_state=random_state, stratify=temp_df["Label"])
    return train_df, validation_df, test_df

def save_splits(train_df, validation_df, test_df, output_dir="."):
    """
    Saves train, validation, and test datasets to CSV files.

    Writes ``train.csv``, ``validation.csv`` and ``test.csv`` into
    ``output_dir`` without the DataFrame index, creating the directory
    (including missing parents) if it does not exist yet.

    Args:
        train_df: Training split.
        validation_df: Validation split.
        test_df: Test split.
        output_dir: Destination directory; defaults to the current directory.
    """
    # An empty output_dir resolves to the current directory via os.path.join,
    # but os.makedirs("") would raise, so only create non-empty paths.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    outputs = (
        ("train.csv", train_df),
        ("validation.csv", validation_df),
        ("test.csv", test_df),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(output_dir, file_name), index=False)
    print("Data split and saved successfully!")

def main():
    """
    Run the data preparation pipeline end to end.

    Switches the working directory to the assignment folder, loads the SMS
    dataset, splits it into train/validation/test sets and writes the three
    splits to the current directory.
    """
    # All relative paths below resolve against this folder; adjust if needed.
    os.chdir("/content/drive/MyDrive/AML_Assignments/Assignment1/")

    # Load and normalise, then split; the result is (train, validation, test).
    splits = split_dataset(load_dataset("sms_spam_collection.csv"))

    # Persist the three splits with the default output directory.
    save_splits(*splits)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Data split and saved successfully!
