# In [None]:
import pandas as pd
import os
import sys
import warnings

# Install a global "ignore" filter: every warning raised after this point is
# suppressed for the whole process (presumably to keep the printed output clean).
warnings.simplefilter("ignore")


def split_labels(label_string):
    """Split a comma-separated label string into a list of clean labels.

    Args:
        label_string: Raw cell value holding zero or more labels separated
            by commas. Non-string values are converted with ``str``.

    Returns:
        The labels in their original order with surrounding whitespace
        removed. Missing values (as detected by ``pd.isna``) and blank
        strings give an empty list, and empty fragments such as the one
        between the commas in ``"a,,b"`` are skipped.
    """
    if pd.isna(label_string):
        return []
    # Drop empty fragments (stray, doubled or trailing commas) so callers
    # never receive a "" label, which would turn into a bare "__label__".
    pieces = (piece.strip() for piece in str(label_string).split(","))
    return [piece for piece in pieces if piece]

def split_data(df, test_ratio=0.2, random_state=42):
    """Randomly split a DataFrame into a training part and a test part.

    Args:
        df: DataFrame to split. Its index does not need to be unique.
        test_ratio: Fraction of rows (between 0 and 1) held out for testing.
        random_state: Seed passed to pandas sampling for reproducibility.

    Returns:
        A ``(train_df, test_df)`` tuple. ``train_df`` holds the sampled rows
        in sampled order and keeps their original index labels; ``test_df``
        holds the remaining rows in their original order with a fresh
        ``0..n-1`` index.

    Raises:
        ValueError: If ``test_ratio`` is outside the range ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # Sample row *positions* rather than index labels: dropping by label would
    # also discard held-out rows that share a label with a training row when
    # the index contains duplicates. Sampling depends only on the length and
    # the seed, so the chosen rows match df.sample() with the same arguments.
    positions = pd.Series(range(len(df)))
    train_pos = positions.sample(frac=1 - test_ratio, random_state=random_state)
    test_pos = positions.drop(train_pos.index)

    train_df = df.take(train_pos.to_numpy())
    test_df = df.take(test_pos.to_numpy())
    return train_df, test_df.reset_index(drop=True)


def main():
    """Print fastText-formatted training lines built from a sentiment CSV.

    The CSV name is taken from the first command-line argument (default
    ``"Sample.csv"``) and resolved against ``sys.path[0]``. Rows containing
    any missing value are dropped, the data is split with ``split_data`` and
    three encodings of the training part are printed (first 15 rows each):
    binary (``binary_sentiment``), multi-class (``sentiment``) and
    multi-label (``emotion_labels``), each followed by ``clean_text``.

    Nothing is returned. If the file or a required column is missing, a short
    message is printed and the function returns early.
    """
    filename = sys.argv[1] if len(sys.argv) > 1 else "Sample.csv"
    file_path = os.path.join(sys.path[0], filename)

    if not os.path.exists(file_path):
        print("Dataset file not found.")
        return

    df = pd.read_csv(file_path)

    # Fail with a readable message instead of a KeyError deep in the formatting.
    required = ("binary_sentiment", "sentiment", "emotion_labels", "clean_text")
    missing = [col for col in required if col not in df.columns]
    if missing:
        print(f"Dataset is missing required column(s): {', '.join(missing)}")
        return

    df = df.dropna()

    train_df, _ = split_data(df)

    # Cast to str so numeric columns (e.g. 0/1 sentiment flags) concatenate
    # with the "__label__" prefix instead of raising a TypeError.
    text = train_df["clean_text"].astype(str)

    print("\n===== FASTTEXT BINARY TRAIN DATA =====")
    train_df["ft_format_binary"] = (
        "__label__" + train_df["binary_sentiment"].astype(str) + " " + text
    )
    print(train_df["ft_format_binary"].head(15))

    print("\n===== FASTTEXT MULTI-CLASS TRAIN DATA =====")
    train_df["ft_format_multiclass"] = (
        "__label__" + train_df["sentiment"].astype(str) + " " + text
    )
    print(train_df["ft_format_multiclass"].head(15))

    print("\n===== FASTTEXT MULTI-LABEL TRAIN DATA =====")
    # Build the column from a plain list: a row-wise apply() on an empty frame
    # returns a DataFrame, which cannot be assigned to a single column.
    train_df["ft_format_multilabel"] = [
        " ".join("__label__" + label for label in split_labels(raw_labels))
        + f" {clean_text}"
        for raw_labels, clean_text in zip(
            train_df["emotion_labels"], train_df["clean_text"]
        )
    ]
    print(train_df["ft_format_multilabel"].head(15))

    print("\nfastText training files generated successfully")
    print("============================================================")
    print("NOTE: fasttext module is not available")
    print("Install it with: pip install fasttext")

# -----------------------------
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
