# In [None]:
import pandas as pd
import numpy as np
import os
import sys
import warnings
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Install a global filter that ignores all warnings, including deprecation
# notices raised by the imported libraries.
warnings.simplefilter("ignore")


# Lowercase English stop words; clean_text() drops any word found in this set.
# NOTE(review): entries such as "s", "t" and "don" look like contraction
# fragments. clean_text() deletes apostrophes without splitting on them
# ("don't" -> "dont"), so these entries are presumably rarely matched — confirm.
ENGLISH_STOPWORDS = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", 
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", 
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", 
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", 
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", 
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", 
    "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", 
    "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", 
    "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", 
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", 
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", 
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
}

def clean_text(text):
    """Normalise raw text into a space-separated string of content words.

    The value is lowercased, URLs (anything starting with ``http``) and
    ``@mentions`` are removed, every character other than ``a-z`` and
    whitespace is dropped, and words found in ``ENGLISH_STOPWORDS`` are
    filtered out.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.
            Missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text.

    Returns:
        The cleaned words joined by single spaces; ``""`` when nothing is
        left or the input is missing.
    """
    # Without this guard str(NaN) / str(None) would leak the literal tokens
    # "nan" / "none" into the cleaned text. The is_scalar check keeps
    # pd.isna from returning an array for list-like input.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    text = str(text).lower()
    # URLs must go before mentions and punctuation so that their pieces are
    # not left behind as separate words.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    # Characters are deleted, not replaced by spaces: "don't" -> "dont".
    text = re.sub(r"[^a-z\s]", "", text)
    words = text.split()
    return " ".join([w for w in words if w not in ENGLISH_STOPWORDS])

def split_labels(label_string):
    """Split a comma-separated label string into a list of clean labels.

    Args:
        label_string: Labels separated by commas, e.g. ``"joy, anger"``.
            Non-string values are converted with ``str()``; missing values
            (``None``/``NaN``) mean "no labels".

    Returns:
        The labels with surrounding whitespace stripped, in their original
        order. Blank pieces produced by stray or trailing commas
        (``"joy,,anger,"``) are skipped so callers never receive an empty
        label. Returns ``[]`` for missing or blank input.
    """
    if pd.isna(label_string):
        return []
    pieces = (piece.strip() for piece in str(label_string).split(","))
    # Dropping empty pieces also covers the all-blank input case.
    return [piece for piece in pieces if piece]

def split_data(df, test_ratio=0.2, random_state=42):
    """Randomly split a DataFrame into train and test partitions.

    Rows are chosen by position rather than by index label, so the split
    stays correct when ``df`` has duplicate index labels (a label-based
    ``drop`` would remove every row sharing a sampled label and shrink the
    test set).

    Args:
        df: The DataFrame to split.
        test_ratio: Fraction of rows held out for the test partition.
        random_state: Seed passed to pandas' sampler for reproducibility.

    Returns:
        A ``(train_df, test_df)`` tuple. ``train_df`` holds the sampled rows
        in sampled order and keeps their original index labels; ``test_df``
        holds the remaining rows in their original order with a fresh
        ``0..n-1`` index.
    """
    # Sampling a positional Series draws the same positions that
    # df.sample() would pick for the same frac and seed.
    row_positions = pd.Series(np.arange(len(df)))
    train_positions = row_positions.sample(
        frac=1 - test_ratio, random_state=random_state
    ).to_numpy()

    in_test = np.ones(len(df), dtype=bool)
    in_test[train_positions] = False

    # copy() so callers can add columns without chained-assignment issues.
    train_df = df.iloc[train_positions].copy()
    test_df = df.iloc[in_test].reset_index(drop=True)
    return train_df, test_df



def _fasttext_lines(labels, texts):
    """Build single-label fastText lines of the form ``__label__<label> <text>``.

    Labels are cast to ``str`` so numeric label columns can be concatenated;
    rows whose label is missing stay missing rather than becoming
    ``__label__nan``.
    """
    lines = "__label__" + labels.astype(str) + " " + texts
    return lines.where(labels.notna())


def main():
    """Load a labelled dataset and print fastText-formatted training lines.

    The input file name is taken from the first command-line argument
    (default ``"Sample.csv"``) and resolved relative to ``sys.path[0]``.
    Files ending in ``.csv`` are read with ``pd.read_csv``; anything else is
    passed to ``pd.read_excel``. The data is split into train/test, the text
    column (``text``, falling back to ``review``) is cleaned into
    ``clean_text`` unless that column already exists, and the first 15
    training lines are printed for the binary (``binary_sentiment``),
    multi-class (``sentiment``) and multi-label (``emotion_labels``) tasks.

    If the input file does not exist, a message is written to stderr and the
    function returns without doing anything else.

    Raises:
        KeyError: If a required column is absent from the input file.
    """
    filename = sys.argv[1] if len(sys.argv) > 1 else "Sample.csv"
    # NOTE(review): sys.path[0] is normally the script's directory when run
    # as `python script.py`; under other launchers it may differ — confirm.
    file_path = os.path.join(sys.path[0], filename)

    if not os.path.exists(file_path):
        # Report the problem instead of exiting silently.
        print(f"Input file not found: {file_path}", file=sys.stderr)
        return

    # Case-insensitive so "DATA.CSV" is not handed to the Excel reader.
    if filename.lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    text_col = 'text' if 'text' in df.columns else 'review'

    # Fail fast with a clear message rather than part-way through the output.
    required = ['binary_sentiment', 'sentiment', 'emotion_labels']
    if 'clean_text' not in df.columns:
        required.append(text_col)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{filename} is missing required column(s): {missing}")

    train_df, test_df = split_data(df)

    if 'clean_text' not in train_df.columns:
        train_df['clean_text'] = train_df[text_col].apply(clean_text)

    if 'clean_text' not in test_df.columns:
        test_df['clean_text'] = test_df[text_col].apply(clean_text)

    print("\n===== FASTTEXT BINARY TRAIN DATA =====")
    train_df['ft_format_binary'] = _fasttext_lines(
        train_df['binary_sentiment'], train_df['clean_text']
    )
    print(train_df['ft_format_binary'].head(15))

    print("\n===== FASTTEXT MULTI-CLASS TRAIN DATA =====")
    train_df['ft_format_multiclass'] = _fasttext_lines(
        train_df['sentiment'], train_df['clean_text']
    )
    print(train_df['ft_format_multiclass'].head(15))

    print("\n===== FASTTEXT MULTI-LABEL TRAIN DATA =====")

    def format_ml(row):
        # One "__label__X" token per emotion label, followed by the text.
        labels = split_labels(row['emotion_labels'])
        label_str = " ".join(["__label__" + l for l in labels])
        return f"{label_str} {row['clean_text']}"

    train_df['ft_format_multilabel'] = train_df.apply(format_ml, axis=1)
    print(train_df['ft_format_multilabel'].head(15))
    # NOTE(review): nothing is written to disk in this function; the message
    # below is kept unchanged — confirm whether files should be saved here.
    print("\n fastText training files generated successfully")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()