# Fastext

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch
import fasttext
import fasttext.util
import numpy as np
from tqdm import tqdm

# Download models if not already available
fasttext.util.download_model('en', if_exists='ignore')
fasttext.util.download_model('ta', if_exists='ignore')

# Load models
ft_model_en = fasttext.load_model('cc.en.300.bin')  # English
ft_model_ta = fasttext.load_model('cc.ta.300.bin')  # Tamil



In [4]:
def preprocess_data(df):
    df['Class'] = df['Class'].str.lower()

    def preprocess_text(text):
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
        text = text.lower()  # Convert to lowercase
        return text

    df['Text_cleaned'] = df['Text'].apply(preprocess_text)
    return df


def get_fasttext_embeddings(texts, model):
    """
    Generate sentence embeddings using FastText.

    Args:
        texts (list): List of input texts.
        model: FastText model.

    Returns:
        torch.Tensor: Embeddings for input texts.
    """
    embeddings = []
    
    for text in tqdm(texts, desc="Generating FastText embeddings"):
        words = text.split()
        word_vectors = [model.get_word_vector(word) for word in words if word in model.words]

        if len(word_vectors) > 0:
            sentence_embedding = np.mean(word_vectors, axis=0)  # Average word embeddings
        else:
            sentence_embedding = np.zeros(300)  # Handle empty text case

        embeddings.append(sentence_embedding)

    return torch.tensor(embeddings, dtype=torch.float32)


def process_and_save_embeddings(csv_path, save_prefix, model, use_third_col=False):
    """
    Process a dataset by generating FastText embeddings and saving them.

    Args:
        csv_path (str): Path to the CSV file.
        save_prefix (str): Prefix for saving the embeddings and labels.
        model: FastText model.
        use_third_col (bool): Whether to use the 'Translated' column (for _eng datasets).
    """
    # Load dataset
    df = pd.read_csv(csv_path)

    # Select text column based on dataset type
    
    texts = df["Translated"].tolist()
    df.drop(columns=["text"], inplace=True)  # Drop the original column
    

    # Convert labels to numerical format
    #df['Class'] = df['Class'].str.lower()
    #df["Class"] = df["Class"].map({'abusive': 1, 'non-abusive': 0})
    labels = df["label"].to_numpy()

    # Generate embeddings
    print(f"Generating embeddings for {csv_path}...")
    embeddings = get_fasttext_embeddings(texts, model)

    # Save embeddings and labels
    np.save(f"{save_prefix}_fasttext_embeddings.npy", embeddings.numpy())
    np.savetxt(f"{save_prefix}_labels.txt", labels, fmt='%d')

    print(f"Saved embeddings and labels for {csv_path} as {save_prefix}_*.npy / .txt")

def process_and_save_embeddings2(csv_path, save_prefix, model, use_third_col=False):
    """
    Process a dataset by generating FastText embeddings and saving them.

    Args:
        csv_path (str): Path to the CSV file.
        save_prefix (str): Prefix for saving the embeddings and labels.
        model: FastText model.
        use_third_col (bool): Whether to use the 'Translated' column (for _eng datasets).
    """
    # Load dataset
    df = pd.read_csv(csv_path)

    # Select text column based on dataset type
    
    texts = df["Translated"].tolist()
    df.drop(columns=["text"], inplace=True)  # Drop the original column

    # Convert labels to numerical format
    #df['Class'] = df['Class'].str.lower()
    #df["Class"] = df["Class"].map({'abusive': 1, 'non-abusive': 0})
    #labels = df["label"].to_numpy()

    # Generate embeddings
    print(f"Generating embeddings for {csv_path}...")
    embeddings = get_fasttext_embeddings(texts, model)

    # Save embeddings and labels
    np.save(f"{save_prefix}_fasttext_embeddings.npy", embeddings.numpy())

    print(f"Saved embeddings for {csv_path} as _*.npy")



In [9]:
# Define language models
language_models = {
    "English": ft_model_en,
    "Tamil": ft_model_ta,
}

# Datasets with language as string keys to lookup the correct FastText model
datasets = [
  #  ("train_tamil", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_train_tamil.csv", False, "Tamil"),
#    ("dev_tamil", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_tamil.csv", False, "Tamil"),
#    ("test_tamil", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_test_tamil.csv", False, "Tamil"),

    ("train_tamil_eng", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_train_english.csv", True, "English"),
    ("dev_tamil_eng", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_english.csv", True, "English"),
#    ("test_tamil_eng", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_tamil.csv", True, "English"),
]

# Process each dataset using the correct model
for name, path, is_eng, lang_key in datasets:
    ft_model = language_models[lang_key]
    process_and_save_embeddings(path, name, ft_model, use_third_col=is_eng)


Generating embeddings for E:\PROJECT\Speech\3\translated_sentences_deep_translator_train_english.csv...


Generating FastText embeddings: 100%|██████████| 5512/5512 [05:27<00:00, 16.81it/s]


Saved embeddings and labels for E:\PROJECT\Speech\3\translated_sentences_deep_translator_train_english.csv as train_tamil_eng_*.npy / .txt
Generating embeddings for E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_english.csv...


Generating FastText embeddings: 100%|██████████| 787/787 [01:23<00:00,  9.37it/s]

Saved embeddings and labels for E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_english.csv as dev_tamil_eng_*.npy / .txt





In [5]:
# Define language models
language_models = {
    "English": ft_model_en,
    "Tamil": ft_model_ta,
}

# Datasets with language as string keys to lookup the correct FastText model
datasets = [
#    ("train_tamil", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_train_tamil.csv", False, "Tamil"),
#    ("dev_tamil", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_tamil.csv", False, "Tamil"),
 #   ("test_tamil", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_test_tamil.csv", False, "Tamil"),

#    ("train_tamil_eng", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_tamil.csv", True, "English"),
#    ("dev_tamil_eng", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_dev_tamil.csv", True, "English"),
    ("test_tamil_eng", r"E:\PROJECT\Speech\3\translated_sentences_deep_translator_test_tamil.csv", True, "English"),
]

# Process each dataset using the correct model
for name, path, is_eng, lang_key in datasets:
    ft_model = language_models[lang_key]
    process_and_save_embeddings2(path, name, ft_model, use_third_col=is_eng)


Generating embeddings for E:\PROJECT\Speech\3\translated_sentences_deep_translator_test_tamil.csv...


Generating FastText embeddings: 100%|██████████| 1576/1576 [06:30<00:00,  4.03it/s]

Saved embeddings for E:\PROJECT\Speech\3\translated_sentences_deep_translator_test_tamil.csv as _*.npy



  return torch.tensor(embeddings, dtype=torch.float32)
