## Zero-shot

In [None]:
## Zero-shot translation into English for the SIB-200 dataset

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings
from transformers import XGLMTokenizer, XGLMForCausalLM

# Checkpoint name passed to `from_pretrained`; its prefix ("facebook" or
# "bigscience") selects which model/tokenizer classes are used below.
model_address = "bigscience/bloom-1b1"

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Root directory with one sub-directory per language, each expected to hold a test.tsv file
data_directory = "sib-200/data/annotated"

# Load the causal LM and its tokenizer. Padding is requested on the left for
# both tokenizers so that, in a padded batch, every prompt ends right where
# generation starts.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address, padding_side='left')
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address, padding_side='left')
else:
    # Fail here with a clear message instead of a NameError on `model` further down.
    raise ValueError(
        f"Unsupported model_address {model_address!r}: expected a checkpoint "
        "starting with 'facebook' or 'bigscience'."
    )

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to translate given texts to English
def translate_batch(texts, input_language, output_language="English", batch_size=5, max_new_tokens=100):
    """Translate ``texts`` by prompting the module-level causal LM in batches.

    Every text is wrapped in a prompt of the form
    ``"<input_language>: <text> "``, a newline, then ``"<output_language>:"``,
    and the model is asked to continue it. Uses the module-level ``tokenizer``,
    ``model`` and ``device``.

    Args:
        texts: List of source sentences.
        input_language: Label written in front of each source sentence.
        output_language: Label the model is asked to complete.
        batch_size: Number of prompts tokenized and generated together.
        max_new_tokens: Maximum number of tokens generated after the (padded)
            prompt. The default of 100 matches the previous hard-coded budget.

    Returns:
        One decoded string per input text, in input order. Each string is the
        full sequence (prompt followed by the continuation) with special
        tokens removed.
    """
    generated_texts = []
    with torch.no_grad():  # Inference only: no gradients needed
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            prompts = [f"{input_language}: {text} \n{output_language}:" for text in batch_texts]
            # Tokenize once; the encoding carries both the ids and the padding mask.
            inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
            generated_batch = model.generate(
                input_ids=inputs["input_ids"],
                # Pass the mask explicitly so padding positions are ignored
                # rather than left for generate() to guess.
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
            )
            generated_texts.extend(
                tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in generated_batch
            )
            # Release the batch before the next one to limit GPU memory use.
            del inputs
            torch.cuda.empty_cache()
    return generated_texts

# Output directory for saving DataFrames
# Per-model destination folder: the part of `model_address` after the first '/'
# becomes the last path component.
output_directory = "MT-task/sib-200/zero-shot/" + model_address.split('/', 1)[-1] + "/"

# Make sure the destination exists before anything is written to it
os.makedirs(output_directory, exist_ok=True)

# Spreadsheet read for its 'Language Name' and 'Folder Name' columns
language_df = pd.read_excel("SIB-200 languages - ACL.xlsx")

# Translate the test split of every listed language and store one CSV per folder
for row_index, entry in language_df.iterrows():
    language, folder = entry['Language Name'], entry['Folder Name']
    print(language)

    # Skip the target language itself and any language whose CSV is already saved
    if language == "English" or f"{folder}.csv" in os.listdir(output_directory):
        continue

    # Empty frame that receives the source texts and the model output
    results_df = pd.DataFrame(columns=['text', 'translated_text'])

    subdir = os.path.join(data_directory, folder)
    results_file_path = os.path.join(output_directory, f'{folder}.csv')

    for filename in os.listdir(subdir):
        # Only files whose name ends in test.tsv are translated
        if not filename.endswith("test.tsv"):
            continue

        # Re-check on disk so an existing result is never overwritten
        if os.path.exists(results_file_path):
            print(f"Output file {results_file_path} already exists. Skipping...")
            continue

        # Load the test split and pull out the sentences to translate
        df = pd.read_csv(os.path.join(subdir, filename), sep='\t')
        texts = df['text'].tolist()

        # Run the prompts through the model in batches
        generated_texts = translate_batch(texts=texts, input_language=language)

        # Pair every source sentence with the text the model produced for it
        results_df['text'] = texts
        results_df['translated_text'] = generated_texts

        # Persist this language's results
        results_df.to_csv(results_file_path, index=False)

        print(f"Results saved to {results_file_path}")

## Few-shot

In [1]:
## Few-shot translation into English for the SIB-200 dataset

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings
from transformers import XGLMTokenizer, XGLMForCausalLM

# Checkpoint name passed to `from_pretrained`; its prefix ("facebook" or
# "bigscience") selects which model/tokenizer classes are used below.
model_address = "bigscience/bloom-1b1"
# Number of demonstration pairs placed in front of every prompt
n_shot = 2

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Root directory with one sub-directory per language, each expected to hold
# train.tsv and test.tsv files
data_directory = "sib-200/data/annotated"

# Load the causal LM and its tokenizer. Padding is requested on the left for
# both tokenizers so that, in a padded batch, every prompt ends right where
# generation starts.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address, padding_side='left')
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address, padding_side='left')
else:
    # Fail here with a clear message instead of a NameError on `model` further down.
    raise ValueError(
        f"Unsupported model_address {model_address!r}: expected a checkpoint "
        "starting with 'facebook' or 'bigscience'."
    )

# Move model to GPU if available and cast its weights to bfloat16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device, dtype=torch.bfloat16)

def few_shot_maker(input_folder, input_language, output_folder="eng_Latn", output_language="English", n_shots=2):
    """Build the block of demonstration pairs that precedes every few-shot prompt.

    Reads ``train.tsv`` from ``input_folder`` and from ``output_folder`` (both
    located under the module-level ``data_directory``) and pairs row ``i`` of
    one file with row ``i`` of the other for the first ``n_shots`` rows.
    NOTE(review): this presumes both files list the same sentences in the same
    order - confirm against the dataset.

    Args:
        input_folder: Sub-directory holding the source-language ``train.tsv``.
        input_language: Label written in front of each source sentence.
        output_folder: Sub-directory holding the target-language ``train.tsv``.
        output_language: Label written in front of each target sentence.
        n_shots: Number of demonstration pairs to include.

    Returns:
        The demonstration pairs as one string, separated by blank lines.
    """
    source_df = pd.read_csv(os.path.join(data_directory, input_folder, "train.tsv"), sep='\t')
    target_df = pd.read_csv(os.path.join(data_directory, output_folder, "train.tsv"), sep='\t')

    demonstrations = []
    for shot in range(n_shots):
        source_sentence = source_df['text'][shot]
        target_sentence = target_df['text'][shot]
        demonstrations.append(
            f"{input_language}: {source_sentence} \n{output_language}: {target_sentence}"
        )
    return "\n\n".join(demonstrations)

# Function to translate given texts to English
def translate_batch(texts, few_shot_sample, input_language, output_language="English", batch_size=16, max_new_tokens=100):
    """Translate ``texts`` by prompting the module-level causal LM with few-shot examples.

    Every prompt consists of ``few_shot_sample``, a blank line, then
    ``"<input_language>: <text> "``, a newline and ``"<output_language>:"``;
    the model is asked to continue it. Uses the module-level ``tokenizer``,
    ``model`` and ``device``.

    Args:
        texts: List of source sentences.
        few_shot_sample: Demonstration block placed in front of every prompt.
        input_language: Label written in front of each source sentence.
        output_language: Label the model is asked to complete.
        batch_size: Number of prompts tokenized and generated together.
        max_new_tokens: Maximum number of tokens generated after the (padded)
            prompt. The default of 100 matches the previous hard-coded budget.

    Returns:
        One decoded string per input text, in input order. Each string is the
        full sequence (demonstrations, prompt and continuation) with special
        tokens removed.
    """
    generated_texts = []
    with torch.no_grad():  # Inference only: no gradients needed
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            prompts = [f"{few_shot_sample}\n\n{input_language}: {text} \n{output_language}:" for text in batch_texts]
            # Tokenize once; the encoding carries both the ids and the padding mask.
            inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
            generated_batch = model.generate(
                input_ids=inputs["input_ids"],
                # Pass the mask explicitly so padding positions are ignored
                # rather than left for generate() to guess.
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
            )
            generated_texts.extend(
                tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in generated_batch
            )
            # Release the batch before the next one to limit GPU memory use.
            del inputs
            torch.cuda.empty_cache()
    return generated_texts

# Output directory for saving DataFrames
# Per-model, per-shot-count destination folder: the part of `model_address`
# after the first '/' becomes the last path component.
output_directory = f"MT-task/sib-200/{n_shot}-shot/" + model_address.split('/', 1)[-1] + "/"

# Make sure the destination exists before anything is written to it
os.makedirs(output_directory, exist_ok=True)

# Spreadsheet read for its 'Language Name' and 'Folder Name' columns
language_df = pd.read_excel("SIB-200 languages - ACL.xlsx")

# Translate the test split of every listed language and store one CSV per folder
for row_index, entry in language_df.iterrows():
    language, folder = entry['Language Name'], entry['Folder Name']
    print(language)

    # Skip the target language itself and any language whose CSV is already saved
    if language == "English" or f"{folder}.csv" in os.listdir(output_directory):
        continue

    # Demonstration pairs for this language, built from its training split
    few_shot_sample = few_shot_maker(folder, language, n_shots=n_shot)

    # Empty frame that receives the source texts and the model output
    results_df = pd.DataFrame(columns=['text', 'translated_text'])

    subdir = os.path.join(data_directory, folder)
    results_file_path = os.path.join(output_directory, f'{folder}.csv')

    for filename in os.listdir(subdir):
        # Only files whose name ends in test.tsv are translated
        if not filename.endswith("test.tsv"):
            continue

        # Re-check on disk so an existing result is never overwritten
        if os.path.exists(results_file_path):
            print(f"Output file {results_file_path} already exists. Skipping...")
            continue

        # Load the test split and pull out the sentences to translate
        df = pd.read_csv(os.path.join(subdir, filename), sep='\t')
        texts = df['text'].tolist()

        # Run the few-shot prompts through the model in batches
        generated_texts = translate_batch(texts=texts, few_shot_sample=few_shot_sample, input_language=language)

        # Pair every source sentence with the text the model produced for it
        results_df['text'] = texts
        results_df['translated_text'] = generated_texts

        # Persist this language's results
        results_df.to_csv(results_file_path, index=False)

        print(f"Results saved to {results_file_path}")

Aceh
Results saved to MT-task/sib-200/2-shot/bloom-1b1/ace_Arab.csv
Aceh
Results saved to MT-task/sib-200/2-shot/bloom-1b1/ace_Latn.csv
Mesopotamian Spoken Arabic
Results saved to MT-task/sib-200/2-shot/bloom-1b1/acm_Arab.csv
Arabic, Ta’izzi-Adeni Spoken


KeyboardInterrupt: 

## Remove duplicate sentences and clean the output

In [71]:
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize

# Function to remove duplicate sentences from a given text
def remove_duplicate_sentences(text):
    """Drop repeated sentences from ``text`` while keeping the original order.

    The text is split with ``sent_tokenize``; only the first occurrence of each
    sentence is kept, and the survivors are joined with single spaces.

    Args:
        text: The text to clean.

    Returns:
        The text rebuilt from its unique sentences, in first-occurrence order.
    """
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # A dict keeps insertion order, so this removes repeats without shuffling
    # the sentences (joining a set would emit them in arbitrary order).
    unique_sentences = list(dict.fromkeys(sentences))

    # Reconstruct the text with unique sentences
    return ' '.join(unique_sentences)

# Number of demonstration pairs used by the few-shot run whose outputs are cleaned here
FEW_SHOT_COUNT = 2

zero_shot_output = "MT-task/sib-200/zero-shot/"
few_shot_output = f"MT-task/sib-200/{FEW_SHOT_COUNT}-shot/"


def _clean_output_directory(output_root, answer_index):
    """Add a ``clean_output`` column to every CSV found under ``output_root``.

    For each row, ``translated_text`` is split on the literal ``"English:"``
    and the piece at position ``answer_index`` is kept, stripped, and passed
    through ``remove_duplicate_sentences``. Each CSV is rewritten in place.

    Args:
        output_root: Directory walked recursively for ``.csv`` files.
        answer_index: Position, after splitting on ``"English:"``, of the piece
            holding the model's answer (1 with no demonstrations; one more per
            demonstration pair, since each pair adds one ``"English:"`` label).
    """
    for root, dirs, files in os.walk(output_root):
        for file in files:
            if not file.endswith(".csv"):
                continue
            csv_path = os.path.join(root, file)
            df = pd.read_csv(csv_path)
            print(csv_path)
            # Keep only the text following the prompt's final "English:" label
            df['clean_output'] = df['translated_text'].str.split("English:").str[answer_index].str.strip()
            # Rows with no such piece are NaN (not str) and are left untouched
            df['clean_output'] = df['clean_output'].apply(lambda x: remove_duplicate_sentences(x) if isinstance(x, str) else x)
            # Save the cleaned DataFrame back to the same CSV file
            df.to_csv(csv_path, index=False)


# Zero-shot prompts contain a single "English:" label, so the answer is piece 1
_clean_output_directory(zero_shot_output, answer_index=1)

# Few-shot prompts contain one extra "English:" label per demonstration pair
_clean_output_directory(few_shot_output, answer_index=FEW_SHOT_COUNT + 1)

MT-task/sib-200/zero-shot/bloomz-7b1/crh_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/nya_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/amh_Ethi.csv
MT-task/sib-200/zero-shot/bloomz-7b1/min_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/mos_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/kat_Geor.csv
MT-task/sib-200/zero-shot/bloomz-7b1/gaz_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/zul_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/uig_Arab.csv
MT-task/sib-200/zero-shot/bloomz-7b1/khm_Khmr.csv
MT-task/sib-200/zero-shot/bloomz-7b1/nus_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/bak_Cyrl.csv
MT-task/sib-200/zero-shot/bloomz-7b1/jpn_Jpan.csv
MT-task/sib-200/zero-shot/bloomz-7b1/mal_Mlym.csv
MT-task/sib-200/zero-shot/bloomz-7b1/spa_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/fij_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/slv_Latn.csv
MT-task/sib-200/zero-shot/bloomz-7b1/tat_Cyrl.csv
MT-task/sib-200/zero-shot/bloomz-7b1/aeb_Arab.csv
MT-task/sib-200/zero-shot/bloomz-7b1/tgl_Latn.csv
