In [13]:
## ZeroShot learning for SIB-200 dataset by generating text

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings
from transformers import XGLMTokenizer, XGLMForCausalLM

# Hugging Face model id; its prefix selects which model/tokenizer classes are used below
model_address = "bigscience/bloom-560m"

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Root data directory; the per-language subdirectories under it hold the test.tsv files
data_directory = "sib-200/data/annotated"

# Load the causal language model and its matching tokenizer.
# Only the two families handled here are supported; anything else fails fast
# instead of leaving `model`/`tokenizer` undefined for the code further down.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address)
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address)
else:
    raise ValueError(
        f"Unsupported model_address {model_address!r}: "
        "expected an id starting with 'facebook' (XGLM) or 'bigscience' (BLOOM)"
    )

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def translate(text, input_language, output_language="English", max_length=256):
    """Prompt the loaded causal LM to translate `text` and return the decoded output.

    The prompt has the form "<input_language>: <text> \\n<output_language>:" and the
    model is asked to continue it.

    Args:
        text: Sentence to translate.
        input_language: Name of the source language, used as the prompt label.
        output_language: Name of the target language, used as the label the
            model is expected to continue after.
        max_length: Value forwarded to `model.generate(max_length=...)`.
            NOTE(review): per the Hugging Face generation docs this budget counts
            prompt tokens as well as generated ones, so a very long prompt leaves
            little or no room for output — confirm 256 is enough for every language.

    Returns:
        The first generated sequence decoded to a string.
        NOTE(review): for decoder-only models this presumably includes the prompt
        itself and any special tokens; callers wanting only the translation need
        to strip them.
    """
    prompt = f"{input_language}: {text} \n{output_language}:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Forward the tokenizer's attention mask explicitly rather than letting
    # generate() infer it from the input ids.
    output_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
    )
    generated_text = tokenizer.decode(output_ids[0])
    return generated_text

# Results are written under MT-task/sib-200/<name>/, where <name> is whatever
# follows the first '/' in model_address (the whole id if it has no '/').
output_directory = f"MT-task/sib-200/{model_address.split('/', 1)[-1]}/"

# Make sure the destination folder is there; an existing one is left untouched
os.makedirs(output_directory, exist_ok=True)

# Spreadsheet listing the languages to process
language_df = pd.read_excel("SIB-200 languages - ACL.xlsx")

# Translate the test split of each language and save one CSV per language folder.
# NOTE(review): the [:1] slice restricts the run to the first row of the language
# sheet — this looks like a leftover smoke-test limit; confirm before a full run.
for _, language_row in language_df[:1].iterrows():
    language = language_row['Language Name']
    folder = language_row['Folder Name']
    if language == "English":
        # English is the translation target, so there is nothing to translate
        continue

    # Collect one record per sentence; the DataFrame is built once at save time.
    # (DataFrame.append was removed in pandas 2.0 and copied the frame on every call.)
    records = []

    subdir = os.path.join(data_directory, folder)
    # sorted() makes the processing order independent of the filesystem
    for file in sorted(os.listdir(subdir)):
        # Only the test split is translated
        if not file.endswith("test.tsv"):
            continue
        file_path = os.path.join(subdir, file)

        # Read the test.tsv file into a DataFrame
        df = pd.read_csv(file_path, sep='\t')

        # Translate every sentence in the 'text' column
        for text in df['text']:
            with torch.no_grad():  # Disable gradient calculation
                generated_text = translate(text=text, input_language=language)
            records.append({'text': text, 'translated_text': generated_text})
            # Release cached GPU memory between samples
            torch.cuda.empty_cache()

        # Save everything collected so far for this language to its CSV file
        results_df = pd.DataFrame(records, columns=['text', 'translated_text'])
        results_file_path = os.path.join(output_directory, f'{folder}.csv')
        results_df.to_csv(results_file_path, index=False)

        print(f"Results saved to {results_file_path}")

Results saved to MT-task/sib-200/bloom-560m/ace_Arab.csv
