In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data and output path used below lives under this mount point.
drive.mount('/content/drive')

In [None]:
## Zero-shot and few-shot translation of SIB-200 texts into English via causal-LM text generation

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings
from transformers import XGLMTokenizer, XGLMForCausalLM

## Zero-Shot

In [None]:
# Checkpoint to run; keep exactly one assignment uncommented.
# model_address = "facebook/xglm-564M"
# model_address = "facebook/xglm-1.7B"
# model_address = "facebook/xglm-2.9B"
model_address = "facebook/xglm-7.5B"


# Drive folders: one holding the checked-out dataset, one receiving the generated outputs.
gd_path_input = "/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/Github Code/"
gd_path_output = "/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/"

data_directory = f"{gd_path_input}sib-200/data/annotated"
# Outputs are grouped by the checkpoint name, i.e. the part of the hub id after the first "/".
output_directory = f"{gd_path_output}MT-Task/sib-200/{model_address.split('/', 1)[-1]}/"

In [None]:
# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Choose the model/tokenizer classes from the hub namespace of the checkpoint.
# Both tokenizers are asked to pad on the left: batched generation with a
# decoder-only model needs the real tokens to sit at the end of each row.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address, padding_side='left')
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address, padding_side='left')
else:
    # Fail here instead of with a NameError on `model` further down.
    raise ValueError(
        f"Unsupported model_address {model_address!r}: expected a 'facebook/...' (XGLM) "
        "or 'bigscience/...' (BLOOM) checkpoint."
    )

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def translate_batch(texts, input_language, output_language="English", batch_size=32):
    """Generate zero-shot translations for ``texts`` in batches.

    Each text is wrapped in the prompt ``"<input_language>: <text> \\n<output_language>:"``
    and the model continues the prompt. The decoded strings contain the prompt
    followed by the continuation.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: list of source sentences.
        input_language: language name used as the prompt label of the source text.
        output_language: language name used as the prompt label of the translation.
        batch_size: number of prompts generated per forward pass.

    Returns:
        A list of decoded strings, one per input text, in input order.
    """
    generated_texts = []
    with torch.no_grad():  # Disable gradient calculation
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            prompts = [f"{input_language}: {text} \n{output_language}:" for text in batch_texts]
            inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)

            # Allow up to 100 tokens beyond the padded prompt length of this batch.
            result_length = inputs["input_ids"].shape[1] + 100

            # Prompts in a batch are padded to a common length; the attention mask
            # must be forwarded so padding tokens are ignored during generation.
            generated_batch = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=result_length,
            )
            generated_texts.extend(
                tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in generated_batch
            )

            # Release per-batch tensors before the next batch.
            del inputs, generated_batch
            torch.cuda.empty_cache()
    return generated_texts

# def translate(text, input_language, output_language="English"):
#     prompt = f"{input_language}: {text} \n{output_language}:"
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     tokens = tokenizer.encode(prompt, add_special_tokens=True)
#     result_length = len(tokens) + 100
#     generated_text = tokenizer.decode(model.generate(inputs["input_ids"], max_length=result_length)[0])
#     del inputs
#     return generated_text


# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Sheet listing the languages to process; the loop reads its 'Language Name' and 'Folder Name' columns.
language_df = pd.read_excel("/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/SIB-200 languages - ACL.xlsx")

# Debug tip: iterate over language_df[:1].iterrows() to try a single language.
for index, row in language_df.iterrows():
    language = row['Language Name']
    folder = row['Folder Name']
    if language == "English":
        # English is the translation target, so there is nothing to translate.
        continue

    # Resume support: a language whose output CSV already exists is skipped
    # before any of its data is listed or read.
    results_file_path = os.path.join(output_directory, f'{folder}.csv')
    if os.path.exists(results_file_path):
        print(f"Output file {results_file_path} already exists. Skipping...")
        continue

    subdir = os.path.join(data_directory, folder)

    for file in os.listdir(subdir):
        # Only the test split is translated
        if not file.endswith("test.tsv"):
            continue

        # Read the test.tsv file into a DataFrame
        df = pd.read_csv(os.path.join(subdir, file), sep='\t')

        # Get all texts from the DataFrame
        texts = df['text'].tolist()

        # Generate translations in batches with the zero-shot prompt
        generated_texts = translate_batch(texts=texts, input_language=language, batch_size=8)

        # Pair every source text with the full decoded generation and save it
        results_df = pd.DataFrame({'text': texts, 'translated_text': generated_texts})
        results_df.to_csv(results_file_path, index=False)

        print(f"Results saved to {results_file_path}")

        # One output file per language: once it is written, any further matching
        # file would be skipped anyway.
        break


## Few-shot

In [None]:
# Checkpoint to run; keep exactly one assignment uncommented.
# model_address = "facebook/xglm-564M"
# model_address = "facebook/xglm-1.7B"
model_address = "facebook/xglm-2.9B"
# model_address = "facebook/xglm-7.5B"

# Number of demonstration pairs placed in front of every prompt.
n_shot = 2

# Drive folders: one holding the checked-out dataset, one receiving the generated outputs.
gd_path_input = "/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/Github Code/"
gd_path_output = "/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/"

data_directory = f"{gd_path_input}sib-200/data/annotated"
# Outputs are grouped by shot count, then by the part of the hub id after the first "/".
output_directory = f"{gd_path_output}MT-Task/sib-200/{n_shot}-shot/{model_address.split('/', 1)[-1]}/"

In [None]:
# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Choose the model/tokenizer classes from the hub namespace of the checkpoint.
# Both tokenizers are asked to pad on the left: batched generation with a
# decoder-only model needs the real tokens to sit at the end of each row.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address, padding_side='left')
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address, padding_side='left')
else:
    # Fail here instead of with a NameError on `model` further down.
    raise ValueError(
        f"Unsupported model_address {model_address!r}: expected a 'facebook/...' (XGLM) "
        "or 'bigscience/...' (BLOOM) checkpoint."
    )

# Move model to GPU if available, casting the weights to bfloat16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device, dtype=torch.bfloat16)

In [None]:
def few_shot_maker(input_folder, input_language, output_folder="eng_Latn", output_language="English", n_shots=2):
    """Build the demonstration block that is prepended to every few-shot prompt.

    Reads ``train.tsv`` from the ``input_folder`` and ``output_folder``
    sub-directories of the notebook-level ``data_directory`` and pairs the
    rows labelled ``0 .. n_shots - 1`` of their ``text`` columns.
    NOTE(review): this pairs rows purely by position, so it presumes both
    train files list the same sentences in the same order — confirm.

    Returns:
        The ``n_shots`` demonstrations, each formatted as
        ``"<input_language>: <source> \\n<output_language>: <target>"``,
        joined by blank lines.
    """
    source_train = pd.read_csv(os.path.join(data_directory, input_folder, "train.tsv"), sep='\t')
    target_train = pd.read_csv(os.path.join(data_directory, output_folder, "train.tsv"), sep='\t')

    demonstrations = []
    for row_label in range(n_shots):
        source_sentence = source_train.loc[row_label, 'text']
        target_sentence = target_train.loc[row_label, 'text']
        demonstrations.append(
            f"{input_language}: {source_sentence} \n{output_language}: {target_sentence}"
        )
    return "\n\n".join(demonstrations)

# Function to translate given texts to English
def translate_batch(texts, few_shot_sample, input_language, output_language="English", batch_size=32):
    """Generate few-shot translations for ``texts`` in batches.

    Each prompt is ``few_shot_sample``, a blank line, then
    ``"<input_language>: <text> \\n<output_language>:"``; the model continues
    the prompt. The decoded strings contain the whole prompt (demonstrations
    included) followed by the continuation.

    This redefines the zero-shot ``translate_batch`` from earlier in the
    notebook under the same name, with an extra ``few_shot_sample`` parameter.
    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: list of source sentences.
        few_shot_sample: demonstration block prepended to every prompt.
        input_language: language name used as the prompt label of the source text.
        output_language: language name used as the prompt label of the translation.
        batch_size: number of prompts generated per forward pass.

    Returns:
        A list of decoded strings, one per input text, in input order.
    """
    generated_texts = []
    with torch.no_grad():  # Disable gradient calculation
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            prompts = [
                f"{few_shot_sample}\n\n{input_language}: {text} \n{output_language}:"
                for text in batch_texts
            ]
            inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)

            # Allow up to 100 tokens beyond the padded prompt length of this batch.
            result_length = inputs["input_ids"].shape[1] + 100

            # Prompts in a batch are padded to a common length; the attention mask
            # must be forwarded so padding tokens are ignored during generation.
            generated_batch = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=result_length,
            )
            generated_texts.extend(
                tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in generated_batch
            )

            # Release per-batch tensors before the next batch.
            del inputs, generated_batch
            torch.cuda.empty_cache()
    return generated_texts

In [None]:
# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Sheet listing the languages to process; the loop reads its 'Language Name' and 'Folder Name' columns.
language_df = pd.read_excel("/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/SIB-200 languages - ACL.xlsx")


# Translate the test split of every non-English language with an n_shot prompt
for index, row in language_df.iterrows():
    language = row['Language Name']
    folder = row['Folder Name']
    print(language)
    if language == "English":
        # English is the translation target, so there is nothing to translate.
        continue

    # Resume support: skip languages whose output CSV is already present.
    results_file_path = os.path.join(output_directory, f'{folder}.csv')
    if os.path.exists(results_file_path):
        continue

    # Demonstration pairs taken from the train split, shared by all prompts of this language
    few_shot_sample = few_shot_maker(folder, language, n_shots=n_shot)

    subdir = os.path.join(data_directory, folder)
    for file in os.listdir(subdir):
        # Only the test split is translated
        if not file.endswith("test.tsv"):
            continue

        # Read the test.tsv file into a DataFrame
        df = pd.read_csv(os.path.join(subdir, file), sep='\t')

        # Get all texts from the DataFrame
        texts = df['text'].tolist()

        # Generate translations in batches with the few-shot prompt
        generated_texts = translate_batch(texts=texts, few_shot_sample=few_shot_sample, input_language=language)

        # Pair every source text with the full decoded generation and save it
        results_df = pd.DataFrame({'text': texts, 'translated_text': generated_texts})
        results_df.to_csv(results_file_path, index=False)

        print(f"Results saved to {results_file_path}")

        # One output file per language: once it is written, any further matching
        # file would be skipped anyway.
        break

In [None]:
# ## FewShot learning for SIB-200 dataset

# from transformers import BloomForCausalLM
# from transformers import BloomTokenizerFast
# import os
# import pandas as pd
# import torch
# import warnings
# from transformers import XGLMTokenizer, XGLMForCausalLM







## Evaluation

In [None]:
!pip install -q evaluate
!pip install -q sacrebleu

In [None]:
import nltk.translate.bleu_score as bleu

# Root of the project folder on the mounted Drive; the evaluation paths hang off it.
_project_root = "/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability"

# Folder holding the generated translations, organised as <n_shot>/<model_name>/<folder>.csv.
# NOTE(review): the generation cells write under "MT-Task" (capital T) while this reads
# "MT-task" — confirm the folder name on Drive matches.
gd_path = f"{_project_root}/MT-task/sib-200"
# English test split, used as the reference translations.
ref_dir = f"{_project_root}/Github Code/sib-200/data/annotated/eng_Latn/test.tsv"
# few_shot_path = "/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/MT-task/sib-200/2-shot/"

# Language sheet; the evaluation adds one score column per model to it.
language_df = pd.read_excel(f"{_project_root}/SIB-200 languages - ACL.xlsx")

In [None]:
# Display the column names of the language sheet (the evaluation code matches rows on 'Folder Name').
language_df.columns

In [None]:
import os
import evaluate
import re

# Load the "sacrebleu" metric through the `evaluate` library; the scoring functions below call its compute().
sacrebleu = evaluate.load("sacrebleu")

In [None]:
# language_df = pd.read_excel("/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/SIB-200 languages - ACL.xlsx")

# Function to calculate the sacreBLEU score of predictions against references
def calculate_sacrebleu_score(predictions, references):
    """Return the sacreBLEU score for ``predictions``, rounded to one decimal.

    The notebook-level ``sacrebleu`` metric is called with its default
    settings; no tokenizer override is passed here.

    Args:
        predictions: list of hypothesis strings.
        references: references for the hypotheses, forwarded unchanged to
            ``sacrebleu.compute``.

    Returns:
        The ``"score"`` entry of the metric result, rounded to one decimal place.
    """
    results = sacrebleu.compute(predictions=predictions, references=references)
    return round(results["score"], 1)

# for few-shot outputs (defaults match the 2-shot prompt)
def extract_text(text, n_shots=2, marker="English:"):
    """Extract the model's translation from a decoded few-shot generation.

    The prompt contains ``marker`` once per demonstration and once more for
    the query, so the translation follows occurrence ``n_shots + 1`` of
    ``marker``. The text from there up to the next newline (or the end of the
    string) is returned, stripped of surrounding whitespace.

    Args:
        text: decoded generation (prompt plus continuation).
        n_shots: number of demonstrations in the prompt; the default of 2
            selects the third marker.
        marker: label that precedes each target-language sentence.

    Returns:
        The extracted translation, or ``""`` when ``text`` is not a string
        (e.g. a NaN read from CSV) or has fewer than ``n_shots + 1`` markers.
    """
    if not isinstance(text, str):
        return ""

    # Walk from marker to marker until the one introducing the answer is reached.
    position = -1
    for _ in range(n_shots + 1):
        position = text.find(marker, position + 1)
        if position == -1:
            return ""

    answer_start = position + len(marker)
    newline_index = text.find('\n', answer_start)
    if newline_index == -1:
        return text[answer_start:].strip()
    return text[answer_start:newline_index].strip()


# Function to process CSV files in a directory
def process_directory(input_dir, output_dir, n_shot):
    """Score every translation CSV in ``input_dir`` and record the result in ``language_df``.

    For each ``<folder>.csv`` the model's answer is extracted from the
    ``translated_text`` column, compared against the ``text`` column of the
    reference file ``ref_dir`` and scored with the ``sacrebleu`` metric. The
    score, rounded to one decimal, is written to the ``language_df`` row whose
    'Folder Name' equals ``<folder>``, in column ``'<model_name> scbleu'``
    (zero-shot) or ``'<model_name> scbleu 2s'`` (2-shot).

    Relies on notebook-level names: ``ref_dir``, ``sacrebleu``, ``language_df``
    (modified in place), ``model_name`` (must be set by the caller, e.g. as the
    loop variable) and, for "2-shot", ``extract_text``.
    NOTE(review): predictions and references are paired by row position, so
    this presumes each CSV keeps the row order of the reference file — confirm.

    Args:
        input_dir: directory containing one ``<folder>.csv`` per language.
        output_dir: directory for per-language metric files; it is created,
            but nothing is written to it at the moment.
        n_shot: ``"zero-shot"`` or ``"2-shot"``; selects the extraction rule
            and the score column.

    Raises:
        ValueError: if ``n_shot`` is not one of the supported values.
    """
    if n_shot not in ("zero-shot", "2-shot"):
        raise ValueError(f"Unsupported n_shot {n_shot!r}; expected 'zero-shot' or '2-shot'.")

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    reference_df = pd.read_csv(ref_dir, sep='\t')

    # Process each CSV file in the input directory
    for filename in os.listdir(input_dir):
        if not filename.endswith('.csv'):
            continue

        # Read CSV into DataFrame
        df = pd.read_csv(os.path.join(input_dir, filename))

        if n_shot == "zero-shot":
            # Text after the first "English:" up to the end of that line
            extracted = df['translated_text'].str.extract(r'English:\s*([^\n]*)', expand=False)
            score_column = f'{model_name} scbleu'
        else:
            extracted = df['translated_text'].apply(extract_text)
            score_column = f'{model_name} scbleu 2s'

        # A failed extraction is scored as an empty hypothesis; without fillna
        # the missing value would become the literal string "nan".
        df['extracted_text'] = extracted.fillna("").astype(str)

        # Aligns on the row index
        df['eng_text'] = reference_df['text']

        predictions = df['extracted_text'].tolist()
        # One reference per prediction, each wrapped in its own list
        references = [[reference] for reference in df['eng_text']]

        results = sacrebleu.compute(predictions=predictions, references=references)
        sacrebleu_score = round(results["score"], 1)

        # The CSV is named after the language folder
        folder_name = os.path.splitext(filename)[0]
        language_df.loc[language_df['Folder Name'] == folder_name, score_column] = sacrebleu_score




# model_name = "bloom-560m"
# n_shot = "zero-shot"       # 2-shot

# input_directory = f'{gd_path}/{n_shot}/{model_name}/'
# output_directory = f'{gd_path}/{n_shot}_metrics/{model_name}/'
# process_directory(input_directory, output_directory)

In [None]:
# Model folder names grouped by family; each one is a sub-directory of `{gd_path}/{n_shot}/`.
xglm_models = ["xglm-564M", "xglm-1.7B", "xglm-2.9B", "xglm-7.5B"]
bloom_models = ["bloom-560m", "bloom-1b1", "bloom-1b7", "bloom-3b", "bloom-7b1"]
bloomz_models = ["bloomz-560m", "bloomz-1b1", "bloomz-1b7", "bloomz-3b", "bloomz-7b1"]

# Prompting setup to score; process_directory distinguishes "zero-shot" and "2-shot".
n_shot = "2-shot"       # 2-shot

# The loop variable must stay named `model_name`: process_directory reads it as a
# global when it builds the score column name.
# Note that `output_directory` is rebound here, replacing the value set by the generation cells.
for model_name in xglm_models:

    input_directory = f'{gd_path}/{n_shot}/{model_name}/'
    output_directory = f'{gd_path}/{n_shot}_metrics/{model_name}/'
    process_directory(input_directory, output_directory, n_shot)

# Save the language sheet together with the score columns added so far.
language_df.to_csv("/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/results.csv", index=False)

In [None]:
# Score the BLOOM checkpoints; `bloom_models` is defined in the XGLM evaluation cell above.
# The scores are only added to the in-memory `language_df`; this cell does not write results.csv.
n_shot = "2-shot"       # 2-shot

# The loop variable must stay named `model_name`: process_directory reads it as a global.
for model_name in bloom_models:

    input_directory = f'{gd_path}/{n_shot}/{model_name}/'
    output_directory = f'{gd_path}/{n_shot}_metrics/{model_name}/'
    process_directory(input_directory, output_directory, n_shot)

In [None]:
# Score the BLOOMZ checkpoints; `bloomz_models` is defined in the XGLM evaluation cell above.
# Uncomment to re-score a single checkpoint:
# bloomz_models = ["bloomz-7b1"]
n_shot = "2-shot"       # 2-shot

# The loop variable must stay named `model_name`: process_directory reads it as a global.
for model_name in bloomz_models:

    input_directory = f'{gd_path}/{n_shot}/{model_name}/'
    output_directory = f'{gd_path}/{n_shot}_metrics/{model_name}/'
    process_directory(input_directory, output_directory, n_shot)

# Save the language sheet together with all score columns added so far (overwrites results.csv).
language_df.to_csv("/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/results.csv", index=False)

In [None]:
# input_directory = f'{gd_path}/{n_shot}/{model_name}/'
# df = pd.read_csv(f"{input_directory}ace_Arab.csv")
# df['extracted_text'] = df['translated_text'].str.extract(r'English:\s*(.*)')
# df['extracted_text'][0]

In [None]:
# df["translated_text"][0]

In [None]:
# predictions = ["بوم فيسي جي كرجا ماصل بك ڤرينسيڤ بهوا جي ڤرلو ككواتن ك ڤساڤت اينتي دڠن ڤروتون ڠن نيوترون."]
# references = ["hello there general kenobi"]
# sacrebleu = evaluate.load("sacrebleu")
# results = sacrebleu.compute(predictions=predictions, references=references)

# print(round(results["score"], 1))

In [None]:
# Reload the language sheet from disk; this rebinds `language_df`, dropping the score columns
# that the evaluation cells added to the in-memory frame.
language_df = pd.read_excel("/content/drive/MyDrive/PSU Stuff/NLP Lab/LLM Size and Scalability/SIB-200 languages - ACL.xlsx")