In [4]:
## ZeroShot learning for SIB-200 dataset by generating text

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings
from transformers import XGLMTokenizer, XGLMForCausalLM
from sklearn.metrics import f1_score
import torch.nn.functional as F


model_address = "bigscience/bloomz-560m"

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

# Load the model/tokenizer pair that matches the checkpoint's hub namespace.
# The branches are mutually exclusive and an unknown prefix fails loudly, so the
# rest of the cell can never run against a stale `model`/`tokenizer` left in the
# kernel by a previously executed cell.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address)
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address)
else:
    raise ValueError(
        f"Unsupported model_address {model_address!r}: expected a 'facebook/...' "
        "(XGLM) or 'bigscience/...' (BLOOM) checkpoint"
    )

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of labels to use for ZeroShot learning
list_of_labels = ["science", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Function to predict category given text
def predict_category(text):
    """Classify `text` by letting the model generate an answer to a prompt.

    The prompt lists every entry of `list_of_labels` and ends with "-ANSWER: ".
    Up to 10 tokens are generated with beam search and the labels are then
    looked for in the generated continuation.

    Args:
        text: The sentence to classify.

    Returns:
        A `(label, generated_text)` tuple. `label` is the single entry of
        `list_of_labels` found in the continuation, or "N/A" when none or more
        than one is found. `generated_text` is the decoded full sequence
        (prompt followed by the continuation).
    """
    options_block = "\n- ".join(list_of_labels)
    prompt = f"SENTENCE:\n {text} \n Is this SENTENCE {', '.join(list_of_labels)}? \nOPTIONS:\n-{options_block}\n-ANSWER: "

    # Tokenize once; the prompt length is needed both for the generation budget
    # and to split the output into prompt / continuation.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    output_ids = model.generate(**inputs,
                                max_length=prompt_length + 10,
                                num_beams=4,
                                no_repeat_ngram_size=2,
                                early_stopping=True
                                )[0]
    generated_text = tokenizer.decode(output_ids)

    # Search only the newly generated tokens. Locating the answer with
    # str.find('ANSWER') breaks when the sentence itself contains "ANSWER":
    # the prompt's own option list then lands in the searched text and every
    # label matches, forcing "N/A".
    answer = tokenizer.decode(output_ids[prompt_length:]).lower()

    del inputs

    matched_labels = [label for label in list_of_labels if label in answer]
    if len(matched_labels) == 1:
        return matched_labels[0], generated_text
    return "N/A", generated_text

# Function to predict category from the model's scores for the first generated token
def predict_category2(prompt, alternatives):
    """Pick the alternative whose first token the model scores highest next.

    The prompt is followed by a newline, one token is generated, and the score
    the model assigned to the first token of each alternative at that step is
    compared.

    Args:
        prompt: Text to classify.
        alternatives: Candidate label strings.

    Returns:
        The entry of `alternatives` with the highest first-token score.

    Raises:
        ValueError: If `alternatives` is empty.
    """
    if not alternatives:
        raise ValueError("alternatives must contain at least one label")

    # Encode every label on its own and keep its first token. Encoding the
    # joined labels and dropping the final id only lines up when the last label
    # is the sole multi-token one; any other split shifts the positions and
    # scores the wrong tokens.
    first_token_ids = [tokenizer.encode(" " + alt, add_special_tokens=False)[0]
                       for alt in alternatives]

    inputs = tokenizer(prompt + '\n', return_tensors="pt").to(device)
    # Only the scores of the first generated step are read, so one new token is enough.
    outputs = model.generate(**inputs,
                             max_new_tokens=1,
                             output_scores=True,
                             return_dict_in_generate=True
                             )
    scores = outputs.scores[0][0][first_token_ids]
    return alternatives[int(torch.argmax(scores))]

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/" + model_address.split('/', 1)[-1] + "/generate"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Predictions use the short label "science" while the scoring below compares
# against "science/technology"; both prediction columns are mapped before scoring.
label_aliases = {'science': 'science/technology'}

# Iterate through subdirectories in the data directory. os.listdir returns
# entries in arbitrary order, so sort first to make the [:10] slice select the
# same languages on every run.
for language in sorted(os.listdir(data_directory))[:10]:
    # for language in ['sat_Olck', 'shn_Mymr']:
    subdir = os.path.join(data_directory, language)
    if not os.path.isdir(subdir):
        # Stray files in the data directory are not language folders
        continue

    # Collect one record per sentence and build the DataFrame once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []

    for file in sorted(os.listdir(subdir)):
        # Check if the file is a test.tsv file
        if not file.endswith("test.tsv"):
            continue
        file_path = os.path.join(subdir, file)

        # Read the test.tsv file into a DataFrame
        df = pd.read_csv(file_path, sep='\t')

        # Iterate through rows and compare predicted category with actual category
        for text, actual_category in zip(df['text'], df['category']):
            with torch.no_grad():  # Disable gradient calculation
                # Predict category using your ZeroShot learning model
                predicted_category, generated_text = predict_category(text)
                top_next_word = predict_category2(text, list_of_labels)

            records.append({'text': text,
                            'actual_category': actual_category,
                            'predicted_category': label_aliases.get(predicted_category, predicted_category),
                            'generated_text': generated_text,
                            'next_token': label_aliases.get(top_next_word, top_next_word)})
            torch.cuda.empty_cache()
        # Save the results DataFrame to a CSV file in the output directory
        # results_file_path = os.path.join(output_directory, f'{language}.csv')
        # results_df.to_csv(results_file_path, index=False)
        #
        # print(f"Results saved to {results_file_path}")

    results_df = pd.DataFrame(records, columns=['text', 'actual_category', 'predicted_category',
                                                'generated_text', 'next_token'])
    if results_df.empty:
        # Nothing to score: no test.tsv file was found (or it had no rows)
        print(f"No test.tsv rows found for {language}; skipping F1 computation")
        continue

    print(f"F1 score of {language} by all sequence is {f1_score(results_df['actual_category'], results_df['predicted_category'], average='macro')}")
    print(f"F1 score of {language} by next token is {f1_score(results_df['actual_category'], results_df['next_token'], average='macro')}")

In [5]:
# ZeroShot learning for SIB-200 dataset by using top logprobs
import os
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import XGLMTokenizer, XGLMForCausalLM
from transformers import BloomForCausalLM, BloomTokenizerFast
import warnings
from sklearn.metrics import f1_score

model_address = "bigscience/bloom-560m"
# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

# Load the model/tokenizer pair that matches the checkpoint's hub namespace.
# The branches are mutually exclusive and an unknown prefix fails loudly, so the
# rest of the cell can never run against a stale `model`/`tokenizer` left in the
# kernel by a previously executed cell.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address)
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address)
else:
    raise ValueError(
        f"Unsupported model_address {model_address!r}: expected a 'facebook/...' "
        "(XGLM) or 'bigscience/...' (BLOOM) checkpoint"
    )

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_logprobs(prompt):
    """Return the log-probability the model assigns to each token of `prompt`.

    The logits at position t are scored against the token at position t + 1,
    so the first token (which has no preceding context) gets no entry.

    Args:
        prompt: Text to score.

    Returns:
        A tensor of shape (batch, sequence_length - 1, 1) holding the
        log-probability of every token after the first.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Targets are the inputs shifted left by one position
    target_ids = inputs["input_ids"][:, 1:]
    # No `labels=` here: only the logits are used, so there is no reason to make
    # the model compute a loss. The last position has no target and is dropped.
    logits = model(**inputs).logits[:, :-1, :]
    return torch.gather(F.log_softmax(logits, dim=2), 2, target_ids.unsqueeze(2))


def xglm_prediction(prompt, alternatives):
    """Return the alternative that gives the highest summed log-probability.

    Each candidate is appended to `prompt` after a newline and a space, the
    resulting text is scored with `get_logprobs`, and the per-token values are
    summed. The first candidate with the maximal total wins.
    """
    totals = []
    for candidate in alternatives:
        totals.append(get_logprobs(prompt + "\n " + candidate).sum())
    winner = max(range(len(totals)), key=totals.__getitem__)
    return alternatives[winner]

# Function to predict category from the model's scores for the first generated token
def predict_category(prompt, alternatives):
    """Pick the alternative whose first token the model scores highest next.

    The prompt is followed by a newline, one token is generated, and the score
    the model assigned to the first token of each alternative at that step is
    compared.

    Args:
        prompt: Text to classify.
        alternatives: Candidate label strings.

    Returns:
        The entry of `alternatives` with the highest first-token score.

    Raises:
        ValueError: If `alternatives` is empty.
    """
    if not alternatives:
        raise ValueError("alternatives must contain at least one label")

    # Encode every label on its own and keep its first token. Encoding the
    # joined labels and dropping the final id only lines up when the last label
    # is the sole multi-token one; any other split shifts the positions and
    # scores the wrong tokens.
    first_token_ids = [tokenizer.encode(" " + alt, add_special_tokens=False)[0]
                       for alt in alternatives]

    inputs = tokenizer(prompt + '\n', return_tensors="pt").to(device)
    # Only the scores of the first generated step are read, so one new token is enough.
    outputs = model.generate(**inputs,
                             max_new_tokens=1,
                             output_scores=True,
                             return_dict_in_generate=True
                             )
    scores = outputs.scores[0][0][first_token_ids]
    return alternatives[int(torch.argmax(scores))]

alternatives = ["science", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/" + model_address.split('/', 1)[-1] + "/top_logprobs"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Predictions use the short label "science" while the scoring below compares
# against "science/technology"; both prediction columns are mapped before scoring.
label_aliases = {'science': 'science/technology'}

# Iterate through subdirectories in the data directory. os.listdir returns
# entries in arbitrary order, so sort first to make the [:10] slice select the
# same languages on every run.
for language in sorted(os.listdir(data_directory))[:10]:
    subdir = os.path.join(data_directory, language)
    if not os.path.isdir(subdir):
        # Stray files in the data directory are not language folders
        continue

    # Collect one record per sentence and build the DataFrame once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []

    for file in sorted(os.listdir(subdir)):
        # Check if the file is a test.tsv file
        if not file.endswith("test.tsv"):
            continue
        file_path = os.path.join(subdir, file)

        # Read the test.tsv file into a DataFrame
        df = pd.read_csv(file_path, sep='\t')

        # Iterate through rows and compare predicted category with actual category
        for text, actual_category in zip(df['text'], df['category']):
            # Predict category using your ZeroShot learning model
            with torch.no_grad():  # Disable gradient calculation
                predicted_category = xglm_prediction(text, alternatives)
                top_next_word = predict_category(text, alternatives)

            records.append({'text': text,
                            'actual_category': actual_category,
                            'predicted_category': label_aliases.get(predicted_category, predicted_category),
                            'next_token': label_aliases.get(top_next_word, top_next_word)})
            torch.cuda.empty_cache()
        # Save the results DataFrame to a CSV file in the output directory
        # results_file_path = os.path.join(output_directory, f'{language}.csv')
        # results_df.to_csv(results_file_path, index=False)
        #
        # print(f"Results saved to {results_file_path}")

    results_df = pd.DataFrame(records, columns=['text', 'actual_category', 'predicted_category', 'next_token'])
    if results_df.empty:
        # Nothing to score: no test.tsv file was found (or it had no rows)
        print(f"No test.tsv rows found for {language}; skipping F1 computation")
        continue

    print(f"F1 score of {language} by all sequence is {f1_score(results_df['actual_category'], results_df['predicted_category'], average='macro')}")
    print(f"F1 score of {language} by next token is {f1_score(results_df['actual_category'], results_df['next_token'], average='macro')}")

In [6]:
# ZeroShot learning for SIB-200 dataset by using top logprobs
import os
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import XGLMTokenizer, XGLMForCausalLM
from transformers import BloomForCausalLM, BloomTokenizerFast
import warnings
from sklearn.metrics import f1_score

model_address = "facebook/xglm-564M"
# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

# Load the model/tokenizer pair that matches the checkpoint's hub namespace.
# The branches are mutually exclusive and an unknown prefix fails loudly, so the
# rest of the cell can never run against a stale `model`/`tokenizer` left in the
# kernel by a previously executed cell.
if model_address.startswith("facebook"):
    model = XGLMForCausalLM.from_pretrained(model_address)
    tokenizer = XGLMTokenizer.from_pretrained(model_address)
elif model_address.startswith("bigscience"):
    model = BloomForCausalLM.from_pretrained(model_address)
    tokenizer = BloomTokenizerFast.from_pretrained(model_address)
else:
    raise ValueError(
        f"Unsupported model_address {model_address!r}: expected a 'facebook/...' "
        "(XGLM) or 'bigscience/...' (BLOOM) checkpoint"
    )

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_logprobs(prompt):
    """Return the log-probability the model assigns to each token of `prompt`.

    The logits at position t are scored against the token at position t + 1,
    so the first token (which has no preceding context) gets no entry.

    Args:
        prompt: Text to score.

    Returns:
        A tensor of shape (batch, sequence_length - 1, 1) holding the
        log-probability of every token after the first.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Targets are the inputs shifted left by one position
    target_ids = inputs["input_ids"][:, 1:]
    # No `labels=` here: only the logits are used, so there is no reason to make
    # the model compute a loss. The last position has no target and is dropped.
    logits = model(**inputs).logits[:, :-1, :]
    return torch.gather(F.log_softmax(logits, dim=2), 2, target_ids.unsqueeze(2))


def xglm_prediction(prompt, alternatives):
    """Return the alternative that gives the highest summed log-probability.

    Each candidate is appended to `prompt` after a newline and a space, the
    resulting text is scored with `get_logprobs`, and the per-token values are
    summed. The first candidate with the maximal total wins.
    """
    totals = []
    for candidate in alternatives:
        totals.append(get_logprobs(prompt + "\n " + candidate).sum())
    winner = max(range(len(totals)), key=totals.__getitem__)
    return alternatives[winner]

# Function to predict category from the model's scores for the first generated token
def predict_category(prompt, alternatives):
    """Pick the alternative whose first token the model scores highest next.

    The prompt is followed by a newline, one token is generated, and the score
    the model assigned to the first token of each alternative at that step is
    compared.

    Args:
        prompt: Text to classify.
        alternatives: Candidate label strings.

    Returns:
        The entry of `alternatives` with the highest first-token score.

    Raises:
        ValueError: If `alternatives` is empty.
    """
    if not alternatives:
        raise ValueError("alternatives must contain at least one label")

    # Encode every label on its own and keep its first token. Encoding the
    # joined labels and dropping the final id only lines up when the last label
    # is the sole multi-token one; any other split shifts the positions and
    # scores the wrong tokens.
    first_token_ids = [tokenizer.encode(" " + alt, add_special_tokens=False)[0]
                       for alt in alternatives]

    inputs = tokenizer(prompt + '\n', return_tensors="pt").to(device)
    # Only the scores of the first generated step are read, so one new token is enough.
    outputs = model.generate(**inputs,
                             max_new_tokens=1,
                             output_scores=True,
                             return_dict_in_generate=True
                             )
    scores = outputs.scores[0][0][first_token_ids]
    return alternatives[int(torch.argmax(scores))]

alternatives = ["science", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/" + model_address.split('/', 1)[-1] + "/top_logprobs"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Predictions use the short label "science" while the scoring below compares
# against "science/technology"; both prediction columns are mapped before scoring.
label_aliases = {'science': 'science/technology'}

# Iterate through subdirectories in the data directory. os.listdir returns
# entries in arbitrary order, so sort first to make the [:10] slice select the
# same languages on every run.
for language in sorted(os.listdir(data_directory))[:10]:
    subdir = os.path.join(data_directory, language)
    if not os.path.isdir(subdir):
        # Stray files in the data directory are not language folders
        continue

    # Collect one record per sentence and build the DataFrame once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []

    for file in sorted(os.listdir(subdir)):
        # Check if the file is a test.tsv file
        if not file.endswith("test.tsv"):
            continue
        file_path = os.path.join(subdir, file)

        # Read the test.tsv file into a DataFrame
        df = pd.read_csv(file_path, sep='\t')

        # Iterate through rows and compare predicted category with actual category
        for text, actual_category in zip(df['text'], df['category']):
            # Predict category using your ZeroShot learning model
            with torch.no_grad():  # Disable gradient calculation
                predicted_category = xglm_prediction(text, alternatives)
                top_next_word = predict_category(text, alternatives)

            records.append({'text': text,
                            'actual_category': actual_category,
                            'predicted_category': label_aliases.get(predicted_category, predicted_category),
                            'next_token': label_aliases.get(top_next_word, top_next_word)})
            torch.cuda.empty_cache()
        # Save the results DataFrame to a CSV file in the output directory
        # results_file_path = os.path.join(output_directory, f'{language}.csv')
        # results_df.to_csv(results_file_path, index=False)
        #
        # print(f"Results saved to {results_file_path}")

    results_df = pd.DataFrame(records, columns=['text', 'actual_category', 'predicted_category', 'next_token'])
    if results_df.empty:
        # Nothing to score: no test.tsv file was found (or it had no rows)
        print(f"No test.tsv rows found for {language}; skipping F1 computation")
        continue

    print(f"F1 score of {language} by all sequence is {f1_score(results_df['actual_category'], results_df['predicted_category'], average='macro')}")
    print(f"F1 score of {language} by next token is {f1_score(results_df['actual_category'], results_df['next_token'], average='macro')}")

In [2]:
import pandas as pd
from sklearn.metrics import f1_score

# Read the per-language spreadsheet into a DataFrame. Its 'Folder Name' column
# is used further down to locate each language's results CSV, and the computed
# F1 scores are written back into this same frame before it is saved.
df = pd.read_excel("SIB-200 languages - ACL.xlsx")

# Calculate the F1 score for one language/model/method and add it to the DataFrame
def calculate_f1(language, model, method):
    """Score one saved results file and record the macro F1 in the global `df`.

    Reads "zero-shot/sib-200/{model}/{method}/{language}.csv", replaces missing
    values with the string 'N/A', computes the macro-averaged F1 between the
    'actual_category' and 'predicted_category' columns, and stores it in the
    "F1 {model} {method}" column of every `df` row whose 'Folder Name' equals
    `language`.
    """
    csv_path = f"zero-shot/sib-200/{model}/{method}/{language}.csv"

    # Missing cells become the literal 'N/A' so they count as an ordinary class
    predictions = pd.read_csv(csv_path).fillna('N/A')

    score = f1_score(predictions['actual_category'],
                     predictions['predicted_category'],
                     average='macro')

    # Write the score into the row(s) belonging to this language
    language_rows = df['Folder Name'] == language
    df.loc[language_rows, f'F1 {model} {method}'] = score

# Every (model, method) result set to score. The names must match the result
# directories exactly. The inference cells derive the directory from the hub id
# (e.g. "bigscience/bloom-560m" -> "bloom-560m"), so the 560m checkpoints are
# spelled in lowercase here; the earlier "bloom-560M"/"bloomz-560M" spelling
# does not resolve on case-sensitive file systems.
result_sets = [
    ("xglm-564M", "top_logprobs"),
    ("xglm-1.7B", "top_logprobs"),
    ("xglm-2.9B", "top_logprobs"),
    ("xglm-7.5B", "top_logprobs"),
    ("bloom-560m", "top_logprobs"),
    ("bloom-1b1", "top_logprobs"),
    ("bloom-1b7", "top_logprobs"),
    ("bloom-3b", "top_logprobs"),
    ("bloom-7b1", "top_logprobs"),
    ("bloomz-560m", "generate"),
    ("bloomz-1b1", "generate"),
    ("bloomz-1b7", "generate"),
    ("bloomz-3b", "generate"),
    ("bloomz-7b1", "generate"),
]

# Iterate through languages and calculate F1 score for each language.
# Blank 'Folder Name' cells are skipped: they cannot name a results file.
for language in df['Folder Name'].dropna():
    for model_name, method in result_sets:
        calculate_f1(language, model_name, method)

# Save the updated DataFrame to the Excel file
df.to_excel("SIB-200 languages - ACL.xlsx", index=False)