In [None]:
# Make few-shot samples for SIB-200 dataset to select n samples for each category
import os
import pandas as pd

n = 2  # Number of samples to select for each category
# Data directory containing one subdirectory per language, each holding a train.tsv file
data_directory = "sib-200/data/annotated"

# Output directory for saving few-shot samples
output_directory = f"few-shot/sib-200/train-samples/{n}-shot"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Iterate through the language subdirectories in the data directory
for language in sorted(os.listdir(data_directory)):
    subdir = os.path.join(data_directory, language)
    # Skip stray files: calling os.listdir() on a non-directory would raise
    if not os.path.isdir(subdir):
        continue

    for file in os.listdir(subdir):
        # Few-shot examples are drawn from the training split only
        if not file.endswith("train.tsv"):
            continue
        file_path = os.path.join(subdir, file)

        # Read the train.tsv file into a DataFrame
        df = pd.read_csv(file_path, sep='\t')

        # Take the first n rows of every category. groupby() yields the groups
        # in sorted key order, so the result is one block of rows per category
        # with a fresh 0..k-1 index. Iterating the groups (instead of
        # groupby.apply) keeps the 'category' column regardless of how the
        # installed pandas treats grouping columns inside apply().
        per_category = [group.iloc[:n] for _, group in df.groupby('category')]
        if per_category:
            few_shot_df = pd.concat(per_category, ignore_index=True)
        else:
            # No usable category values: write an empty file with the same columns
            few_shot_df = df.iloc[0:0]

        # Save the few-shot samples to a new CSV file in the output directory
        few_shot_file_path = os.path.join(output_directory, f'{language}.csv')
        few_shot_df.to_csv(few_shot_file_path, index=False)

In [None]:
# Few-Shot learning for SIB-200 dataset by using generation
import numpy as np
np.random.seed(42)

import os
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import XGLMTokenizer, XGLMForCausalLM
from transformers import BloomForCausalLM, BloomTokenizerFast
import warnings
from sklearn.metrics import f1_score

from tqdm import tqdm

n = 2  # Number of few-shot samples per category (selects the matching train-samples directory)

# One permutation of the n * 7 few-shot row positions per test example, so the
# order of the in-context examples differs between test rows.
# NOTE: the count 204 is hard-coded; a test.tsv row whose index is >= 204
# would raise IndexError when its permutation is looked up below.
random_array = np.array([np.random.permutation(np.arange(n * 7)) for _ in range(204)])

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data directory containing one subdirectory per language with a test.tsv file
data_directory = "sib-200/data/annotated"

# Candidate labels. " science" carries a leading space and is mapped back to
# 'science/technology' when results are stored.
alternatives = [" science", "travel", "politics", "sports", "health", "entertainment", "geography"]

for model_address in ["bigscience/bloom-560m", "bigscience/bloom-1b1", "bigscience/bloom-1b7", "bigscience/bloom-3b"]:
    # Output directory for saving DataFrames
    output_directory = f"few-shot/sib-200/{n}-shot/" + model_address[model_address.find('/')+1:] + "/generate"

    # Load the model and tokenizer that belong to the model family
    if model_address.startswith("facebook"):
        model = XGLMForCausalLM.from_pretrained(model_address)
        tokenizer = XGLMTokenizer.from_pretrained(model_address)
    elif model_address.startswith("bigscience"):
        model = BloomForCausalLM.from_pretrained(model_address)
        tokenizer = BloomTokenizerFast.from_pretrained(model_address)
    else:
        # Fail loudly instead of silently reusing the previous iteration's model
        raise ValueError(f"Unsupported model address: {model_address}")

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Token ids of the options, encoded as one space-joined string.
    # The last token is dropped because "geography" is 2 tokens!
    alt_tokens = tokenizer.encode(" ".join(alternatives), add_special_tokens=False)[:-1]
    if len(alt_tokens) != len(alternatives):
        # The positional mapping alt_tokens[i] <-> alternatives[i] only holds
        # when every option except the last is a single token.
        raise ValueError(
            f"Expected {len(alternatives)} option tokens for {model_address}, got {len(alt_tokens)}"
        )

    def predict_category(prompt):
        """Pick the option whose token scores highest as the next token after `prompt`.

        Args:
            prompt: Full text prompt, ending right before the expected label.

        Returns:
            Tuple ``(label, confidence)``: ``label`` is the entry of
            ``alternatives`` with the highest first-step score, and
            ``confidence`` is its softmax probability computed over the
            option tokens only.
        """
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():  # Disable gradient calculation
            # Only the scores of the first generated token are used, so a
            # single new token is enough.
            outputs = model.generate(inputs["input_ids"],
                                     max_new_tokens=1,
                                     output_scores=True,
                                     return_dict_in_generate=True
                                     )
        scores = outputs.scores[0][0][alt_tokens]
        best = int(torch.argmax(scores))
        confidence = F.softmax(scores, dim=0)[best].item()
        return alternatives[best], confidence

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Languages that do not have a results file for this model yet
    done = {os.path.splitext(name)[0] for name in os.listdir(output_directory)}
    undone = sorted(
        entry for entry in os.listdir(data_directory)
        if entry not in done and os.path.isdir(os.path.join(data_directory, entry))
    )

    # Iterate through the remaining language subdirectories
    for language in tqdm(undone, total=len(undone), desc="Languages"):
        # Read the test.tsv file into a DataFrame
        file_path = os.path.join(data_directory, language, "test.tsv")
        df = pd.read_csv(file_path, sep='\t')

        # Read few-shot samples into a DataFrame
        few_shot_df = pd.read_csv(os.path.join(f"few-shot/sib-200/train-samples/{n}-shot", f'{language}.csv'))
        few_shot_df['category'] = few_shot_df['category'].replace('science/technology', 'science')

        # Collect one record per test row; the DataFrame is built once at the end
        # (DataFrame.append was removed in pandas 2.0).
        records = []
        for index, row in df.iterrows():
            text = row['text']
            actual_category = row['category']

            # Few-shot examples in the order drawn for this test row
            shots = few_shot_df.loc[random_array[index]]

            if model_address.startswith("facebook"):
                # Prompt format for XGLM
                few_shots_prompt = '\n'.join(shots.apply(lambda shot: f"What is the category of the following SENTENCE?\nSENTENCE: {shot['text']}\nCategory: {shot['category']}", axis=1))
                prompt = few_shots_prompt + "\nWhat is the category of the following SENTENCE?\nSENTENCE: " + text + "\nCategory: "
            else:
                # Prompt format for BLOOM
                few_shots_prompt = '\n'.join(shots.apply(lambda shot: f"This tool labels the category of the sentence.\nSENTENCE: {shot['text']}\nLABEL: {shot['category']}", axis=1))
                prompt = few_shots_prompt + "\nThis tool labels the category of the sentence.\nOPTIONS:\n- science\n- travel\n- politics\n- sports\n- health\n- entertainment\n- geography\nSENTENCE: " + text + "\nLABEL: "

            predicted_category, confidence = predict_category(prompt)

            records.append({'text': text,
                            'actual_category': actual_category,
                            'predicted_category': 'science/technology' if predicted_category == ' science' else predicted_category,
                            'confidence': confidence
                            })
            torch.cuda.empty_cache()

        # Save the results DataFrame to a CSV file in the output directory
        results_df = pd.DataFrame(records, columns=['text', 'actual_category', 'predicted_category', 'confidence'])
        results_file_path = os.path.join(output_directory, f'{language}.csv')
        results_df.to_csv(results_file_path, index=False)

    # Release the model before the next (larger) one is loaded
    del model
    torch.cuda.empty_cache()

In [None]:
import pandas as pd
from sklearn.metrics import f1_score

# Load the per-language sheet that receives the F1 columns
df = pd.read_excel("SIB-200 languages - ACL.xlsx")


def calculate_f1(language, model, method, n):
    """Write the macro-F1 of one saved run into the module-level ``df``.

    The predictions are read from
    ``few-shot/sib-200/{n}-shot/{model}/{method}/{language}.csv``. When that
    file does not exist the function returns without touching ``df``.

    Args:
        language: Value matched against the 'Folder Name' column of ``df``.
        model: Model directory name used in the results path.
        method: Method directory name used in the results path.
        n: Number of shots used in the results path and the column name.
    """
    try:
        predictions = pd.read_csv(f"few-shot/sib-200/{n}-shot/{model}/{method}/{language}.csv")
    except FileNotFoundError:
        # Nothing was saved for this combination
        return None

    # Missing values become the literal label 'N/A' before scoring
    predictions = predictions.fillna('N/A')

    score = f1_score(
        y_true=predictions['actual_category'],
        y_pred=predictions['predicted_category'],
        average='macro',
    )

    # Store the score in the row(s) of this language
    target_rows = df['Folder Name'] == language
    df.loc[target_rows, f'F1 {model} {method} {n}-shot'] = score

# Score both BLOOMZ runs (2-shot, "generate") for every language in the sheet
for language in df['Folder Name']:
    for model_name in ("bloomz-3b", "bloomz-7b1"):
        calculate_f1(language, model_name, "generate", 2)

# Write the sheet, including the new F1 columns, back to the same Excel file
df.to_excel("SIB-200 languages - ACL.xlsx", index=False)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel("SIB-200 languages - ACL.xlsx")
filtered_df = df[df['Folder Name'].isin(['eng_Latn', 'pes_Arab', 'hin_Deva', 'npi_Deva'])]

shot_labels = ['Zero-shot', '2-shot', '5-shot']
# Suffix appended to 'F1 <model> top_logprobs' for each entry of shot_labels
column_suffixes = ['', ' 2-shot', ' 5-shot']

# Fallback lookup that ignores letter case, so a header spelled
# 'F1 bloom-560M ...' is still found when 'F1 bloom-560m ...' is requested
columns_by_lowercase = {str(column).lower(): column for column in filtered_df.columns}

# Create a line plot of the F1 scores: one line per (language, model) pair
plt.figure(figsize=(20, 10))
for language in filtered_df['Folder Name']:
    # First row of this language
    language_row = filtered_df.loc[filtered_df['Folder Name'] == language].iloc[0]

    for model in ['bloom-560m', 'bloom-1b1', 'bloom-1b7', 'bloom-3b']:
        wanted = [f'F1 {model} top_logprobs{suffix}' for suffix in column_suffixes]
        # Prefer the exact header; otherwise fall back to the case-insensitive match
        score_columns = [name if name in filtered_df.columns else columns_by_lowercase[name.lower()]
                         for name in wanted]
        plt.plot(shot_labels,
                 [language_row[column] for column in score_columns],
                 label=f'{language} - {model}')

plt.title('F1 scores for each language')
plt.xlabel('Shot')
plt.ylabel('F1 score')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel("SIB-200 languages - ACL.xlsx")
filtered_df = df[df['Folder Name'].isin(['eng_Latn', 'pes_Arab', 'hin_Deva', 'npi_Deva'])]

shot_labels = ['Zero-shot', '2-shot', '5-shot']
# Suffix appended to 'F1 <model> top_logprobs' for each entry of shot_labels
column_suffixes = ['', ' 2-shot', ' 5-shot']

# Fallback lookup that ignores letter case, so a header spelled
# 'F1 bloom-560M ...' is still found when 'F1 bloom-560m ...' is requested
columns_by_lowercase = {str(column).lower(): column for column in filtered_df.columns}

# Reshape the data so that each row carries the three scores of one
# (language, model) pair; frames are collected and concatenated once
model_frames = []
for model in ['bloom-560m', 'bloom-1b1', 'bloom-1b7', 'bloom-3b']:
    # Select the score columns by name, so the shot labels do not depend on
    # the column order of the sheet
    wanted = [f'F1 {model} top_logprobs{suffix}' for suffix in column_suffixes]
    score_columns = [name if name in filtered_df.columns else columns_by_lowercase[name.lower()]
                     for name in wanted]
    model_data = filtered_df[score_columns].copy()
    model_data.columns = shot_labels
    model_frames.append(model_data.assign(Model=model))
reshaped_df = pd.concat(model_frames, ignore_index=True)

# Group by the 'Model' column and calculate the mean F1 scores for each shot
grouped_df = reshaped_df.groupby('Model').mean()

# Create a line plot of the mean F1 scores for each model
plt.figure(figsize=(10, 6))
for model in grouped_df.index:
    plt.plot(shot_labels,
             grouped_df.loc[model].values,
             label=f'{model}')

plt.title('Mean F1 scores for each model')
plt.xlabel('Shot')
plt.ylabel('Mean F1 score')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel("SIB-200 languages - ACL.xlsx")
languages = ['eng_Latn', 'pes_Arab', 'hin_Deva', 'npi_Deva']

shot_labels = ['Zero-shot', '2-shot', '5-shot']
# Suffix appended to 'F1 <model> top_logprobs' for each entry of shot_labels
column_suffixes = ['', ' 2-shot', ' 5-shot']

# Fallback lookup that ignores letter case, so a header spelled
# 'F1 bloom-560M ...' is still found when 'F1 bloom-560m ...' is requested
columns_by_lowercase = {str(column).lower(): column for column in df.columns}

# Create subplots for each language
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()  # Flatten the 2D array of axes for easier iteration

for idx, language in enumerate(languages):
    # Filter the DataFrame for the current language
    filtered_df = df[df['Folder Name'] == language]

    # Reshape the data so that each row corresponds to a model; frames are
    # collected and concatenated once
    model_frames = []
    for model in ['bloom-560m', 'bloom-1b1', 'bloom-1b7', 'bloom-3b']:
        # Select the score columns by name, so the shot labels do not depend
        # on the column order of the sheet
        wanted = [f'F1 {model} top_logprobs{suffix}' for suffix in column_suffixes]
        score_columns = [name if name in df.columns else columns_by_lowercase[name.lower()]
                         for name in wanted]
        model_data = filtered_df[score_columns].copy()
        model_data.columns = shot_labels
        model_frames.append(model_data.assign(Model=model))
    reshaped_df = pd.concat(model_frames, ignore_index=True)

    # Group by the 'Model' column and calculate the mean F1 scores for each shot
    grouped_df = reshaped_df.groupby('Model').mean()

    # Plot the mean F1 scores for each model
    for model in grouped_df.index:
        axes[idx].plot(shot_labels,
                       grouped_df.loc[model].values,
                       label=f'{model}')
    axes[idx].set_title(f'F1 scores for {language}')
    axes[idx].set_xlabel('Shot')
    axes[idx].set_ylabel('Mean F1 score')
    axes[idx].legend()

# Adjust layout to prevent overlap of subplots
plt.tight_layout()
plt.show()
