In [11]:
## ZeroShot learning for SIB-200 dataset on Bloom-560M model with sampling

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

# Load ZeroShot learning model and tokenizer
model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m")
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of labels to use for ZeroShot learning
list_of_labels = ["science/technology", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Function to predict category given text
def predict_category(text):
    prompt = f"\"{text}\" What category does this sentence belong to? {', '.join(list_of_labels)}?? The correct answer is:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    result_length = len(tokens) + 10
    generated_text = tokenizer.decode(model.generate(inputs["input_ids"],
                                                     max_length=result_length,
                                                     do_sample=True,
                                                     top_k=50,
                                                     top_p=0.9
                                                     )[0])

    del inputs
    
    found_labels = 0
    found_label = ""

    for label in list_of_labels:
        if label in generated_text[generated_text.find('??')+3:].lower():
            found_labels += 1
            found_label = label

    if found_labels == 1:
        return found_label, generated_text
    else:
        return "N/A", generated_text

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/Bloom-560M/sampling"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Iterate through subdirectories in the data directory
for language in os.listdir(data_directory):
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=['text', 'actual_category', 'predicted_category', 'generated_text'])
    
    subdir = os.path.join(data_directory, language)
    for file in os.listdir(subdir):
        # Check if the file is a test.tsv file
        if file.endswith("test.tsv"):
            file_path = os.path.join(subdir, file)

            # Read the test.tsv file into a DataFrame
            df = pd.read_csv(file_path, sep='\t')

            # Iterate through rows and compare predicted category with actual category
            for index, row in df.iterrows():
                text = row['text']
                actual_category = row['category']

                # Predict category using your ZeroShot learning model
                predicted_category, generated_text = predict_category(text)

                # Append the results to the DataFrame
                results_df = results_df.append({'text': text,
                                                'actual_category': actual_category,
                                                'predicted_category': predicted_category,
                                                'generated_text': generated_text}, ignore_index=True)
                torch.cuda.empty_cache()
            # Save the results DataFrame to a CSV file in the output directory
            results_file_path = os.path.join(output_directory, f'{language}.csv')
            results_df.to_csv(results_file_path, index=False)
            
            print(f"Results saved to {results_file_path}")

Results saved to zero-shot/sib-200/Bloom-560M/sampling/bjn_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/epo_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/kas_Deva.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/mni_Beng.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/guj_Gujr.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/lvs_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/kat_Geor.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/asm_Beng.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/nus_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/ibo_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/ewe_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/acq_Arab.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/run_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sampling/zul_Latn.csv
Results saved to zero-shot/sib-200/Bloom-560M/sa

In [1]:
## ZeroShot learning for SIB-200 dataset on Bloom-560M model with beam search

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

# Load ZeroShot learning model and tokenizer
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m")
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of labels to use for ZeroShot learning
list_of_labels = ["science/technology", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Function to predict category given text
def predict_category(text):
    prompt = f"\"{text}\" What category does this sentence belong to? {', '.join(list_of_labels)}?? The correct answer is:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    result_length = len(tokens) + 10
    generated_text = tokenizer.decode(model.generate(inputs["input_ids"],
                                                     max_length=result_length,
                                                     num_beams=4,
                                                     no_repeat_ngram_size=2,
                                                     early_stopping=True
                                                     )[0])

    del inputs

    found_labels = 0
    found_label = ""

    for label in list_of_labels:
        if label in generated_text[generated_text.find('??')+3:].lower():
            found_labels += 1
            found_label = label

    if found_labels == 1:
        return found_label, generated_text
    else:
        return "N/A", generated_text

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/Bloomz-560M/beam-search"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Iterate through subdirectories in the data directory
for language in os.listdir(data_directory):
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=['text', 'actual_category', 'predicted_category', 'generated_text'])

    subdir = os.path.join(data_directory, language)
    for file in os.listdir(subdir):
        # Check if the file is a test.tsv file
        if file.endswith("test.tsv"):
            file_path = os.path.join(subdir, file)

            # Read the test.tsv file into a DataFrame
            df = pd.read_csv(file_path, sep='\t')

            # Iterate through rows and compare predicted category with actual category
            for index, row in df.iterrows():
                text = row['text']
                actual_category = row['category']

                # Predict category using your ZeroShot learning model
                predicted_category, generated_text = predict_category(text)

                # Append the results to the DataFrame
                results_df = results_df.append({'text': text,
                                                'actual_category': actual_category,
                                                'predicted_category': predicted_category,
                                                'generated_text': generated_text}, ignore_index=True)
                torch.cuda.empty_cache()
            # Save the results DataFrame to a CSV file in the output directory
            results_file_path = os.path.join(output_directory, f'{language}.csv')
            results_df.to_csv(results_file_path, index=False)

            print(f"Results saved to {results_file_path}")

Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/bjn_Latn.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/epo_Latn.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/kas_Deva.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/mni_Beng.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/guj_Gujr.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/lvs_Latn.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/kat_Geor.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/asm_Beng.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/nus_Latn.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/ibo_Latn.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/ewe_Latn.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/acq_Arab.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/run_Latn.csv
Results saved to zero-shot/sib-200/Bloomz-560M/beam-search/zul_L

In [1]:
## ZeroShot learning for SIB-200 dataset on PolyLM model with sampling

from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import pandas as pd
import torch
import warnings

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

model_path = "DAMO-NLP-MT/polylm-1.7b"

# Load ZeroShot learning model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of labels to use for ZeroShot learning
list_of_labels = ["science/technology", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Function to predict category given text
def predict_category(text):
    prompt = f"\"{text}\" What category does this sentence belong to? {', '.join(list_of_labels)}?? The correct answer is:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    result_length = len(tokens) + 10
    generated_text = tokenizer.decode(model.generate(inputs["input_ids"],
                                                     attention_mask=inputs.attention_mask,
                                                     max_length=result_length,
                                                     do_sample=True,
                                                     top_k=50,
                                                     top_p=0.9,
                                                     pad_token_id=tokenizer.eos_token_id
                                                     )[0])

    del inputs
    
    found_labels = 0
    found_label = ""

    for label in list_of_labels:
        if label in generated_text[generated_text.find('??')+3:].lower():
            found_labels += 1
            found_label = label

    if found_labels == 1:
        return found_label, generated_text
    else:
        return "N/A", generated_text

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/PolyLM-1.7b/sampling"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Iterate through subdirectories in the data directory
for language in os.listdir(data_directory):
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=['text', 'actual_category', 'predicted_category', 'generated_text'])

    subdir = os.path.join(data_directory, language)
    for file in os.listdir(subdir):
        # Check if the file is a test.tsv file
        if file.endswith("test.tsv"):
            file_path = os.path.join(subdir, file)

            # Read the test.tsv file into a DataFrame
            df = pd.read_csv(file_path, sep='\t')

            # Iterate through rows and compare predicted category with actual category
            for index, row in df.iterrows():
                text = row['text']
                actual_category = row['category']

                # Predict category using your ZeroShot learning model
                predicted_category, generated_text = predict_category(text)

                # Append the results to the DataFrame
                results_df = results_df.append({'text': text,
                                                'actual_category': actual_category,
                                                'predicted_category': predicted_category,
                                                'generated_text': generated_text}, ignore_index=True)
                torch.cuda.empty_cache()
            # Save the results DataFrame to a CSV file in the output directory
            results_file_path = os.path.join(output_directory, f'{language}.csv')
            results_df.to_csv(results_file_path, index=False)

            print(f"Results saved to {results_file_path}")

Results saved to zero-shot/sib-200/PolyML-1b/sampling/bjn_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/epo_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/kas_Deva.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/mni_Beng.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/guj_Gujr.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/lvs_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/kat_Geor.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/asm_Beng.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/nus_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/ibo_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/ewe_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/acq_Arab.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/run_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/zul_Latn.csv
Results saved to zero-shot/sib-200/PolyML-1b/sampling/kea_Latn

In [1]:
## ZeroShot learning for SIB-200 dataset on PolyLM model with beam search

from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import pandas as pd
import torch
import warnings

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

model_path = "DAMO-NLP-MT/polylm-1.7b"

# Load ZeroShot learning model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of labels to use for ZeroShot learning
list_of_labels = ["science/technology", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Function to predict category given text
def predict_category(text):
    prompt = f"\"{text}\" What category does this sentence belong to? {', '.join(list_of_labels)}?? The correct answer is:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    result_length = len(tokens) + 10
    generated_text = tokenizer.decode(model.generate(inputs["input_ids"],
                                                     attention_mask=inputs.attention_mask,
                                                     max_length=result_length,
                                                     num_beams=4,
                                                     no_repeat_ngram_size=2,
                                                     early_stopping=True,
                                                     pad_token_id=tokenizer.eos_token_id
                                                     )[0])

    del inputs

    found_labels = 0
    found_label = ""

    for label in list_of_labels:
        if label in generated_text[generated_text.find('??')+3:].lower():
            found_labels += 1
            found_label = label

    if found_labels == 1:
        return found_label, generated_text
    else:
        return "N/A", generated_text

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/PolyLM-1.7b/beam-search"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Iterate through subdirectories in the data directory
for language in os.listdir(data_directory)[57:58]:
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=['text', 'actual_category', 'predicted_category', 'generated_text'])

    subdir = os.path.join(data_directory, language)
    for file in os.listdir(subdir):
        # Check if the file is a test.tsv file
        if file.endswith("test.tsv"):
            file_path = os.path.join(subdir, file)

            # Read the test.tsv file into a DataFrame
            df = pd.read_csv(file_path, sep='\t')

            # Iterate through rows and compare predicted category with actual category
            for index, row in df.iterrows():
                text = row['text']
                actual_category = row['category']

                # Predict category using your ZeroShot learning model
                predicted_category, generated_text = predict_category(text)

                # Append the results to the DataFrame
                results_df = results_df.append({'text': text,
                                                'actual_category': actual_category,
                                                'predicted_category': predicted_category,
                                                'generated_text': generated_text}, ignore_index=True)
                torch.cuda.empty_cache()
            # Save the results DataFrame to a CSV file in the output directory
            results_file_path = os.path.join(output_directory, f'{language}.csv')
            results_df.to_csv(results_file_path, index=False)

            print(f"Results saved to {results_file_path}")

Results saved to zero-shot/sib-200/PolyML-1b/beams-search/shn_Mymr.csv


In [2]:
import pandas as pd
from sklearn.metrics import f1_score

# Read the Excel file into a DataFrame
df = pd.read_excel("SIB-200 languages - ACL.xlsx")

# Calculte F1 score for each language and add it to the DataFrame
def calculate_f1(language, model, method):
    # Read the CSV file into a DataFrame
    results_df = pd.read_csv(f"zero-shot/sib-200/{model}/{method}/{language}.csv")

    # Replace NaN values with 'N/A'
    results_df = results_df.fillna('N/A')

    # Calculate the F1 score
    f1 = f1_score(results_df['actual_category'], results_df['predicted_category'], average='macro')

    # Add the F1 score to the DataFrame
    df.loc[df['Folder Name'] == language, f'F1 {model} {method}'] = f1

# Iterate through languages and calculate F1 score for each language
for language in df['Folder Name']:
    # calculate_f1(language, "Bloom-560M", "sampling")
    calculate_f1(language, "Bloomz-560M", "beam-search")
    # calculate_f1(language, "PolyLM-1.7b", "sampling")
    # calculate_f1(language, "PolyLM-1.7b", "beam-search")

# Save the updated DataFrame to the Excel file
df.to_excel("SIB-200 languages - ACL.xlsx", index=False)

In [1]:
#Ameeta
## ZeroShot learning for SIB-200 dataset on Bloom-560M model with sampling

from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import os
import pandas as pd
import torch
import warnings

# Filter out FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Assuming data directory contains multiple subdirectories with test.tsv files
data_directory = "sib-200/data/annotated"

# Load ZeroShot learning model and tokenizer
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-7b1")
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-7b1")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of labels to use for ZeroShot learning
list_of_labels = ["science", "travel", "politics", "sports", "health", "entertainment", "geography"]

# Function to predict category given text
def predict_category(text):
    prompt = f"\"{text}\" What category does this sentence belong to? {', '.join(list_of_labels)}?? The correct answer is:"
    # prompt = f"Classify each sentence into one of 7 classes: [{', '.join(list_of_labels)}] \n Sentence: {text} \n Class:"
    # prompt = f"Here is a sentence: \"{text}\" This is list of categories: {', '.join(list_of_labels)}. \n What category does this sentence belong to? Give me the correct category without extra text. "
    # prompt = f"SENTENCE:\n {text} \n Is this SENTENCE science, travel, politics, sports, health, entertainment or geography? \nOPTIONS:\n-science \n-travel \n-politics \n-sports \n-health \n-entertainment \n-geography \n-ANSWER:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    result_length = len(tokens) + 10
    generated_text = tokenizer.decode(model.generate(inputs["input_ids"],
                                                     max_length=result_length,
                                                     do_sample=True,
                                                     top_k=50,
                                                     top_p=0.9
                                                     )[0])

    del inputs

    found_labels = 0
    found_label = ""

    for label in list_of_labels:
        if label in generated_text[generated_text.find('??')+3:].lower():
            found_labels += 1
            found_label = label

    if found_labels == 1:
        return found_label, generated_text
    else:
        return "N/A", generated_text

# Output directory for saving DataFrames
output_directory = "zero-shot/sib-200/Bloom-560M/sampling/Ameeta"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Iterate through subdirectories in the data directory
# for language in os.listdir(data_directory):
for language in ['eng_Latn']:
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=['text', 'actual_category', 'predicted_category', 'generated_text'])

    subdir = os.path.join(data_directory, language)
    for file in os.listdir(subdir):
        # Check if the file is a test.tsv file
        if file.endswith("test.tsv"):
            file_path = os.path.join(subdir, file)

            # Read the test.tsv file into a DataFrame
            df = pd.read_csv(file_path, sep='\t')

            # Iterate through rows and compare predicted category with actual category
            for index, row in df.iterrows():
                text = row['text']
                actual_category = row['category']

                # Predict category using your ZeroShot learning model
                predicted_category, generated_text = predict_category(text)

                # Append the results to the DataFrame
                results_df = results_df.append({'text': text,
                                                'actual_category': actual_category,
                                                'predicted_category': predicted_category,
                                                'generated_text': generated_text}, ignore_index=True)
                torch.cuda.empty_cache()
            # Save the results DataFrame to a CSV file in the output directory
            results_file_path = os.path.join(output_directory, f'{language}.csv')
            results_df.to_csv(results_file_path, index=False)

            print(f"Results saved to {results_file_path}")

Downloading config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/14.1G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/223 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

OutOfMemoryError: HIP out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacty of 15.98 GiB of which 202.00 MiB is free. Of the allocated memory 15.58 GiB is allocated by PyTorch, and 752.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF