In [1]:
import pandas as pd
import random
import nltk
# nltk.download('punkt')
from needle_dictionary import Needles, Cities

def haystack_builder(df, k, tokenizer, buffer_size=100):
    # Initialize variables
    concatenated_body = ""
    total_tokens = 0
    articles_added = 0
    first_article_added = False  # Flag to indicate whether the first article has been added

    # Iterate through each article in the DataFrame
    for index, row in df.iterrows():
        # Token size of the article body
        try:
            article_length = len(tokenizer.tokenize(row["body"]))
        except AttributeError:
            try:
                article_length = len(tokenizer.encode(row["body"]))
            except AttributeError:
                article_length = tokenizer(row["body"]).total_tokens
                time.sleep(1)

        # Check if adding the current article would exceed the token limit
        if not first_article_added and total_tokens + article_length > k * 1000 - buffer_size:
            # Skip the first article if it exceeds the token limit
            continue

        # Set the flag to indicate that the first article has been added
        first_article_added = True

        # Check if adding the current article would exceed the token limit
        if total_tokens + article_length <= k * 1000 - buffer_size:
            # Concatenate the body of the article
            concatenated_body += row["body"] + " "

            # Update total tokens and count of articles added
            total_tokens += article_length
        else:
            # Sentence tokenize the last article
            sentences = nltk.sent_tokenize(row["body"])
            for sentence in sentences:
                # Token length of the sentence
                try:
                    sentence_length = len(tokenizer.tokenize(sentence))
                except AttributeError:
                    try:
                        sentence_length = len(tokenizer.encode(sentence))
                    except AttributeError:
                        sentence_length = tokenizer(sentence).total_tokens
                        time.sleep(1)
                # Check if adding the sentence would exceed the token limit
                if total_tokens + sentence_length <= k * 1000 - buffer_size:
                    # Concatenate the sentence
                    concatenated_body += sentence + " "
                    # Update total tokens
                    total_tokens += sentence_length
                else:
                    # Break the loop if adding the sentence would exceed the token limit
                    break
            # Break the loop after processing the last article
            break
    print(f"Total tokens: {total_tokens}")
    return concatenated_body


def split_into_parts(text, tokenizer, percentage):
    if percentage == 0:
        return ["", text]
    elif percentage == 100:
        return [text, ""]
    else:
        # Splitting text into sentences
        sentences = nltk.sent_tokenize(text)

        # Calculating number of tokens
        try:
            total_tokens = len(tokenizer.tokenize(text))
        except AttributeError:
            try:
                total_tokens = len(tokenizer.encode(text))
            except AttributeError:
                total_tokens = tokenizer(text).total_tokens
                time.sleep(1)

        # Calculating the desired number of tokens for first part
        tokens_first_part = int(total_tokens * percentage/100)

        approximated_i = [1, 4]

        while approximated_i[-1] != approximated_i[-2]:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i[-1]])).total_tokens
                    time.sleep(1)
            approximated_i.append(int(tokens_first_part / current_length * approximated_i[-1]))
            if approximated_i[-1] in approximated_i[:-2]:
                approximated_i.append(min(approximated_i[-3:-1]))
                break
        while True:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i[-1]])).total_tokens
                    time.sleep(1)
            if current_length < tokens_first_part:
                approximated_i.append(approximated_i[-1] + 1)
            else:
                break
        parts = [' '.join(sentences[:approximated_i[-1]]), ' '.join(sentences[approximated_i[-1]:])]
        print(k, approximated_i)
        return parts


def generate_random_number(n_digits):
    lower_bound = 10**(n_digits-1)
    upper_bound = (10**n_digits)-1
    return random.randint(lower_bound, upper_bound)


def needles_builder(language, n_digits=7):
    # select a random city
    city = random.choice(Cities)[language]
    # select a random number with n_digits digits
    rnd_number = generate_random_number(n_digits)
    # create the needle
    needle = Needles[language].format(city=city, number=rnd_number)
    return needle, city, rnd_number

In [3]:
import nltk
from transformers import AutoTokenizer
import tiktoken
import pandas as pd
from transformers import GPT2TokenizerFast, PreTrainedTokenizerFast
import google.generativeai as genai
from config import api_key_google
from huggingface_hub import login
import time

tokenizer_name = 'GPT4o'

match tokenizer_name:
    case "GPT4":
        tokenizer = tiktoken.encoding_for_model("gpt-4")
    case "GPT4o":
        tokenizer = tiktoken.encoding_for_model("gpt-4o")
    case "Claude":
        tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
    case "Gemini":
        genai.configure(api_key=api_key_google)
        model = genai.GenerativeModel(model_name="gemini-1.0-pro")
        tokenizer = model.count_tokens
    case "YaRN":
        tokenizer = AutoTokenizer.from_pretrained("NousResearch/Yarn-Llama-2-7b-128k")
    case "Claude_downloaded":
        tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizers/claude-v1-tokenization.json")
    case "Llama":
        login("your_Huggingface_token")
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
    case _:
        raise ValueError("Unknown tokenizer_name specified. Please provide a valid tokenizer_name.")

ks = [2, 8, 16, 32, 64]

for language in ['English', 'Somali', 'Swahili', 'Indonesian', 'Azeri', 'Vietnamese']:
    df_return = pd.DataFrame()
    df = pd.read_csv(f"datasets/Hastacks/mBBC_2024/bbc_{language}.csv")
    for k in ks:
        haystack = haystack_builder(df, k, tokenizer)
        
        for percentage in [0, 25, 50, 75, 100]:
            parts = split_into_parts(haystack, tokenizer, percentage)
            needle, city, rnd_number = needles_builder(language)
            data = {
                "context_length": f"{k}k",
                "position": percentage,
                "text": parts[0] + ' ' + needle + ' ' + parts[1],
                "city": city,
                "label": rnd_number
            }
            df_return = pd.concat([df_return, pd.DataFrame([data])], ignore_index=True)
            df_return.to_csv(f"datasets/Haystack_Needles/Incongruous/1-Needle/mBBC_2024/{tokenizer_name}/{language}_needles.csv", index=False)

Total tokens: 1839
2 [1, 4, 5, 5, 6, 7, 8]
2 [1, 4, 10, 15, 15, 16, 17, 18]
2 [1, 4, 15, 23, 22, 22, 23]
Total tokens: 7887
8 [1, 4, 21, 31, 34, 34, 35, 36]
8 [1, 4, 43, 73, 72, 72, 73]
8 [1, 4, 65, 99, 124, 123, 122, 123, 122, 123]
Total tokens: 15891
16 [1, 4, 43, 74, 73, 73, 74]
16 [1, 4, 87, 156, 161, 162, 163, 164, 164, 165, 166, 167]
16 [1, 4, 130, 239, 247, 252, 254, 255, 255, 256, 257]
Total tokens: 31889
32 [1, 4, 87, 156, 161, 163, 164, 165, 166, 166, 167, 168]
32 [1, 4, 175, 337, 367, 360, 358, 362, 357, 362, 357, 358, 359, 360]
32 [1, 4, 262, 522, 509, 509, 510]
Total tokens: 63852
64 [1, 4, 175, 338, 366, 360, 359, 363, 358, 363, 358, 359, 360]
64 [1, 4, 350, 720, 733, 737, 738, 739, 739, 740]
64 [1, 4, 526, 1022, 1062, 1061, 1060, 1060, 1061]
Total tokens: 1871
2 [1, 4, 1, 1, 2]
2 [1, 4, 2, 3, 2, 2, 3]
2 [1, 4, 4, 5, 6]
Total tokens: 7840
8 [1, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 18]
8 [1, 4, 11, 25, 26, 26, 27, 28]
8 [1, 4, 17, 52, 42, 45, 42, 42, 43, 44, 

# Two-needles dataset

In [4]:
import random
def split_into_3_parts(text, tokenizer, percentage):

    # Splitting text into sentences
    sentences = nltk.sent_tokenize(text)

    # Calculating number of tokens
    try:
        total_tokens = len(tokenizer.tokenize(text))
    except AttributeError:
        try:
            total_tokens = len(tokenizer.encode(text))
        except AttributeError:
            total_tokens = tokenizer(text).total_tokens
            time.sleep(1)

    # Calculating the desired number of tokens for first part
    tokens_first_part = int(total_tokens * percentage/100)
    tokens_second_part = int(total_tokens * (percentage + 25)/100)

    approximated_i1 = [1, 4]
    
    if percentage == 0:
        approximated_i1 = [0]
    else:
        while approximated_i1[-1] != approximated_i1[-2]:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i1[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i1[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i1[-1]])).total_tokens
                    time.sleep(1)
            approximated_i1.append(int(tokens_first_part / current_length * approximated_i1[-1]))
            if approximated_i1[-1] in approximated_i1[:-2]:
                approximated_i1.append(min(approximated_i1[-3:-1]))
                break
        while True:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i1[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i1[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i1[-1]])).total_tokens
                    time.sleep(1)
            if current_length < tokens_first_part:
                approximated_i1.append(approximated_i1[-1] + 1)
            else:
                break
    if (percentage + 25) != 100:
        approximated_i2 = [approximated_i1[-1], approximated_i1[-1] + 1]
        while approximated_i2[-1] != approximated_i2[-2]:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i2[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i2[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i2[-1]])).total_tokens
                    time.sleep(1)
            approximated_i2.append(int(tokens_second_part / current_length * approximated_i2[-1]))
            if approximated_i2[-1] in approximated_i2[:-2]:
                approximated_i2.append(min(approximated_i2[-3:-1]))
                break
        while True:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i2[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i2[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i2[-1]])).total_tokens
                    time.sleep(1)
            if current_length < tokens_second_part:
                approximated_i2.append(approximated_i2[-1] + 1)
            else:
                break
    else:
        approximated_i2 = [len(sentences)]
    
    random_choice = random.randint(approximated_i1[-1], approximated_i2[-1])
    
    parts = [' '.join(sentences[:approximated_i1[-1]]), ' '.join(sentences[approximated_i1[-1]:random_choice]), ' '.join(sentences[random_choice:])]
    print(k, approximated_i1, approximated_i2)
    return parts

In [6]:
import nltk
from transformers import AutoTokenizer
import tiktoken
import pandas as pd
from transformers import GPT2TokenizerFast, PreTrainedTokenizerFast
import google.generativeai as genai
from config import api_key_google
import time

tokenizer_name = 'GPT4o'

match tokenizer_name:
    case "GPT4":
        tokenizer = tiktoken.encoding_for_model("gpt-4")
    case "GPT4o":
        tokenizer = tiktoken.encoding_for_model("gpt-4o")
    case "Claude":
        tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
    case "Gemini":
        genai.configure(api_key=api_key_google)
        model = genai.GenerativeModel(model_name="gemini-1.0-pro")
        tokenizer = model.count_tokens
    case "YaRN":
        tokenizer = AutoTokenizer.from_pretrained("NousResearch/Yarn-Llama-2-7b-128k")
    case "Claude_downloaded":
        tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizers/claude-v1-tokenization.json")
    case "Llama":
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
    case _:
        raise ValueError("Unknown tokenizer_name specified. Please provide a valid tokenizer_name.")

ks = [2, 8, 16, 32, 64]

for language in  ['English', 'Somali', 'Swahili', 'Indonesian', 'Azeri', 'Vietnamese']:
    df_return = pd.DataFrame()
    df = pd.read_csv(f"datasets/Hastacks/mBBC_2024/bbc_{language}.csv")
    for k in ks:
        haystack = haystack_builder(df, k, tokenizer)

        for percentage in [0, 25, 50, 75]:
            parts = split_into_3_parts(haystack, tokenizer, percentage)
            needle1, city1, rnd_number1 = needles_builder(language)
            needle2, city2, rnd_number2 = needles_builder(language)
            data = {
                "context_length": f"{k}k",
                "position": percentage,
                "text": parts[0] + ' ' + needle1 + ' ' + parts[1] + ' ' + needle2 + ' ' + parts[2],
                "city1": city1,
                "label1": rnd_number1,
                "city2": city2,
                "label2": rnd_number2
            }
            df_return = pd.concat([df_return, pd.DataFrame([data])], ignore_index=True)
            df_return.to_csv(f"datasets/Haystack_Needles/Incongruous/2-Needles/{tokenizer_name}/{language}_needles.csv", index=False)

Total tokens: 1839
2 [0] [0, 1, 5, 5, 6, 7, 8]
2 [1, 4, 5, 5, 6, 7, 8] [8, 9, 15, 15, 16, 17, 18]
2 [1, 4, 10, 15, 15, 16, 17, 18] [18, 19, 22, 22, 23]
2 [1, 4, 15, 23, 22, 22, 23] [33]
Total tokens: 7887
8 [0] [0, 1, 22, 31, 34, 34, 35, 36]
8 [1, 4, 21, 31, 34, 34, 35, 36] [36, 37, 72, 72, 73]
8 [1, 4, 43, 73, 72, 72, 73] [73, 74, 109, 123, 122, 123, 122, 123]
8 [1, 4, 65, 99, 124, 123, 122, 123, 122, 123] [166]
Total tokens: 15891
16 [0] [0, 1, 45, 70, 70, 71, 72, 73, 74]
16 [1, 4, 43, 74, 73, 73, 74] [74, 75, 147, 159, 161, 162, 163, 164, 164, 165, 166, 167]
16 [1, 4, 87, 156, 161, 162, 163, 164, 164, 165, 166, 167] [167, 168, 250, 253, 254, 255, 255, 256, 257]
16 [1, 4, 130, 239, 247, 252, 254, 255, 255, 256, 257] [360]
Total tokens: 31889
32 [0] [0, 1, 91, 158, 162, 163, 164, 165, 166, 166, 167, 168]
32 [1, 4, 87, 156, 161, 163, 164, 165, 166, 166, 167, 168] [168, 169, 333, 367, 360, 358, 362, 357, 362, 357, 358, 359, 360]
32 [1, 4, 175, 337, 367, 360, 358, 362, 357, 362, 357, 358

# Three-needles dataset

In [7]:
import random
def split_into_4_parts(text, tokenizer, percentage):

    # Splitting text into sentences
    sentences = nltk.sent_tokenize(text)

    # Calculating number of tokens
    try:
        total_tokens = len(tokenizer.tokenize(text))
    except AttributeError:
        try:
            total_tokens = len(tokenizer.encode(text))
        except AttributeError:
            total_tokens = tokenizer(text).total_tokens
            time.sleep(1)

    # Calculating the desired number of tokens for first part
    tokens_first_part = int(total_tokens * percentage/100)
    tokens_second_part = int(total_tokens * (percentage + 25)/100)

    approximated_i1 = [1, 4]

    if percentage == 0:
        approximated_i1 = [0]
    else:
        while approximated_i1[-1] != approximated_i1[-2]:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i1[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i1[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i1[-1]])).total_tokens
                    time.sleep(1)
            approximated_i1.append(int(tokens_first_part / current_length * approximated_i1[-1]))
            if approximated_i1[-1] in approximated_i1[:-2]:
                approximated_i1.append(min(approximated_i1[-3:-1]))
                break
        while True:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i1[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i1[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i1[-1]])).total_tokens
                    time.sleep(1)
            if current_length < tokens_first_part:
                approximated_i1.append(approximated_i1[-1] + 1)
            else:
                break
    if (percentage + 25) != 100:
        approximated_i2 = [approximated_i1[-1], approximated_i1[-1] + 1]
        while approximated_i2[-1] != approximated_i2[-2]:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i2[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i2[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i2[-1]])).total_tokens
                    time.sleep(1)
            approximated_i2.append(int(tokens_second_part / current_length * approximated_i2[-1]))
            if approximated_i2[-1] in approximated_i2[:-2]:
                approximated_i2.append(min(approximated_i2[-3:-1]))
                break
        while True:
            try:
                current_length = len(tokenizer.tokenize(" ".join(sentences[:approximated_i2[-1]])))
            except AttributeError:
                try:
                    current_length = len(tokenizer.encode(" ".join(sentences[:approximated_i2[-1]])))
                except AttributeError:
                    current_length = tokenizer(" ".join(sentences[:approximated_i2[-1]])).total_tokens
                    time.sleep(1)
            if current_length < tokens_second_part:
                approximated_i2.append(approximated_i2[-1] + 1)
            else:
                break
    else:
        approximated_i2 = [len(sentences)]

    # Get the range for selection
    start_range = min(approximated_i1[-1], approximated_i2[-1])
    end_range = max(approximated_i1[-1], approximated_i2[-1])
    
    print(start_range, end_range)

    # Select 2 numbers randomly from the range
    try:
        random_numbers = random.sample(range(start_range, end_range + 1), 2)
    except ValueError:
        random_numbers = [start_range, end_range]
    random_numbers.sort()

    parts = [' '.join(sentences[:approximated_i1[-1]]), ' '.join(sentences[approximated_i1[-1]:random_numbers[0]]), ' '.join(sentences[random_numbers[0]:random_numbers[1]]), ' '.join(sentences[random_numbers[1]:])]
    print(k, approximated_i1, approximated_i2)
    return parts

In [9]:
import nltk
from transformers import AutoTokenizer
import tiktoken
import pandas as pd
from transformers import GPT2TokenizerFast, PreTrainedTokenizerFast
import google.generativeai as genai
from config import api_key_google
import time

tokenizer_name = 'GPT4o'

match tokenizer_name:
    case "GPT4":
        tokenizer = tiktoken.encoding_for_model("gpt-4")
    case "GPT4o":
        tokenizer = tiktoken.encoding_for_model("gpt-4o")
    case "Claude":
        tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
    case "Gemini":
        genai.configure(api_key=api_key_google)
        model = genai.GenerativeModel(model_name="gemini-1.0-pro")
        tokenizer = model.count_tokens
    case "YaRN":
        tokenizer = AutoTokenizer.from_pretrained("NousResearch/Yarn-Llama-2-7b-128k")
    case "Claude_downloaded":
        tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizers/claude-v1-tokenization.json")
    case "Llama":
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
    case _:
        raise ValueError("Unknown tokenizer_name specified. Please provide a valid tokenizer_name.")

ks = [2, 8, 16, 32, 64]

for language in ['English', 'Somali', 'Swahili', 'Indonesian', 'Azeri', 'Vietnamese']:
    df_return = pd.DataFrame()
    df = pd.read_csv(f"datasets/Hastacks/mBBC_2024/bbc_{language}.csv")
    for k in ks:
        haystack = haystack_builder(df, k, tokenizer)

        for percentage in [0, 25, 50, 75]:
            parts = split_into_4_parts(haystack, tokenizer, percentage)
            city1, city2, city3 = "", "", ""
            while city1 == city2 or city1 == city3 or city2 == city3:
                needle1, city1, rnd_number1 = needles_builder(language)
                needle2, city2, rnd_number2 = needles_builder(language)
                needle3, city3, rnd_number3 = needles_builder(language)
            data = {
                "context_length": f"{k}k",
                "position": percentage,
                "text": parts[0] + ' ' + needle1 + ' ' + parts[1] + ' ' + needle2 + ' ' + parts[2] + ' ' + needle3 + ' ' + parts[3],
                "city1": city1,
                "label1": rnd_number1,
                "city2": city2,
                "label2": rnd_number2,
                "city3": city3,
                "label3": rnd_number3
            }
            df_return = pd.concat([df_return, pd.DataFrame([data])], ignore_index=True)
            df_return.to_csv(f"datasets/Haystack_Needles/Incongruous/3-Needles/{tokenizer_name}/{language}_needles.csv", index=False)

Total tokens: 1839
0 8
2 [0] [0, 1, 5, 5, 6, 7, 8]
8 18
2 [1, 4, 5, 5, 6, 7, 8] [8, 9, 15, 15, 16, 17, 18]
18 23
2 [1, 4, 10, 15, 15, 16, 17, 18] [18, 19, 22, 22, 23]
23 33
2 [1, 4, 15, 23, 22, 22, 23] [33]
Total tokens: 7887
0 36
8 [0] [0, 1, 22, 31, 34, 34, 35, 36]
36 73
8 [1, 4, 21, 31, 34, 34, 35, 36] [36, 37, 72, 72, 73]
73 123
8 [1, 4, 43, 73, 72, 72, 73] [73, 74, 109, 123, 122, 123, 122, 123]
123 166
8 [1, 4, 65, 99, 124, 123, 122, 123, 122, 123] [166]
Total tokens: 15891
0 74
16 [0] [0, 1, 45, 70, 70, 71, 72, 73, 74]
74 167
16 [1, 4, 43, 74, 73, 73, 74] [74, 75, 147, 159, 161, 162, 163, 164, 164, 165, 166, 167]
167 257
16 [1, 4, 87, 156, 161, 162, 163, 164, 164, 165, 166, 167] [167, 168, 250, 253, 254, 255, 255, 256, 257]
257 360
16 [1, 4, 130, 239, 247, 252, 254, 255, 255, 256, 257] [360]
Total tokens: 31889
0 168
32 [0] [0, 1, 91, 158, 162, 163, 164, 165, 166, 166, 167, 168]
168 360
32 [1, 4, 87, 156, 161, 163, 164, 165, 166, 166, 167, 168] [168, 169, 333, 367, 360, 358, 362,

# Calculate fertility score

In [None]:
import pandas as pd
import time
from nltk.tokenize import word_tokenize
from tabulate import tabulate
from transformers import GPT2TokenizerFast, AutoTokenizer, PreTrainedTokenizerFast

# Initialize a dictionary to store the results
results = {}

# Iterate over languages
for language in ['English', 'Somali', 'Swahili', 'Indonesian', 'Azeri', 'Vietnamese']:
    # Initialize a list to store results for this language
    language_results = []

    # Iterate over tokenizer names
    for tokenizer_name in ['GPT4', 'GPT4o', 'Llama']:
        match tokenizer_name:
            case "GPT4":
                tokenizer = tiktoken.encoding_for_model("gpt-4")
            case "GPT4o":
                tokenizer = tiktoken.encoding_for_model("gpt-4o")
            case "Claude":
                tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
            case "Gemini":
                genai.configure(api_key=api_key_google)
                model = genai.GenerativeModel(model_name="gemini-1.0-pro")
                tokenizer = model.count_tokens
            case "YaRN":
                tokenizer = AutoTokenizer.from_pretrained("NousResearch/Yarn-Llama-2-7b-128k")
            case "Claude_downloaded":
                tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizers/claude-v1-tokenization.json")
            case "Llama":
                tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
            case _:
                raise ValueError("Unknown tokenizer_name specified. Please provide a valid tokenizer_name.")

        # Process text and calculate fertility score
        df = pd.read_csv(f"datasets/Hastacks/mBBC_2024/bbc_{language}.csv")
        text = " ".join(df['body'][:50])
        try:
            total_tokens = len(tokenizer.tokenize(text))
        except AttributeError:
            try:
                total_tokens = len(tokenizer.encode(text))
            except AttributeError:
                total_tokens = tokenizer(text).total_tokens
                time.sleep(1)
        total_words = len(word_tokenize(text))
        fertility_score = total_tokens / total_words

        # Append fertility score to language_results list
        language_results.append(fertility_score)

    # Add language_results to the results dictionary with the language as key
    results[language] = language_results

# Print the results in a table format
tokenizers = ['GPT-4', 'GPT-4o', 'Llama-3']
print(tabulate(results.values(), headers=tokenizers, showindex=results.keys(), tablefmt="grid"))

+------------+---------+----------+-----------+
|            |   GPT-4 |   GPT-4o |   Llama-3 |
| English    | 1.13002 |  1.113   |   1.12959 |
+------------+---------+----------+-----------+
| Somali     | 2.37535 |  1.78619 |   2.35605 |
+------------+---------+----------+-----------+
| Swahili    | 2.23791 |  1.67568 |   2.21385 |
+------------+---------+----------+-----------+
| Indonesian | 1.92439 |  1.55291 |   1.91363 |
+------------+---------+----------+-----------+
| Azeri      | 3.37034 |  2.17708 |   3.10339 |
+------------+---------+----------+-----------+
| Vietnamese | 2.08932 |  1.2865  |   1.27364 |
+------------+---------+----------+-----------+


## Task 3: Find Articles

In [3]:
import pandas as pd
import math

for language in ['English']:
    df = pd.read_csv(f"datasets/Hastacks/mBBC_2024/bbc_{language}.csv")
    ## filter dataframe which the url has 'articles'
    df = df[df['url'].str.contains('articles')]
    
    # sort df by the length of the body
    df['body_length'] = df['body'].apply(lambda x: len(x.split()))
    df = df.sort_values(by='body_length', ascending=False, ignore_index=True)[:40]
    
    #shuffle the dataframe
    df = df.sample(frac=1).reset_index(drop=True)
    
    dataset = pd.DataFrame()
    for length in [10, 20, 30, 40]:
        concatenated_bodies = [f"Article {index+1}: {body}" for index, body in df['body'][:length].items()]
        concatenated_bodies = "\n".join(concatenated_bodies)
        for depth in [0, 25, 50, 75, 100]:
            # calculate the index of the article by ceiling division
            depth_index = 0 if depth ==0 else math.ceil(length * depth / 100)-1
            dictionary = {
                "context_length": f"{length}",
                "position": f"%{depth}",
                "text": concatenated_bodies,
                "title": df['title'][depth_index],
                "label": f"Article {depth_index+1}"
            }
            dataset = pd.concat([dataset, pd.DataFrame([dictionary])], ignore_index=True)
    # dataset.to_csv(f"datasets/Find_article/{language}.csv", index=False)

In [29]:
import pandas as pd
import re
import spacy

def count_cities(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    city_count = sum(1 for ent in doc.ents if ent.label_ == 'GPE')  # Count entities labeled as geopolitical entities (cities)
    return city_count

def count_numbers(text):
    numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', text.replace(",", ""))  # Remove commas temporarily
    return len(numbers)

for language in ['English', 'Somali', 'Swahili', 'Indonesian', 'Vietnamese']:
    df = pd.read_csv(f"datasets/Haystack_Needles/Incongruous/3-Needles/Gemini/{language}_needles.csv")
    text = list(df['text'])[-1]
    print(f"Number of numbers in {language} haystack: {count_numbers(text) - 3}")
    print(f"Number of geopolitical entities in {language} haystack: {count_cities(text) - 3}")

Number of numbers in English haystack: 711
Number of geopolitical entities in English haystack: 572
Number of numbers in Somali haystack: 148
Number of geopolitical entities in Somali haystack: 1054
Number of numbers in Swahili haystack: 419
Number of geopolitical entities in Swahili haystack: 1347
Number of numbers in Indonesian haystack: 664
Number of geopolitical entities in Indonesian haystack: 1786
Number of numbers in Vietnamese haystack: 636
Number of geopolitical entities in Vietnamese haystack: 790
