# In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer
import os
import csv



def login_to_huggingface(api_token):
    """Authenticate with the Hugging Face Hub.

    Args:
        api_token: Access token handed to ``huggingface_hub.login``.
    """
    login(token=api_token)


def initialize_tokenizer(model_name):
    """Load the tokenizer for ``model_name`` and report its vocabulary.

    Prints the model name together with the vocabulary size.

    Args:
        model_name: Model identifier passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(vocab_list, vocab_size)`` tuple: the keys of the tokenizer's
        ``get_vocab()`` mapping as a list, and the length of that list.
    """
    token_map = AutoTokenizer.from_pretrained(model_name).get_vocab()
    tokens = list(token_map)
    token_count = len(tokens)
    print(f"Model: {model_name}, Vocabulary Size: {token_count}")
    return tokens, token_count



def extract_vocab_from_file(file_path):
    """Return the unique, whitespace-stripped entries of a vocabulary file.

    Every line of the file is treated as one entry. Lines that are empty
    after stripping are ignored, so blank lines do not contribute a spurious
    ``''`` entry to the result.

    Args:
        file_path: Path to a UTF-8 text file with one entry per line.

    Returns:
        A set of the non-empty stripped lines.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading
    # byte-order mark, which would otherwise stick to the first entry.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return {entry for entry in stripped_lines if entry}



def process_language_files(directory, vocab_list, vocab_size, model_name):
    """Compare each ``.txt`` vocabulary file in ``directory`` with a tokenizer vocabulary.

    Args:
        directory: Directory whose ``.txt`` files are read with
            ``extract_vocab_from_file``.
        vocab_list: Iterable of tokenizer vocabulary entries to match against.
        vocab_size: Tokenizer vocabulary size; copied unchanged into each row.
        model_name: Model identifier; copied unchanged into each row.

    Returns:
        A list with one dict per processed file, ordered by filename, with
        the keys ``modelname``, ``language_name``, ``unicode_size``,
        ``matching_chars``, ``non_matching_chars`` and ``vocab_size``.
    """
    # Build the lookup set once; otherwise every file re-scans the whole
    # tokenizer vocabulary for both the intersection and the difference.
    tokenizer_vocab = set(vocab_list)
    results = []

    # os.listdir yields entries in arbitrary order; sorting keeps the row
    # order of the output reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            # A directory that happens to end in ".txt" cannot be opened
            # as a vocabulary file.
            continue

        vocab = extract_vocab_from_file(file_path)
        matching_chars = vocab & tokenizer_vocab
        non_matching_chars = vocab - tokenizer_vocab

        results.append({
            'modelname': model_name,
            # NOTE: everything after the first dot is dropped, so
            # "a.b.txt" is reported as "a".
            'language_name': filename.split('.')[0],
            'unicode_size': len(vocab),
            'matching_chars': len(matching_chars),
            'non_matching_chars': len(non_matching_chars),
            'vocab_size': vocab_size,
        })

    return results



def export_to_csv(results, output_file):
    """Write the result rows to ``output_file`` as CSV and print a confirmation.

    The file is opened in write mode, so existing content is replaced. A
    header row is always written, followed by one line per entry.

    Args:
        results: Iterable of dicts keyed by the column names below.
        output_file: Path of the CSV file to write.
    """
    columns = [
        'modelname',
        'language_name',
        'unicode_size',
        'matching_chars',
        'non_matching_chars',
        'vocab_size',
    ]

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        for row in results:
            table.writerow(row)

    print(f"Results have been exported to {output_file}")


def get_output_csv_filename(model_name):
    """Build the results CSV filename for a model identifier.

    Only the text after the last ``/`` in ``model_name`` is used, so an
    ``owner/model`` identifier yields ``model_vocab_results.csv``.

    Args:
        model_name: Model identifier, optionally prefixed with ``owner/``.

    Returns:
        The filename string ending in ``_vocab_results.csv``.
    """
    _, _, short_name = model_name.rpartition('/')
    return f"{short_name}_vocab_results.csv"


# Main pipeline
def main():
    """Run the vocabulary comparison for one model and export the results.

    Steps: log in to Hugging Face, load the model's tokenizer vocabulary,
    compare it with every language file in the vocabulary directory, and
    write the per-language counts to a CSV named after the model.
    """
    # Take the token from the HF_TOKEN environment variable instead of
    # committing it to source; falls back to the previous empty string.
    api_token = os.environ.get("HF_TOKEN", "")
    model_name = "numind/NuExtract-1.5"

    login_to_huggingface(api_token)

    vocab_list, vocab_size = initialize_tokenizer(model_name)

    # Directory containing language unicode files
    directory_path = "/content/drive/MyDrive/Vocab"
    # Derived from the model name so the two cannot drift apart.
    output_csv_file = get_output_csv_filename(model_name)

    results = process_language_files(directory_path, vocab_list, vocab_size, model_name)
    export_to_csv(results, output_csv_file)


# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
