In [16]:
from transformers import AutoTokenizer
import json
import os
# Alternative tokenizers, kept commented out so the tokenization scheme can be switched:
#tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl") #BPE tokenizer
#tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") #WordPiece tokenizer
# NOTE(review): the cached *.tok.json files are looked up by path only and do not
# record which tokenizer produced them; clear them after switching tokenizers.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") #Unigram tokenizer

# Path prefix of the input folders; the part index (0-9) is appended to it.
real_path = "./distribution/real/part_"
# NOTE(review): tokenized_path is not referenced in the cells shown here;
# conservative_tokenize derives its output location from the input path instead.
tokenized_path = "./distribution/tokenized/part_"

def walk_directory(directory):
    """Recursively collect the paths of all ``.json`` files under ``directory``.

    Only the file-name suffix is checked, so names such as ``x.tok.json``
    match as well.

    Args:
        directory: Root folder to search; sub-folders are included.

    Returns:
        A sorted list of paths, each built as ``os.path.join(root, name)``.
        The list is empty when the folder is missing or holds no ``.json``
        files.
    """
    json_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith(".json")
    ]
    # os.walk reports entries in filesystem order, which is arbitrary on many
    # platforms; sorting keeps the processing order (and any partitioning that
    # is derived from it) reproducible between runs and machines.
    json_paths.sort()
    return json_paths
# Split l into n interleaved parts (part i gets items i, i+n, i+2n, ...); part sizes differ by at most one
def divide_into_parts(l, n):
    """Deal the items of ``l`` round-robin into ``n`` parts.

    Part ``k`` receives the items at positions ``k, k + n, k + 2n, ...``, so
    the part sizes differ by at most one.

    Args:
        l: Sequence to split (anything that supports extended slicing).
        n: Number of parts to produce.

    Returns:
        A list with ``n`` slices of ``l``.
    """
    parts = []
    for offset in range(n):
        # Every n-th item starting at `offset` belongs to this part.
        parts.append(l[offset::n])
    return parts

In [17]:
def create_article_list(file):
    """Load one dataset JSON file and return the five article texts it holds.

    Args:
        file: Path to a JSON file with an ``"original_doc"`` entry and an
            ``"articles"`` mapping keyed by model name, where each model entry
            has an ``"article"`` field.

    Returns:
        A list with the original document followed by the ``gpt4o``,
        ``claude3.5sonnet``, ``llama3.1-405b`` and ``qwen1.5-110b`` articles,
        in that order.

    Raises:
        KeyError: If one of the expected keys is missing from the file.
    """
    model_names = ("gpt4o", "claude3.5sonnet", "llama3.1-405b", "qwen1.5-110b")
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # uses the locale default (e.g. cp1252 on Windows), which fails on or
    # garbles non-ASCII article text.
    with open(file, "r", encoding="utf-8") as fl:
        doc = json.load(fl)
    articles = [doc["original_doc"]]
    articles.extend(doc["articles"][name]["article"] for name in model_names)
    return articles

def conservative_tokenize(path):
    """Return the token-frequency table for one dataset file, cached on disk.

    The cache file mirrors the input path with the first ``/real`` replaced by
    ``/tokenized`` and the ``.json`` extension replaced by ``.tok.json``. If
    that file already exists it is loaded and returned as-is; otherwise every
    article in ``path`` is stripped, lower-cased, tokenized (truncated to 512
    tokens) and counted, and the result is written to the cache file.

    Cache files written by an earlier version that counted only the last
    article must be deleted before they are rebuilt with full counts.

    Args:
        path: Path to an input JSON file; must contain ``/real``.

    Returns:
        A dict mapping each decoded token string to its occurrence count
        summed over all articles of the file.

    Raises:
        ValueError: If ``path`` does not contain ``/real``.
    """
    # partition() splits on the first occurrence only, so later occurrences
    # of "/real" in the path are carried over unchanged instead of being lost.
    head, marker, tail = path.partition("/real")
    if not marker:
        raise ValueError(f"Expected '/real' in input path, got {path!r}")
    mirrored = head + "/tokenized" + tail
    # Swap only the trailing extension, not every ".json" inside the path.
    stem = mirrored[:-len(".json")] if mirrored.endswith(".json") else mirrored
    output_path = stem + ".tok.json"

    if os.path.exists(output_path):
        print(f"Loading existing token distribution from {output_path}")
        with open(output_path, "r", encoding="utf-8") as f:
            return json.load(f)

    print(f"Tokenizing {path} and saving to {output_path}")
    token_distribution = {}
    for article in create_article_list(path):
        article = article.strip().lower()
        data = tokenizer(article, return_tensors="pt", truncation=True, max_length=512)
        # Count inside the article loop so that every article contributes,
        # not just the last one tokenized.
        for token_id in data["input_ids"][0]:
            token = tokenizer.decode(token_id.item())
            token_distribution[token] = token_distribution.get(token, 0) + 1

    # Create the mirrored output folder on first use.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as fd:
        json.dump(token_distribution, fd, indent=4)
    return token_distribution


In [None]:

def _sorted_by_count(distribution):
    """Return a copy of ``distribution`` ordered by descending count."""
    return dict(sorted(distribution.items(), key=lambda item: item[1], reverse=True))


def create_distribution(piecelist):
    """Aggregate token counts for a batch of files into the global table.

    Each file's counts come from ``conservative_tokenize``. The batch totals
    are added to the counts already stored in
    ``./json_files/token_distribution.json`` (if that file exists), and the
    result is written back sorted by descending count.

    Because the stored counts are added to, calling this again for the same
    files counts them again.

    Args:
        piecelist: Iterable of input JSON file paths.

    Returns:
        The merged distribution that was written to disk, as a dict ordered
        by descending count.
    """
    final_token_distribution = {}
    for file in piecelist:
        print(f"Processing file: {file}")
        for token, count in conservative_tokenize(file).items():
            final_token_distribution[token] = final_token_distribution.get(token, 0) + count
    sorted_distribution = _sorted_by_count(final_token_distribution)

    output_file = "./json_files/token_distribution.json"
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            existing_distribution = json.load(f)
        # Merge in batch-sorted order so ties keep a stable, predictable order.
        for token, count in sorted_distribution.items():
            existing_distribution[token] = existing_distribution.get(token, 0) + count
        sorted_distribution = _sorted_by_count(existing_distribution)

    # The output folder may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # Mode "w" truncates, so the old file needs no explicit removal.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(sorted_distribution, f, indent=4)
    return sorted_distribution

In [19]:
# Disabled one-off setup step, kept as a string literal so it does not run.
# When enabled it spreads the .json files found under ./farad across ten part
# folders by moving them with os.rename.
# NOTE(review): the targets here are ./distribution/part_{i}, whereas the
# processing cell reads from ./distribution/real/part_{i} (see real_path);
# confirm the intended layout before re-enabling.
'''jsonfiles = walk_directory("./farad")
parts = divide_into_parts(jsonfiles, 10)
#move each part into its separate folder
for i, part in enumerate(parts):
    os.makedirs(f"./distribution/part_{i}", exist_ok=True)
    #put all files in the part into the folder
    for file in part:
        os.rename(file, f"./distribution/part_{i}/{os.path.basename(file)}")'''

'jsonfiles = walk_directory("./farad")\nparts = divide_into_parts(jsonfiles, 10)\n#move each part into its separate folder\nfor i, part in enumerate(parts):\n    os.makedirs(f"./distribution/part_{i}", exist_ok=True)\n    #put all files in the part into the folder\n    for file in part:\n        os.rename(file, f"./distribution/part_{i}/{os.path.basename(file)}")'

In [20]:
# Process the ten part folders one at a time (part_0 .. part_9).
# NOTE: create_distribution adds each part's counts to whatever is already in
# ./json_files/token_distribution.json, so running this cell again counts every
# file a second time; delete that file first for a clean rebuild.

for i in range(0,10):
    piece = walk_directory(f"{real_path}{i}")
    token_distribution = create_distribution(piece)

Processing file: ./distribution/real/part_0\0000.json
Tokenizing ./distribution/real/part_0\0000.json and saving to ./distribution/tokenized/part_0\0000.tok.json
Processing file: ./distribution/real/part_0\0010.json
Tokenizing ./distribution/real/part_0\0010.json and saving to ./distribution/tokenized/part_0\0010.tok.json
Processing file: ./distribution/real/part_0\0020.json
Tokenizing ./distribution/real/part_0\0020.json and saving to ./distribution/tokenized/part_0\0020.tok.json
Processing file: ./distribution/real/part_0\0030.json
Tokenizing ./distribution/real/part_0\0030.json and saving to ./distribution/tokenized/part_0\0030.tok.json
Processing file: ./distribution/real/part_0\0040.json
Tokenizing ./distribution/real/part_0\0040.json and saving to ./distribution/tokenized/part_0\0040.tok.json
Processing file: ./distribution/real/part_0\0050.json
Tokenizing ./distribution/real/part_0\0050.json and saving to ./distribution/tokenized/part_0\0050.tok.json
Processing file: ./distribut

In [21]:
# Disabled maintenance step, kept as a string literal so it does not run.
# When enabled it deletes every cached *.tok.json file under ./distribution,
# which makes conservative_tokenize re-tokenize those files on the next run.
'''#delete all .tok.json files in the distribution folder
for file in walk_directory("./distribution"):
    if file.endswith(".tok.json"):
        os.remove(file)
        print(f"Removed {file}")'''

'#delete all .tok.json files in the distribution folder\nfor file in walk_directory("./distribution"):\n    if file.endswith(".tok.json"):\n        os.remove(file)\n        print(f"Removed {file}")'