In [None]:
import openai
import json
import math
from dotenv import load_dotenv
import os
import datetime
# Ensure raw output directory exists
# (categorize_hobby_batch writes one raw model response per batch into it).
raw_output_dir = 'data/raw_output'
os.makedirs(raw_output_dir, exist_ok=True)

# Load environment variables from .env file
load_dotenv()

# Your OpenAI API key
# (os.getenv returns None when OPENAI_API_KEY is not set).
openai.api_key = os.getenv('OPENAI_API_KEY')

# Read the JSON file
with open('data/datadump/hobbiesforapi.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Only the keys of the 'HobbiesCount' object are used: they are the hobby strings to categorize.
hobbies = list(data['HobbiesCount'].keys())

# Combine all categories into one list, including non-hobby categories
# ("ei mainintaa", "työ/ammatti" and "organisaatiot" are described to the model as
# special noise-handling categories in categorize_hobby_batch).
all_categories = [
    "käsityöt", "yksilöurheilu", "kirjallisuus", "ryhmäurheilu", "musiikki", "valokuvaus",
    "kalastus", "metsästys", "vapaaehtoistoiminta", "pelit", "ei mainintaa", "työ/ammatti", "organisaatiot"
]

# Global data structures
# categories: category name -> list of hobbies assigned to it.
# unmatched_hobbies: hobbies that could not be placed in any known category.
categories = {category: [] for category in all_categories}
unmatched_hobbies = []

def load_data():
    """Load the persisted categorization state into the module-level globals.

    Reads ``data/categorized_hobbies.json`` into ``categories`` and
    ``data/unmatched_hobbies.json`` into ``unmatched_hobbies``.

    A missing categories file leaves ``categories`` untouched; a missing
    unmatched file resets ``unmatched_hobbies`` to an empty list.

    When the categories file is loaded, every category name found in it is
    merged into ``all_categories`` (existing order kept, new names appended),
    so categories created in an earlier session are offered to the model
    again instead of being silently forgotten after a restart.
    """
    global categories, unmatched_hobbies, all_categories
    try:
        with open('data/categorized_hobbies.json', 'r', encoding='utf-8') as f:
            categories = json.load(f)
    except FileNotFoundError:
        print("No existing category data found. Initializing with empty categories.")
    else:
        # dict.fromkeys de-duplicates while preserving first-seen order, which
        # keeps the prompt's category list stable between runs.
        all_categories = list(dict.fromkeys([*all_categories, *categories]))

    try:
        with open('data/unmatched_hobbies.json', 'r', encoding='utf-8') as f:
            unmatched_hobbies = json.load(f)
    except FileNotFoundError:
        unmatched_hobbies = []
        print("No unmatched hobbies data found. Starting with an empty list.")

def save_data():
    """Write ``categories`` and ``unmatched_hobbies`` to their JSON files.

    Both files are written as UTF-8, with non-ASCII characters kept as-is
    and a 4-space indent.
    """
    targets = (
        ('data/categorized_hobbies.json', categories),
        ('data/unmatched_hobbies.json', unmatched_hobbies),
    )
    for target_path, payload in targets:
        with open(target_path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=4)
    print("Data saved to files.")

def save_hobby(hobby, category):
    """Store one hobby under the category the model answered, then persist.

    Args:
        hobby: The hobby string to store.
        category: The model's answer for this hobby. Either the name of an
            existing category, or a new-category request of the form
            ``"new category, <name>"`` / ``"uusi kategoria, <name>"``
            (the misspelling ``"uusi kategori, "`` is accepted as well).

    Behaviour:
        * A new-category request creates the category if needed and registers
          it in ``all_categories``. The indicator is matched
          case-insensitively; the category name keeps the case the model used.
        * An answer naming an existing category stores the hobby there.
        * Anything else sends the hobby to ``unmatched_hobbies``.
        * A hobby is never stored twice in the same list, and a hobby that
          lands in a category is removed from ``unmatched_hobbies``.

    ``save_data()`` is called after every hobby so progress survives a crash
    in the middle of a batch.
    """
    global all_categories  # Ensure we are modifying the global variable
    # List of phrases that might indicate a new category
    new_category_indicators = ("new category, ", "uusi kategoria, ", "uusi kategori, ")

    category = category.strip()
    lowered = category.lower()

    # Check if the answer starts with one of the new-category indicators
    new_category = None
    for indicator in new_category_indicators:
        if lowered.startswith(indicator):
            new_category = category[len(indicator):].strip()
            break

    if new_category:
        target = new_category
        if target not in categories:
            categories[target] = []
        if target not in all_categories:
            all_categories.append(target)
    elif category in categories:
        target = category
    else:
        target = None

    if target is None:
        # Unknown category (or an empty new-category name): keep for a later pass.
        if hobby not in unmatched_hobbies:
            unmatched_hobbies.append(hobby)
    else:
        if hobby not in categories[target]:
            categories[target].append(hobby)
        # In-place filter so every reference to the list sees the removal.
        unmatched_hobbies[:] = [h for h in unmatched_hobbies if h != hobby]
    save_data()


def destroy_categories(category_names):
    """Dissolve the given categories and move their hobbies to the unmatched list.

    The state is reloaded from disk first, each named category is removed
    from ``categories`` and ``all_categories``, and the result is saved once
    at the end. Names that are not existing categories are only reported.
    """
    global all_categories  # rebound below, so it must be declared global
    # Work on the persisted state rather than whatever is in memory.
    load_data()

    for doomed in category_names:
        if doomed not in categories:
            print(f"Category '{doomed}' not found.")
            continue
        unmatched_hobbies.extend(categories.pop(doomed))
        all_categories = list(filter(lambda name: name != doomed, all_categories))
        print(f"Category '{doomed}' has been destroyed. Its hobbies have been moved to unmatched hobbies.")

    # One write for the whole operation.
    save_data()



def recategorize_unmatched_hobbies():
    """Send every currently unmatched hobby through the model again.

    The unmatched list is snapshotted before batching. Slicing the live list
    by index is unsafe because processing a batch modifies
    ``unmatched_hobbies``, which would shift or empty the later slices (an
    empty slice crashes ``categorize_hobby_batch``).

    After all batches, ``unmatched_hobbies`` is rebuilt as the snapshot minus
    every hobby that now appears in some category, so no hobby is lost and
    none is listed twice. The result is saved once at the end.
    """
    load_data()
    batch_size = 50
    pending = list(unmatched_hobbies)  # stable snapshot to iterate over
    num_batches = math.ceil(len(pending) / batch_size)

    for i in range(num_batches):
        batch_start = i * batch_size
        hobbies_batch = pending[batch_start:batch_start + batch_size]

        print(f"Recategorizing batch {i + 1}/{num_batches}")
        response, original_hobbies = categorize_hobby_batch(hobbies_batch, from_unmatched=True)
        process_categorized_data(response, original_hobbies)

        print(f"Batch {i + 1} processed")

    # Whatever ended up in a category is no longer unmatched; everything else
    # from the snapshot still is. Hobbies are stored stripped, so compare the
    # stripped form as well.
    categorized = {hobby for members in categories.values() for hobby in members}
    unmatched_hobbies[:] = [
        hobby for hobby in dict.fromkeys(pending)
        if hobby not in categorized and hobby.strip() not in categorized
    ]

    save_data()  # Save data after all batches have been processed


def categorize_hobby_batch(hobbies_batch, from_unmatched=False):
    """Ask the chat model to categorize one batch of hobbies and archive the raw reply.

    Args:
        hobbies_batch: Hobby strings to categorize. They are numbered from 1
            in the prompt so answers can be mapped back by number.
        from_unmatched: When true, an extra paragraph tells the model that the
            hobbies were previously unmatched and should be sorted generously.

    Returns:
        ``(response, hobbies_batch)``: the API response and the same batch
        that was passed in.

    Side effects:
        Prints the full prompt and writes the raw response as JSON into
        ``raw_output_dir``. The file name contains the first hobby of the
        batch with every character other than letters, digits, space, ``-``
        and ``_`` replaced by ``_`` (a ``/`` in a hobby would otherwise point
        into a non-existent directory), plus a day/month/hour/minute stamp.
    """
    introduction = "You are a helpful assistant tasked with categorizing a list of hobbies. Here are the existing categories:\n"
    
    categories_list = "\n".join(all_categories)
    special_categories = """
    Special categories for handling noise in the data:
    - ei mainintaa: Use this category only if the entity is blank, says "not mentioned" or "ei mainintaa" or anything similar.
    - työ/ammatti: Use this category only if the entity clearly describes a job or profession and could not be counted as a hobby.
    - organisaatiot: Use this category only if the entity specifically names an organization. For example Marttayhdistys, or Karjalaisseura.
    """
    unmatched_instruction = """\n
    These hobbies are previously unmatched or from too specific categories, 
    please review carefully and be general in your sorting. 
    Note that new categories might have been created where these entities can fit.
    """

    new_category_instructions = """
    Categorize each hobby into one of the existing categories. 
    If a hobby does not fit into any of these categories, you can create a new category by stating: new category, [category name].
    Try to create general categories and avoid creating too specific categories.
    """
    # process_categorized_data maps each answer back to its hobby through the
    # leading list number, so the number has to be part of the requested format.
    format_instructions = "Format the response as: index. hobby: category"

    indexed_hobbies = [f"{i+1}. {hobby}" for i, hobby in enumerate(hobbies_batch)]
    hobbies_to_categorize = "\n".join(indexed_hobbies)

    full_prompt = (introduction + categories_list + special_categories + 
                   (unmatched_instruction if from_unmatched else "") + 
                   new_category_instructions + format_instructions + 
                   "\nCategorize the following hobbies:\n" + hobbies_to_categorize)

    print("Full prompt being sent to the API:")
    print(full_prompt)

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": full_prompt}]
    )

    # Get the current time
    now = datetime.datetime.now()
    date_time = now.strftime("%d%m%H%M")  # Format as day, month, hour, minute

    # The file is labelled with the batch's first hobby (not a numeric index).
    # Hobby text is free-form, so make it safe to use inside a file name.
    first_hobby = str(hobbies_batch[0]) if hobbies_batch else "empty"
    batch_label = "".join(
        ch if ch.isalnum() or ch in " -_" else "_" for ch in first_hobby
    ).strip()[:80] or "batch"

    # Save the raw response to a file in the raw_output directory
    raw_output_path = os.path.join(raw_output_dir, f'raw_output_batch_{batch_label}_{date_time}.json')
    with open(raw_output_path, 'w', encoding='utf-8') as f:
        json.dump(response, f, ensure_ascii=False, indent=4)
    
    return response, hobbies_batch


def process_categorized_data(response, original_hobbies):
    """Parse the model's answer lines and store each hobby via ``save_hobby``.

    Args:
        response: API response; the text of every entry in
            ``response['choices']`` is read from ``['message']['content']``.
        original_hobbies: The batch that was sent, in prompt order. Answer
            lines are mapped back to it through their 1-based number, so the
            hobby text echoed by the model is never trusted.

    Expected line shape is ``<number>. <hobby>: <category>``. The category is
    taken after the LAST ``": "`` so a hobby that itself contains a colon
    does not break the split. Lines that do not have this shape, or whose
    number is out of range, are ignored.

    Hobbies from the batch that received no usable answer are added to
    ``unmatched_hobbies``. Finally the unmatched list is de-duplicated and
    purged of every hobby that is present in some category, and the state is
    saved.
    """
    answered = set()
    for choice in response['choices']:
        categorized_data = choice['message']['content']
        for line in categorized_data.split("\n"):
            line = line.strip()
            if not line:
                continue
            index_part, dot, rest = line.partition(". ")
            _, colon, category = rest.rpartition(": ")
            if not dot or not colon:
                continue  # free text, not an answer line
            try:
                position = int(index_part) - 1
            except ValueError:
                continue  # the text before ". " is not a list number
            if not 0 <= position < len(original_hobbies):
                continue  # the model invented a number outside the batch
            answered.add(position)
            save_hobby(original_hobbies[position].strip(), category.strip())

    # Hobbies the model skipped must not silently disappear.
    skipped = [
        hobby.strip()
        for position, hobby in enumerate(original_hobbies)
        if position not in answered
    ]

    # Rebuild the unmatched list in place: keep what was there, add skipped
    # hobbies, drop duplicates and anything that now lives in a category.
    categorized = {hobby for members in categories.values() for hobby in members}
    unmatched_hobbies[:] = [
        hobby for hobby in dict.fromkeys([*unmatched_hobbies, *skipped])
        if hobby not in categorized
    ]
    save_data()

def run_categorization():
    """Categorize every entry of ``hobbies`` in batches of 50.

    State is loaded from disk first; each batch is sent to the model and its
    answer is processed before the next batch starts.
    """
    load_data()  # start from the persisted state
    batch_size = 50
    num_batches = math.ceil(len(hobbies) / batch_size)
    for batch_no, offset in enumerate(range(0, len(hobbies), batch_size), start=1):
        chunk = hobbies[offset:offset + batch_size]
        print(f"Processing batch {batch_no}/{num_batches}")
        response, original_hobbies = categorize_hobby_batch(chunk, from_unmatched=False)
        process_categorized_data(response, original_hobbies)
        print(f"Batch {batch_no} processed")

# Runs as soon as the cell is executed, so the globals reflect the saved files
# before any of the functions above are called.
load_data()  # Load all data at startup

In [None]:
import gc
import torch

# Drop the notebook-level references to the model objects.
# globals().pop(..., None) instead of `del` keeps this cell re-runnable: it no
# longer raises NameError (and skips the rest of the cleanup) when the model
# cell has not been executed yet or the cleanup already ran.
for _name in ("model", "tokenizer", "generation_pipeline"):
    globals().pop(_name, None)
del _name

# Collect garbage
gc.collect()

# Empty the cache (only meaningful when a CUDA device is present)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
import json
import math
from dotenv import load_dotenv
import os
import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Ensure raw output directory exists
raw_output_dir = 'data/raw_output'
os.makedirs(raw_output_dir, exist_ok=True)

# Load environment variables from .env file
load_dotenv()


model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
print(model_id)
# Directory passed as cache_dir to both from_pretrained calls below.
shared_dir = "/scratch/project_462000642/joonatan/shared_models"
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=shared_dir)

# Load the model with Flash Attention 2
# `attn_implementation="flash_attention_2"` replaces the deprecated
# `use_flash_attention_2=True` flag.
# NOTE(review): the `torch.device("cuda")` context and `device_map="auto"` both
# influence placement — confirm both are needed.
with torch.device("cuda"):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        cache_dir=shared_dir,
        torch_dtype=torch.float16,
        attn_implementation="flash_attention_2",
        device_map="auto"
    )
# Initialize the text generation pipeline
generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

In [None]:
def generate_text(full_prompt):
    """Run ``full_prompt`` through ``generation_pipeline`` as a chat turn.

    The prompt is sent as the user message after a fixed system message.
    Returns the last element of the pipeline's ``generated_text`` for the
    first output.
    """
    # Settings shared by all candidate generation setups.
    shared_settings = {"num_beams": 1, "temperature": 0.3, "top_k": 50, "max_new_tokens": 1500}
    # Alternatives kept side by side; only the first one is active.
    candidate_configs = (
        {**shared_settings, "do_sample": False, "top_p": 0.1},
        {**shared_settings, "do_sample": True, "top_p": 0.1},
        {**shared_settings, "do_sample": True, "top_p": 0.8},
    )
    active = candidate_configs[0]

    conversation = [
        {"role": "system", "content": "You are an assistant who categorizes hobbies."},
        {"role": "user", "content": full_prompt},
    ]

    # Every entry of the active config is forwarded as a keyword argument.
    outputs = generation_pipeline(conversation, **active)

    # NOTE(review): presumably the last chat entry is the assistant's reply —
    # adjust if the pipeline's output format differs.
    return outputs[0]["generated_text"][-1]

# Example usage
# Smoke test of the pipeline-backed generate_text: prints whatever it returns.
full_prompt = "I enjoy swimming, reading, and playing chess. How would you categorize these hobbies?"
assistant_reply = generate_text(full_prompt)
print(assistant_reply)

In [None]:
def generate_text(full_prompt):
    """Offline stand-in for the model-backed ``generate_text``.

    Ignores ``full_prompt`` and always returns the same canned answer wrapped
    as ``{"content": <text>}``. The text has 22 numbered answer lines; some
    carry extra words before the "uusi kategoria," marker, and one names a
    category ("puutarha") without the marker.
    """
    # Predefined response string
    # (kept verbatim, including the indentation and trailing spaces inside the literal)
    predefined_response = """1. käsityöt: käsityöt 
    2. kalastus: kalastus 
    3. kirjallisuus: kirjallisuus 
    4. puutarhanhoito: uusi kategoria, puutarha 
    5. lukeminen: kirjallisuus 
    6. metsästys: metsästys 
    7. hiihto: yksilöurheilu 
    8. ulkoilu: uusi kategoria, luonnon havainnointi 
    9. urheilu: ryhmäurheilu 
    10. käsitöitä: käsityöt 
    11. retkeily: uusi kategoria, luonnon havainnointi 
    12. musiikki: musiikki 
    13. voimistelu: yksilöurheilu 
    14. matkailu: uusi kategoria, matkailu 
    15. kuorolaulu: musiikki 
    16. kodinhoito: tässä tarvitaan uusi uusi kategoria, kodinhoito 
    17. käsitöiden tekeminen: käsityöt 
    18. laulu: musiikki 
    19. uinti: yksilöurheilu 
    20. autoilu: uusi kategoria, autoilu 
    21. karjanhoito: tämä ei sovi minnekkään joten uusi kategoria, karjanhoito 
    22. puutarhatyöt: puutarha 
    """
    return {"content": predefined_response}

# Example usage
# The stub returns a dict, so only its "content" text is printed here.
full_prompt = "I enjoy swimming, reading, and playing chess. How would you categorize these hobbies?"
assistant_reply = generate_text(full_prompt)
print(assistant_reply["content"])

In [None]:
import json
import math
from dotenv import load_dotenv
import os
import datetime

# Ensure raw output directory exists
# (this cell works on the files under data/tests).
raw_output_dir = 'data/tests/raw_output'
os.makedirs(raw_output_dir, exist_ok=True)

# Load environment variables from .env file
load_dotenv()

# Read the JSON file
with open('data/tests/hobbiesforapi.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# The keys of the 'HobbiesCount' object are the hobby strings to categorize.
hobbies = list(data['HobbiesCount'].keys())

# Global variables
# all_categories: names offered to the model in the prompt.
# categories: category name -> list of hobbies; filled by load_data().
# unmatched_hobbies: hobbies without a usable category.
all_categories = [
    "käsityöt", "yksilöurheilu", "kirjallisuus", "ryhmäurheilu", "musiikki", "valokuvaus",
    "kalastus", "metsästys", "vapaaehtoistoiminta", "pelit", "ei mainintaa", "työ/ammatti", "organisaatiot"
]
categories = {}
unmatched_hobbies = []

def load_data():
    """Load categories and unmatched hobbies from the test JSON files.

    Updates the module-level ``categories``, ``unmatched_hobbies`` and
    ``all_categories`` and prints a progress report.

    * Categories file present: ``categories`` is replaced by its content and
      the loaded names are merged into ``all_categories``. Existing entries
      keep their position and new names are appended in file order, so the
      category list shown to the model is the same on every run.
    * Categories file missing or not valid JSON: ``categories`` is reset to
      one empty list per name in ``all_categories``.
    * Unmatched file missing or not valid JSON: ``unmatched_hobbies`` is
      reset to an empty list.
    """
    global categories, unmatched_hobbies, all_categories

    print("Starting data loading process...")

    # Load categories
    try:
        with open('data/tests/categorized_hobbies_testing.json', 'r', encoding='utf-8') as f:
            categories = json.load(f)
        print("Successfully loaded categorized hobbies from file.")
        print(f"Number of categories: {len(categories)}")

        # Merge loaded names in; dict.fromkeys de-duplicates but, unlike a
        # set, keeps first-seen order.
        all_categories = list(dict.fromkeys([*all_categories, *categories]))

        for category, members in categories.items():
            print(f"  - {category}: {len(members)} hobbies")
        print(f"Total categorized hobbies: {sum(len(members) for members in categories.values())}")
    except FileNotFoundError:
        print("No existing category data found. Initializing with empty categories.")
        categories = {category: [] for category in all_categories}
        print(f"Initialized {len(categories)} empty categories.")
    except json.JSONDecodeError:
        print("Error decoding categorized hobbies JSON. File may be corrupted.")
        categories = {category: [] for category in all_categories}

    # Load unmatched hobbies
    try:
        with open('data/tests/unmatched_hobbies_testing.json', 'r', encoding='utf-8') as f:
            unmatched_hobbies = json.load(f)
        print("Successfully loaded unmatched hobbies from file.")
        print(f"Number of unmatched hobbies: {len(unmatched_hobbies)}")
        if unmatched_hobbies:
            print("First 5 unmatched hobbies:")
            for hobby in unmatched_hobbies[:5]:
                print(f"  - {hobby}")
            if len(unmatched_hobbies) > 5:
                print(f"  ... and {len(unmatched_hobbies) - 5} more")
    except FileNotFoundError:
        unmatched_hobbies = []
        print("No unmatched hobbies data found. Starting with an empty list.")
    except json.JSONDecodeError:
        print("Error decoding unmatched hobbies JSON. File may be corrupted.")
        unmatched_hobbies = []

    print("\nData loading summary:")
    print(f"Total categories: {len(all_categories)}")
    print(f"Categories: {', '.join(all_categories)}")
    print(f"Total categorized hobbies: {sum(len(members) for members in categories.values())}")
    print(f"Total unmatched hobbies: {len(unmatched_hobbies)}")
    print("Data loading process complete.")

def save_data():
    """Write ``categories`` and ``unmatched_hobbies`` to the test JSON files.

    Both files are UTF-8, keep non-ASCII characters unescaped and use a
    4-space indent.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open('data/tests/categorized_hobbies_testing.json', 'w', encoding='utf-8') as categories_file:
        json.dump(categories, categories_file, **dump_options)
    with open('data/tests/unmatched_hobbies_testing.json', 'w', encoding='utf-8') as unmatched_file:
        json.dump(unmatched_hobbies, unmatched_file, **dump_options)
    print("Data saved to files.")

def save_hobby(hobby, category_info):
    """Record ``hobby`` under the category described by ``category_info``.

    Args:
        hobby: The hobby string to store.
        category_info: The model's answer for this hobby: either a category
            name, or text containing ``"uusi kategoria,"`` / ``"new category,"``
            followed by the name of a category to create. Extra words before
            the marker are tolerated; the new name is lower-cased.

    Returns:
        The category name the hobby was stored under, or ``None`` when the
        hobby was sent to ``unmatched_hobbies`` instead (unknown category, or
        a new-category request with an empty name).

    Guarantees:
        * An existing category is matched exactly first, then ignoring case,
          so ``"Musiikki"`` still lands in ``"musiikki"``.
        * A hobby is never appended twice to the same category or to the
          unmatched list.
        * A categorized hobby is removed from the unmatched list.
        * No category with an empty name is ever created.

    This function only changes the in-memory state; it does not save.
    """
    # List of phrases that indicate a new category
    new_category_indicators = ("uusi kategoria,", "new category,")

    # Remove any leading/trailing whitespace
    category_info = category_info.strip()
    lowered = category_info.lower()

    category = None
    for indicator in new_category_indicators:
        if indicator in lowered:
            # Everything after the first marker is the new category's name.
            category = lowered.split(indicator, 1)[1].strip() or None
            if category is not None and category not in categories:
                categories[category] = []
            break
    else:
        # No marker: the answer must name a category that already exists.
        if category_info in categories:
            category = category_info
        else:
            by_folded_name = {name.casefold(): name for name in categories}
            category = by_folded_name.get(category_info.casefold())

    if category is None:
        if hobby not in unmatched_hobbies:
            unmatched_hobbies.append(hobby)
        return None  # Return None to indicate the hobby wasn't categorized

    # Keep the prompt's category list in sync with the categories in use.
    if category not in all_categories:
        all_categories.append(category)

    # Add the hobby to the appropriate category
    if hobby not in categories[category]:
        categories[category].append(hobby)

    # Remove the hobby from unmatched_hobbies if it's there (in place, all copies)
    unmatched_hobbies[:] = [h for h in unmatched_hobbies if h != hobby]

    return category  # Return the category name

def destroy_categories(category_names):
    """Remove the named categories, returning their hobbies to the unmatched list.

    Reloads the saved state, drops each existing category from ``categories``
    and ``all_categories`` (its hobbies are appended to ``unmatched_hobbies``),
    reports unknown names, and saves once when done.
    """
    global all_categories  # reassigned inside the loop
    # Start from what is on disk.
    load_data()

    for target in category_names:
        if target not in categories:
            print(f"Category '{target}' not found.")
            continue
        orphaned = categories.pop(target)
        unmatched_hobbies.extend(orphaned)
        all_categories = [name for name in all_categories if name != target]
        print(f"Category '{target}' has been destroyed. Its hobbies have been moved to unmatched hobbies.")

    # Persist after every requested category has been handled.
    save_data()



def categorize_hobby_batch(hobbies_batch, from_unmatched=False):
    """Build the Finnish categorization prompt for a batch and get the model's answer.

    Args:
        hobbies_batch: Hobby strings to categorize; numbered from 1 in the prompt.
        from_unmatched: When true, a paragraph is added telling the model that
            these hobbies were previously uncategorized and should be sorted
            generously.

    Returns:
        ``(response_text, hobbies_batch)``: whatever ``generate_text`` returned
        and the batch that was passed in.

    Side effects:
        Writes the raw answer as JSON into ``raw_output_dir``. The file name
        contains the first hobby of the batch with every character other than
        letters, digits, space, ``-`` and ``_`` replaced by ``_`` (a ``/`` in
        a hobby would otherwise point into a non-existent directory), plus a
        day/month/hour/minute stamp. An empty batch is labelled ``empty``
        instead of raising ``IndexError``.
    """
    introduction = "Olet avulias assistentti, jonka tehtävänä on luokitella lista harrastuksia annettuihin kategorioihin."

    # List of hobbies to categorize
    indexed_hobbies = [f"{i+1}. {hobby}" for i, hobby in enumerate(hobbies_batch)]
    hobbies_to_categorize = "Luokiteltavat harrastukset:\n" + "\n".join(indexed_hobbies)

    # List of categories
    categories_list = "Käytettävissä olevat kategoriat:\n" + "\n".join(all_categories)

    special_categories = """
    Erityiskategoriat datan kohinan käsittelyyn:
    - ei mainintaa: Käytä tätä kategoriaa vain, jos kohde on tyhjä, siinä lukee "ei mainintaa" tai vastaavaa.
    - työ/ammatti: Käytä tätä kategoriaa vain, jos kohde selvästi kuvaa työtä tai ammattia eikä sitä voida laskea harrastukseksi.
    - organisaatiot: Käytä tätä kategoriaa vain, jos kohde nimeää erityisesti jonkin organisaation. Esimerkiksi Marttayhdistys tai Karjalaisseura.
    """

    format_instructions = """
    TÄRKEÄÄ: Noudata tarkasti seuraavaa muotoilua vastauksessasi:
    indeksi. harrastus: kategoria

    Jos luot uuden kategorian, käytä muotoa:
    indeksi. harrastus: uusi kategoria, kategorian_nimi

    Jokainen vastaus TÄYTYY olla omalla rivillään.
    ÄLÄ lisää selityksiä tai ylimääräistä tekstiä vastauksiin.
    Käytä vain annettuja kategorioita tai luo uusi kategoria tarvittaessa.

    Esimerkkejä oikeasta muotoilusta:
    1. uiminen: yksilöurheilu
    2. lintu bongaus: uusi kategoria, luonnon havainnointi
    3. ompelu: käsityöt
    """

    new_category_instructions = """
    Jos harrastus ei sovi mihinkään olemassa olevaan kategoriaan, voit luoda uuden kategorian seuraavien ohjeiden mukaisesti:
    1. Käytä tarkkaa ilmausta "uusi kategoria, " (pilkku ja välilyönti mukaan lukien) ja sen jälkeen uuden kategorian nimi.
    2. Uuden kategorian nimen tulee olla yleisluontoinen eikä liian tarkka.
    3. Luotuasi uuden kategorian, käytä sitä nykyiselle harrastukselle ja kaikille seuraaville harrastuksille, jotka sopivat siihen.
    4. Älä luo alakategorioita tai käytä kaksoispisteitä uusien kategorioiden nimissä.
    """

    unmatched_instruction = """\n
    Nämä harrastukset ovat aiemmin luokittelemattomia tai liian tarkasti määritellyistä kategorioista, 
    tarkastele ne huolellisesti ja ole yleisluontoinen lajittelussasi. 
    Huomaa, että uusia kategorioita on saatettu luoda, joihin nämä kohteet voivat sopia.
    """ if from_unmatched else ""

    full_prompt = (
        introduction + "\n\n" +
        hobbies_to_categorize + "\n\n" +
        special_categories + "\n\n" +
        categories_list + "\n\n" +
        new_category_instructions + "\n\n" +
        unmatched_instruction + "\n\n" +
        format_instructions + "\n\n" +
        "Luokittele nyt annetut harrastukset:"
    )

    response_text = generate_text(full_prompt)

    # Get the current time
    now = datetime.datetime.now()
    date_time = now.strftime("%d%m%H%M")  # Format as day, month, hour, minute

    # The file is labelled with the batch's first hobby (not a numeric index).
    # Hobby text is free-form, so make it safe to use inside a file name.
    first_hobby = str(hobbies_batch[0]) if hobbies_batch else "empty"
    batch_label = "".join(
        ch if ch.isalnum() or ch in " -_" else "_" for ch in first_hobby
    ).strip()[:80] or "batch"

    # Save the raw response to a file in the raw_output directory
    raw_output_path = os.path.join(raw_output_dir, f'raw_output_batch_{batch_label}_{date_time}.json')
    with open(raw_output_path, 'w', encoding='utf-8') as f:
        json.dump(response_text, f, ensure_ascii=False, indent=4)
    
    return response_text, hobbies_batch

def process_categorized_data(response, original_hobbies):
    """Parse the model's answer and store every hobby of the batch.

    Args:
        response: Mapping whose ``'content'`` entry holds the answer text,
            one ``<number>. <hobby>: <category>`` line per hobby.
        original_hobbies: The batch that was sent, in prompt order. Lines are
            mapped back through their 1-based number; the hobby text echoed
            by the model is not used.

    Returns:
        List of hobbies that ``save_hobby`` placed into a category.

    Details:
        * The category is taken after the LAST ``": "`` of the line, so a
          hobby that itself contains a colon does not corrupt the category.
        * Malformed lines and out-of-range numbers are reported and skipped.
        * Hobbies of the batch that got no usable answer line are added to
          ``unmatched_hobbies`` (unless already there or already in a
          category) instead of silently disappearing.
        * State is saved once at the end.
    """
    newly_categorized = []
    answered = set()  # 0-based positions for which save_hobby was called
    categorized_data = response.get('content', '')

    for line in categorized_data.split("\n"):
        line = line.strip()
        if not line:
            continue

        try:
            if ": " in line and ". " in line:
                index_part, category_info = line.split(". ", 1)
                index = int(index_part) - 1

                if 0 <= index < len(original_hobbies):
                    original_hobby = original_hobbies[index].strip()
                    category = save_hobby(original_hobby, category_info.rsplit(": ", 1)[1])
                    answered.add(index)
                    if category is not None:
                        newly_categorized.append(original_hobby)
                else:
                    print(f"Index out of range: {index}")
            else:
                print(f"Skipping line due to incorrect format: {line}")
        except (ValueError, IndexError) as e:
            print(f"Error processing line: {line}")
            print(f"Exception: {e}")

    # Anything the model skipped (or answered in an unparseable way) is kept
    # for a later recategorization pass.
    skipped = [
        hobby.strip()
        for position, hobby in enumerate(original_hobbies)
        if position not in answered
    ]
    if skipped:
        already_categorized = {h for members in categories.values() for h in members}
        for hobby in skipped:
            if hobby not in already_categorized and hobby not in unmatched_hobbies:
                unmatched_hobbies.append(hobby)
        print(f"{len(skipped)} hobbies received no usable answer.")

    save_data()  # Save categorized data after processing the batch
    return newly_categorized


def recategorize_unmatched_hobbies():
    """Run every currently unmatched hobby through the model again, in batches of 50.

    A copy of the unmatched list is batched so that removals made while
    processing do not shift the batches. After each batch the hobbies that
    were categorized are removed from ``unmatched_hobbies`` and the list is
    saved. A failing batch is reported and skipped so the remaining batches
    still run.
    """
    load_data()
    print(f"Loaded {len(unmatched_hobbies)} unmatched hobbies")

    # Create a copy of unmatched hobbies to process
    hobbies_to_process = unmatched_hobbies.copy()

    batch_size = 50
    num_batches = math.ceil(len(hobbies_to_process) / batch_size)

    for i in range(num_batches):
        batch_start = i * batch_size
        # i < num_batches guarantees batch_start < len(...), so the slice is never empty.
        hobbies_batch = hobbies_to_process[batch_start:batch_start + batch_size]

        print(f"Processing batch {i+1}/{num_batches}")

        try:
            response, original_hobbies = categorize_hobby_batch(hobbies_batch, from_unmatched=True)
            newly_categorized = process_categorized_data(response, original_hobbies)

            # Remove categorized hobbies from the unmatched_hobbies list
            # (set lookup instead of scanning the list for every hobby).
            categorized_now = set(newly_categorized)
            unmatched_hobbies[:] = [h for h in unmatched_hobbies if h not in categorized_now]

            # Save updated unmatched_hobbies to JSON after each batch
            save_unmatched_hobbies()

            print(f"Batch {i+1} processed. Categorized {len(newly_categorized)} hobbies.")
        except Exception as e:
            # Deliberately broad: one bad batch must not abort the whole run.
            print(f"Error processing batch {i+1}: {str(e)}")

        print(f"Remaining unmatched hobbies: {len(unmatched_hobbies)}")

    print("Recategorization process complete")
    print(f"Final unmatched hobbies count: {len(unmatched_hobbies)}")

def save_unmatched_hobbies():
    """Write only ``unmatched_hobbies`` to its test JSON file (UTF-8, 4-space indent)."""
    destination = 'data/tests/unmatched_hobbies_testing.json'
    with open(destination, mode='w', encoding='utf-8') as out_file:
        json.dump(unmatched_hobbies, out_file, ensure_ascii=False, indent=4)
    print("Unmatched hobbies saved to file.")

def run_categorization():
    """Categorize every entry of ``hobbies`` in batches of 300.

    The saved state is loaded first; each batch is sent to the model and its
    answer processed before the next batch starts.
    """
    load_data()  # begin from the persisted state
    batch_size = 300
    num_batches = math.ceil(len(hobbies) / batch_size)
    for batch_no, offset in enumerate(range(0, len(hobbies), batch_size), start=1):
        chunk = hobbies[offset:offset + batch_size]
        print(f"Processing batch {batch_no}/{num_batches}")
        response, original_hobbies = categorize_hobby_batch(chunk, from_unmatched=False)
        process_categorized_data(response, original_hobbies)
        print(f"Batch {batch_no} processed")

# Populate categories / unmatched_hobbies from the test files when the cell runs.
load_data()

In [None]:
# Categorize every hobby in `hobbies`, batch by batch.
run_categorization()

In [None]:
# Second pass: send the hobbies that are still unmatched through the model again.
recategorize_unmatched_hobbies()

In [None]:
# Dissolve these two categories; their hobbies are moved to the unmatched list.
destroy_categories(['yleisurheilu','marttatyö'])

In [None]:
#TEST

import json

def load_json(file_path):
    """Return the parsed content of the UTF-8 JSON file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

def save_json(data, file_path):
    """Write ``data`` as JSON to ``file_path`` (UTF-8, unescaped non-ASCII, 4-space indent)."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)

def get_all_categorized_hobbies(categorized_hobbies):
    """Return the set of all hobbies found in any category of the mapping."""
    collected = set()
    for members in categorized_hobbies.values():
        collected.update(members)
    return collected

def main():
    """Write the hobbies that appear in no category to a JSON list.

    Reads the hobby names (keys of ``HobbiesCount``) from
    ``data/combinedHowManyHobbies2.json`` and the categorized hobbies from
    ``data/categorized_hobbies_flash.json``, and saves the difference, sorted
    so the output is identical between runs, to
    ``data/unmatched_hobbies_flash.json``.
    """
    output_path = 'data/unmatched_hobbies_flash.json'

    # Load the original hobbies
    data = load_json('data/combinedHowManyHobbies2.json')
    original_hobbies = set(data['HobbiesCount'])

    # Load the categorized hobbies
    categorized_hobbies = load_json('data/categorized_hobbies_flash.json')
    all_categorized_hobbies = get_all_categorized_hobbies(categorized_hobbies)

    # Find unmatched hobbies (set difference has no defined order, hence sorted)
    unmatched_hobbies = sorted(original_hobbies - all_categorized_hobbies)

    # Save unmatched hobbies directly to JSON root
    save_json(unmatched_hobbies, output_path)

    # Report the path that was actually written.
    print(f"Found {len(unmatched_hobbies)} unmatched hobbies. Saved to {output_path}")

if __name__ == "__main__":
    main()

In [None]:
# Categorization result whose category names should be listed
file_path = 'data/categorized_hobbies_blind.json'

# Parse the file; its top-level keys are the category names
with open(file_path, mode='r', encoding='utf-8') as file:
    data = json.load(file)

# One category name per line (iterating a dict yields its keys)
for category in data:
    print(category)


In [None]:
import json
from collections import Counter
from itertools import combinations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

def load_json(file_path):
    """Open ``file_path`` as UTF-8 text and return the decoded JSON value."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        content = json.load(source)
    return content

def analyze_categories(files):
    """Compare which category names occur in which categorization file.

    Builds a categories x files 0/1 presence table, saves it as a heatmap
    ('category_heatmap.png'), prints per-category totals, the categories
    present in all files and in exactly one file, and saves a histogram of
    the presence percentage ('category_distribution.png').

    Args:
        files: Paths of JSON files whose top-level keys are category names.
    """
    # Load every file once; only the top-level keys matter here.
    file_data = {}
    seen_names = set()
    for path in files:
        contents = load_json(path)
        file_data[path] = contents
        seen_names |= set(contents.keys())

    # Sorted so the row order does not depend on set iteration order.
    row_labels = sorted(seen_names)

    df = pd.DataFrame(index=row_labels, columns=files)
    for path in files:
        present = df.index.isin(file_data[path].keys())
        df[path] = present.astype(int)

    df['total'] = df.sum(axis=1)
    df = df.sort_values('total', ascending=False)

    # Heatmap of the presence table (file columns only).
    plt.figure(figsize=(15, 10))
    sns.heatmap(df[files], cmap='YlOrRd', cbar_kws={'label': 'Present in file'})
    plt.title('Category Presence Across Files')
    plt.tight_layout()
    plt.savefig('category_heatmap.png')
    plt.close()

    print("Category presence across files saved as 'category_heatmap.png'")
    print("\nCategory frequency:")
    print(df['total'].to_string())

    # Share of files each category appears in.
    df['percentage'] = (df['total'] / len(files)) * 100

    in_every_file = df[df['percentage'] == 100].index.tolist()
    print("\nCategories present in all files:")
    print(in_every_file)

    in_single_file = df[df['total'] == 1].index.tolist()
    print("\nCategories present in only one file:")
    print(in_single_file)

    # Histogram of the presence percentages.
    plt.figure(figsize=(15, 10))
    df['percentage'].hist(bins=len(files))
    plt.title('Distribution of Category Presence Across Files')
    plt.xlabel('Percentage of Files')
    plt.ylabel('Number of Categories')
    plt.savefig('category_distribution.png')
    plt.close()


def analyze_entity_cooccurrence(files):
    """Compare which entities (hobbies) each categorization file contains.

    Builds a files x entities count matrix, clusters the entities (KMeans),
    plots a 2-D t-SNE embedding coloured by cluster ('entity_clusters.png'),
    prints the entities whose counts differ between each pair of files, the
    20 most common entities, and saves a histogram of entity frequencies
    ('entity_frequency_distribution.png').

    Entities are counted as whole, lower-cased strings. A word tokenizer
    cannot be used for this: it splits multi-word entities such as
    "käsitöiden tekeminen" into separate words, so those entities would
    never be matched and their column would stay at zero.

    Args:
        files: Paths of JSON files mapping category name -> list of entities.
    """
    # One pass per file: count every entity exactly as listed (case-folded).
    per_file_counts = []
    for file in files:
        data = load_json(file)
        per_file_counts.append(
            Counter(entity.lower() for entities in data.values() for entity in entities)
        )

    # Sorted for a stable column order.
    feature_names = sorted(set().union(*per_file_counts))
    if not feature_names:
        print("No entities found in the given files; nothing to analyze.")
        return

    entity_matrix = pd.DataFrame(
        [[counts[name] for name in feature_names] for counts in per_file_counts],
        columns=feature_names,
        index=files,
    )

    n_entities = len(feature_names)
    if n_entities >= 2:
        # Clustering (each entity is described by its per-file counts)
        kmeans = KMeans(n_clusters=min(10, n_entities), random_state=42)
        clusters = kmeans.fit_predict(entity_matrix.T)

        # Dimensionality reduction for visualization.
        # t-SNE requires perplexity < number of samples; 30 is its default.
        tsne = TSNE(n_components=2, random_state=42, perplexity=min(30.0, n_entities - 1))
        entity_2d = tsne.fit_transform(entity_matrix.T)

        plt.figure(figsize=(15, 10))
        scatter = plt.scatter(entity_2d[:, 0], entity_2d[:, 1], c=clusters, cmap='viridis', alpha=0.7)
        plt.colorbar(scatter, label='Cluster')
        plt.title('Entity Clustering')
        plt.tight_layout()
        plt.savefig('entity_clusters.png')
        plt.close()

        print("Entity clustering visualization saved as 'entity_clusters.png'")
    else:
        print("Fewer than two distinct entities; skipping entity clustering.")

    # Entities whose counts differ between each pair of files
    cluster_diffs = {}
    for i, file1 in enumerate(files):
        for j, file2 in enumerate(files[i+1:], start=i+1):
            diff_entities = set(entity_matrix.columns[entity_matrix.iloc[i] != entity_matrix.iloc[j]])
            if diff_entities:
                cluster_diffs[(file1, file2)] = diff_entities

    print("\nDiffering entities between files:")
    for (file1, file2), diff_entities in cluster_diffs.items():
        print(f"\n{file1} vs {file2}:")
        print(", ".join(sorted(diff_entities)[:20]))  # Print first 20 differing entities

    # Additional analysis: Most common entities across all files
    entity_sums = entity_matrix.sum()
    print("\nTop 20 most common entities across all files:")
    print(entity_sums.nlargest(20).to_string())

    # Visualize entity frequency distribution
    plt.figure(figsize=(15, 10))
    entity_sums.hist(bins=50)
    plt.title('Distribution of Entity Frequency Across Files')
    plt.xlabel('Frequency')
    plt.ylabel('Number of Entities')
    plt.savefig('entity_frequency_distribution.png')
    plt.close()

    print("Entity frequency distribution visualization saved as 'entity_frequency_distribution.png'")

# Entry point: run both analyses over the same four categorization result files.
if __name__ == "__main__":
    # Each file is loaded with load_json and read as category -> list of entities.
    files = [
        'data/categorized_hobbies_blind_completed1.json',
        'data/categorized_hobbies_finnish_completed1.json',
        'data/categorized_hobbies_completed1.json',
        'data/categorized_hobbies_flash_1_completed.json'
    ]
    
    print("Analyzing categories...")
    analyze_categories(files)
    
    print("\nAnalyzing entity co-occurrence and clustering...")
    analyze_entity_cooccurrence(files)