# Convert Disinformer CSV to JSON

This notebook converts the `disinformer_full_games_clues.csv` file to JSON format following the predefined schema:

```
GeneralGames (collection)
 └── LanguageCode_GameTopic (document)
      └── games (Array)
            ├── gameName (string) = round1_Answer - round2_Answer
            ├── id (string) = GameTopic_TestRun (test run zero-padded to 3 digits, e.g. Books_001)
            └── rounds (Array)
                  ├── answer (string)
                  ├── choices (Array)
                  ├── informed_clues (Array)
                  ├── misinformed_clues (Array)
                  ├── fake_clues (Array)
                  └── extra_clue (string)
```

## 1. Import Required Libraries

In [1]:
import pandas as pd
import json
from pathlib import Path
from typing import Dict, List
from enum import Enum

## 2. Define Language Codes Enum

In [2]:
class LanguageCode(Enum):
    """Languages the Disinformer clue set can be converted for.

    Each member's value is the code string itself; it names the per-language
    folder and prefixes every topic document key.
    """

    EN = "EN"  # English
    RU = "RU"  # Russian
    FR = "FR"  # French
    CN = "CN"  # Chinese
    AR = "AR"  # Arabic

    def __str__(self) -> str:
        # Render as the bare code ("EN") instead of the default "LanguageCode.EN".
        return self.value

## 3. Configuration - Select Language Code

**Change the `SELECTED_LANG` variable to generate JSON for different languages.**

In [None]:
# ===== CONFIGURATION =====
# Exactly one SELECTED_LANG line should be active; swap the comment marker to
# convert a different language.
SELECTED_LANG = LanguageCode.EN
# SELECTED_LANG = LanguageCode.RU
# SELECTED_LANG = LanguageCode.FR
# SELECTED_LANG = LanguageCode.CN
# SELECTED_LANG = LanguageCode.AR

# Paths: the source CSV and the generated JSON live in the same per-language
# folder under the current working directory.
ROOT_DIR = Path.cwd()
OUTPUT_DIR = ROOT_DIR / str(SELECTED_LANG)
CSV_PATH = OUTPUT_DIR / "disinformer_full_games_clues.csv"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved settings so a wrong working directory is spotted before loading
print(f"Selected Language: {SELECTED_LANG.value}")
print(f"CSV Path: {CSV_PATH}")
print(f"Output Directory: {OUTPUT_DIR}")
print(f"CSV exists: {CSV_PATH.exists()}")

Selected Language: EN
CSV Path: d:\MOSAIC\disinformer-clue-gen\EN\disinformer_full_games_clues.csv
Output Directory: d:\MOSAIC\disinformer-clue-gen\EN
CSV exists: True


## 4. Load CSV Data

In [4]:
# Load the clue sheet of the selected language
df = pd.read_csv(CSV_PATH)

# Orientation printout: size, schema and the keys the conversion groups by
print(f"Total rows: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"\nTopic categories: {sorted(df['topic_category'].unique())}")
print(f"Total test runs: {df['test_run'].nunique()}")
print("\nFirst few rows:")
df.head()

Total rows: 3000
Columns: ['test_run', 'topic_category', 'round', 'answer', 'choices', 'clue_type', 'clue_number', 'clue_text', 'word_count', 'length_ok', 'manual_score / comment']

Topic categories: ['Books', 'Broadcast Media', 'Food', 'Inventions', 'Nature', 'Places', 'Songs', 'Sports', 'Technology', 'Video Games']
Total test runs: 10

First few rows:


Unnamed: 0,test_run,topic_category,round,answer,choices,clue_type,clue_number,clue_text,word_count,length_ok,manual_score / comment
0,1,Books,1,Fantasy,"Fantasy, Sci-Fi, Adventure",informed,1,"This genre often features magic, mythical crea...",16,YES,
1,1,Books,1,Fantasy,"Fantasy, Sci-Fi, Adventure",informed,2,"It typically involves quests, battles against ...",18,YES,
2,1,Books,1,Fantasy,"Fantasy, Sci-Fi, Adventure",informed,3,The narrative often includes characters with s...,15,YES,
3,1,Books,1,Fantasy,"Fantasy, Sci-Fi, Adventure",informed,4,"These narratives often feature heroes, their j...",20,YES,
4,1,Books,1,Fantasy,"Fantasy, Sci-Fi, Adventure",informed,5,Readers are often transported to realms where ...,17,YES,


## 5. Helper Functions

In [5]:
def parse_choices(choices_str: str) -> List[str]:
    """Split a comma-separated choices cell into a list of clean choice strings.

    Args:
        choices_str: Raw cell value, e.g. "Fantasy, Sci-Fi, Adventure".
            Missing values (None / NaN) are accepted.

    Returns:
        The choices in their original order with surrounding whitespace
        removed. A missing or blank cell gives an empty list, and empty
        segments left by stray commas ("A,,B" or "A, B,") are dropped.
    """
    if pd.isna(choices_str):
        return []
    # str() covers a cell that was parsed as a number (a lone numeric choice),
    # which would otherwise have no .split().
    stripped = (segment.strip() for segment in str(choices_str).split(','))
    return [choice for choice in stripped if choice]

def generate_game_id(test_run: int, topic: str) -> str:
    """Build a game's ID: the topic followed by the zero-padded run number.

    Args:
        test_run: Run number of the game. Whole-number floats such as 1.0 are
            accepted, because pandas stores an integer column as float as
            soon as it contains a missing value.
        topic: Topic category name; used verbatim (spaces are kept).

    Returns:
        "<topic>_<NNN>", e.g. "Books_001". Run numbers above 999 are written
        in full.

    Raises:
        ValueError: If test_run is not a whole number (e.g. 1.5 or NaN).
    """
    run_number = int(test_run)
    # Refuse to truncate silently: 1.5 must not quietly become run 1.
    if run_number != test_run:
        raise ValueError(f"test_run must be a whole number, got {test_run!r}")
    return f"{topic}_{run_number:03d}"

def generate_game_name(round1_answer: str, round2_answer: str) -> str:
    """Return the display name "<round 1 answer> - <round 2 answer>" for a game."""
    game_name = f"{round1_answer} - {round2_answer}"
    return game_name

## 6. Process Data and Convert to JSON Structure

In [6]:
def convert_csv_to_json(df: pd.DataFrame, lang_code: LanguageCode) -> Dict:
    """
    Convert the flat clue table into the nested per-topic game structure.

    Rows are grouped by topic category, then by test run (one game each),
    then by round. Within a round the answer and choices are taken from the
    round's first row, and every row contributes its clue according to
    `clue_type` (informed / misinformed / fake / extra, matched
    case-insensitively and ignoring surrounding whitespace).

    Args:
        df: Clue table with the columns `topic_category`, `test_run`,
            `round`, `answer`, `choices`, `clue_type` and `clue_text`.
        lang_code: Language whose code prefixes every topic key.

    Returns:
        A dictionary keyed by "<LanguageCode>_<Topic_with_underscores>". Each
        value is {"games": [...]}, with games ordered by test run and rounds
        ordered by round number. Topics keep their order of first appearance.

    Notes:
        - Rows whose topic, test run or round is missing are left out
          (groupby drops missing keys).
        - Rows with a missing clue type or clue text are skipped.
        - Rows with an unrecognised clue type are ignored.
        - If a round has several `extra` rows, the last one wins.
    """
    # clue_type value -> list-valued key of the round it is appended to
    clue_list_keys = {
        'informed': 'informed_clues',
        'misinformed': 'misinformed_clues',
        'fake': 'fake_clues',
    }
    result = {}

    # sort=False keeps the topics in order of first appearance in the table
    for topic, topic_df in df.groupby('topic_category', sort=False):
        topic_key = f"{lang_code.value}_{topic.replace(' ', '_')}"
        games = []

        # One game per test run; groupby yields the runs in ascending order
        for test_run, game_df in topic_df.groupby('test_run'):
            rounds_data = []

            # Rounds come out in ascending round number
            for _, round_df in game_df.groupby('round'):
                first_row = round_df.iloc[0]
                round_data = {
                    "answer": first_row['answer'],
                    "choices": parse_choices(first_row['choices']),
                    "informed_clues": [],
                    "misinformed_clues": [],
                    "fake_clues": [],
                    "extra_clue": "",
                }

                for raw_type, clue_text in zip(round_df['clue_type'], round_df['clue_text']):
                    # A blank cell is read as NaN: it has no .lower(), and a NaN
                    # clue would be serialised as the non-standard JSON token NaN.
                    if pd.isna(raw_type) or pd.isna(clue_text):
                        continue
                    clue_type = str(raw_type).strip().lower()
                    if clue_type in clue_list_keys:
                        round_data[clue_list_keys[clue_type]].append(clue_text)
                    elif clue_type == 'extra':
                        round_data["extra_clue"] = clue_text

                rounds_data.append(round_data)

            # Name the game after its first two round answers when available
            if len(rounds_data) >= 2:
                game_name = generate_game_name(
                    rounds_data[0]['answer'],
                    rounds_data[1]['answer']
                )
            elif len(rounds_data) == 1:
                game_name = rounds_data[0]['answer']
            else:
                # Only reached when every row of the run lacks a round number
                game_name = f"Game_{test_run}"

            games.append({
                "gameName": game_name,
                "id": generate_game_id(test_run, topic),
                "rounds": rounds_data
            })

        result[topic_key] = {"games": games}

    return result

## 7. Generate JSON Output

In [7]:
# Convert data
json_data = convert_csv_to_json(df, SELECTED_LANG)

# Display summary
print(f"Generated JSON structure for language: {SELECTED_LANG.value}")
print(f"\nTotal topic documents: {len(json_data)}")
print(f"\nTopics and game counts:")
for topic_key, data in json_data.items():
    print(f"  - {topic_key}: {len(data['games'])} games")

Generated JSON structure for language: EN

Total topic documents: 10

Topics and game counts:
  - EN_Books: 10 games
  - EN_Broadcast_Media: 10 games
  - EN_Food: 10 games
  - EN_Inventions: 10 games
  - EN_Nature: 10 games
  - EN_Places: 10 games
  - EN_Songs: 10 games
  - EN_Sports: 10 games
  - EN_Technology: 10 games
  - EN_Video_Games: 10 games


## 8. Preview Sample Game

In [8]:
# Show first game from first topic
first_topic = list(json_data.keys())[0]
first_game = json_data[first_topic]['games'][0]

print(f"Sample game from '{first_topic}':")
print(json.dumps(first_game, indent=2))

Sample game from 'EN_Books':
{
  "gameName": "Fantasy - Harry Potter and the Sorcerer's Stone",
  "id": "Books_001",
  "rounds": [
    {
      "answer": "Fantasy",
      "choices": [
        "Fantasy",
        "Sci-Fi",
        "Adventure"
      ],
      "informed_clues": [
        "This genre often features magic, mythical creatures, and imaginative worlds that defy the laws of reality.",
        "It typically involves quests, battles against evil, and the triumph of good over darkness in a fictional setting.",
        "The narrative often includes characters with special abilities, embarking on perilous journeys and overcoming challenges.",
        "These narratives often feature heroes, their journeys, and the conflicts they face, exploring personal growth and discovery along the way.",
        "Readers are often transported to realms where imagination knows no bounds, and anything is possible to occur.",
        "This type of storytelling frequently draws upon traditional tales, my

## 9. Save JSON Files
This cell writes a single combined JSON file containing all topics (e.g., `EN_all_games.json`) to the selected language's output directory.

In [9]:
# Save combined file
combined_file = OUTPUT_DIR / f"{SELECTED_LANG.value}_all_games.json"
with open(combined_file, 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=2, ensure_ascii=False)
print(f"\nSaved combined file: {combined_file}")


Saved combined file: d:\MOSAIC\disinformer-clue-gen\EN\EN_all_games.json


## 10. Validation & Statistics

In [10]:
def validate_and_analyze(json_data: Dict) -> None:
    """Print summary statistics for the converted game data.

    Reports the overall topic, game, round and clue totals, followed by a
    per-topic breakdown sorted by topic key. The language shown is read from
    the module-level SELECTED_LANG. Nothing is returned, and no structural
    checks are made beyond accessing the expected keys.
    """
    banner = "=" * 60
    print(banner)
    print("VALIDATION & STATISTICS")
    print(banner)

    # Flatten the nesting once so each total below is a single pass
    all_games = [
        game
        for topic_data in json_data.values()
        for game in topic_data['games']
    ]
    all_rounds = [
        round_data
        for game in all_games
        for round_data in game['rounds']
    ]

    total_games = len(all_games)
    total_rounds = len(all_rounds)
    total_informed_clues = sum(len(r['informed_clues']) for r in all_rounds)
    total_misinformed_clues = sum(len(r['misinformed_clues']) for r in all_rounds)
    total_fake_clues = sum(len(r['fake_clues']) for r in all_rounds)
    # extra_clue is a single string per round; count the rounds where it is non-empty
    total_extra_clues = sum(1 for r in all_rounds if r['extra_clue'])
    total_clues = (
        total_informed_clues
        + total_misinformed_clues
        + total_fake_clues
        + total_extra_clues
    )

    print(f"\nLanguage Code: {SELECTED_LANG.value}")
    print(f"Total Topics: {len(json_data)}")
    print(f"Total Games: {total_games}")
    print(f"Total Rounds: {total_rounds}")
    print("\nClue Counts:")
    print(f"  - Informed clues: {total_informed_clues}")
    print(f"  - Misinformed clues: {total_misinformed_clues}")
    print(f"  - Fake clues: {total_fake_clues}")
    print(f"  - Extra clues: {total_extra_clues}")
    print(f"  - Total clues: {total_clues}")

    # Per-topic breakdown, in alphabetical order of topic key
    print("\nDetailed Breakdown by Topic:")
    print("-" * 60)
    for topic_key in sorted(json_data):
        games = json_data[topic_key]['games']
        topic_rounds = sum(len(g['rounds']) for g in games)
        print(f"  {topic_key}:")
        print(f"    Games: {len(games)}, Rounds: {topic_rounds}")

validate_and_analyze(json_data)

VALIDATION & STATISTICS

Language Code: EN
Total Topics: 10
Total Games: 100
Total Rounds: 200

Clue Counts:
  - Informed clues: 1800
  - Misinformed clues: 400
  - Fake clues: 600
  - Extra clues: 200
  - Total clues: 3000

Detailed Breakdown by Topic:
------------------------------------------------------------
  EN_Books:
    Games: 10, Rounds: 20
  EN_Broadcast_Media:
    Games: 10, Rounds: 20
  EN_Food:
    Games: 10, Rounds: 20
  EN_Inventions:
    Games: 10, Rounds: 20
  EN_Nature:
    Games: 10, Rounds: 20
  EN_Places:
    Games: 10, Rounds: 20
  EN_Songs:
    Games: 10, Rounds: 20
  EN_Sports:
    Games: 10, Rounds: 20
  EN_Technology:
    Games: 10, Rounds: 20
  EN_Video_Games:
    Games: 10, Rounds: 20
