# Quinta Elementare Dataset Analysis 📊

This notebook provides a comprehensive analysis of the `quinta-elementare_with_answers.json` dataset to identify missing data and ensure data quality.

## Objectives:
- Load and explore the dataset structure
- Identify missing data patterns
- Analyze data completeness
- Generate data quality report

## 1. Import Required Libraries

In [9]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# --- Plot appearance ---
# Start from matplotlib's stock style, then switch seaborn to the "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Suppress every Python warning from here on.
# NOTE(review): this also hides deprecation warnings from the plotting/data libraries.
warnings.filterwarnings('ignore')

# --- pandas display options ---
# Applied in a single pass; the values are exactly the ones listed here.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

print("📚 Libraries imported successfully!")

📚 Libraries imported successfully!


## 2. Load the Dataset

In [10]:
# Candidate locations for the dataset, in priority order. The first entry is
# the expected location; the remaining ones are fallbacks.
candidate_paths = [
    Path("json-with-correct-answers/quinta-elementare_with_answers.json"),
    Path("quinta-elementare_with_answers.json"),
    Path("results-texts/quinta-elementare_with_answers.json"),
]

# Pick the first candidate that exists on disk (None when there is none).
file_path = next((path for path in candidate_paths if path.exists()), None)

if file_path is None:
    # List every JSON file below the working directory so a misplaced or
    # renamed dataset is easy to spot, then abort the cell.
    print("❌ File not found. Available files:")
    for json_file in Path(".").glob("**/*.json"):
        print(f"  - {json_file}")
    raise FileNotFoundError("Dataset file not found")

print(f"📁 Loading dataset from: {file_path}")

# Load JSON data. Only the read/parse step sits inside `try`, so a failure
# while printing the summary below is not reported as a loading error.
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except Exception as e:
    # Report the problem in the cell output, then re-raise so the notebook
    # stops here instead of continuing without `data`.
    print(f"❌ Error loading dataset: {e}")
    raise
else:
    print("✅ Dataset loaded successfully!")
    print(f"📊 File size: {file_path.stat().st_size / 1024:.1f} KB")

    # Print the structure to understand the format
    print("\n🔍 DATA STRUCTURE:")
    print(f"Type: {type(data)}")
    if isinstance(data, dict):
        print(f"Keys: {list(data.keys())}")
    elif isinstance(data, list):
        print(f"Length: {len(data)}")
        if data:
            first_item = data[0]
            # Show the keys for a list of records, otherwise just the element type.
            first_item_summary = (
                list(first_item.keys()) if isinstance(first_item, dict) else type(first_item)
            )
            print(f"First item keys: {first_item_summary}")

📁 Loading dataset from: json-with-correct-answers/quinta-elementare_with_answers.json
✅ Dataset loaded successfully!
📊 File size: 64.4 KB

🔍 DATA STRUCTURE:
Type: <class 'dict'>
Keys: ['dataset_name', 'language', 'stories']


## 3. Dataset Overview

In [11]:
# Explore dataset structure
print("🔍 DATASET STRUCTURE OVERVIEW")
print("=" * 50)

# `.get(key) or default` is used instead of `.get(key, default)` so that keys
# which are present but hold null/empty values fall back too; otherwise
# `len()` and slicing below would raise TypeError on a null value.
stories = data.get('stories') or []

# Basic information
print(f"📚 Dataset Name: {data.get('dataset_name', 'Not specified')}")
print(f"🌍 Language: {data.get('language', 'Not specified')}")
print(f"📖 Total Stories: {len(stories)}")

# Check if enhanced with AI
if data.get('enhanced_with_ai'):
    print(f"🤖 Enhanced with AI: Yes (Model: {data.get('ai_model', 'Unknown')})")
    print(f"⏰ Enhancement Date: {data.get('enhancement_timestamp', 'Unknown')}")
else:
    print("🤖 Enhanced with AI: No")

print("\n" + "=" * 50)

# Stories overview: the first `preview_limit` stories in detail, plus the last
# story when the list is longer than the preview.
preview_limit = 5
title_width = 50  # titles are truncated to this many characters

if stories:
    print(f"\n📋 STORIES SUMMARY:")
    for i, story in enumerate(stories[:preview_limit], 1):
        title = str(story.get('title') or 'No title')
        print(f"{i:2d}. {title[:title_width]}")
        print(f"    Questions: {len(story.get('questions') or [])}")
        print(f"    Has passage: {'Yes' if story.get('passage') else 'No'}")
        print(f"    Has source: {'Yes' if story.get('source') else 'No'}")

    if len(stories) > preview_limit:
        print("    ...")
        last_story = stories[-1]
        last_title = str(last_story.get('title') or 'No title')
        print(f"{len(stories):2d}. {last_title[:title_width]}")
        print(f"    Questions: {len(last_story.get('questions') or [])}")

# Total questions
total_questions = sum(len(story.get('questions') or []) for story in stories)
print(f"\n🔢 TOTAL QUESTIONS: {total_questions}")

# Questions with answers. A question counts as answered only when
# `correct_answer` holds a value: a key that is present but null or an empty
# string is treated as missing. A value of 0 still counts as an answer.
questions_with_answers = sum(
    1
    for story in stories
    for question in (story.get('questions') or [])
    if question.get('correct_answer') not in (None, '')
)

print(f"✅ Questions with correct answers: {questions_with_answers}")
print(f"❌ Questions missing answers: {total_questions - questions_with_answers}")

# Guard against division by zero when the dataset has no questions at all.
completion_rate = (questions_with_answers / total_questions * 100) if total_questions > 0 else 0
print(f"📊 Completion rate: {completion_rate:.1f}%")

🔍 DATASET STRUCTURE OVERVIEW
📚 Dataset Name: letture_quinta_it_mcq
🌍 Language: it
📖 Total Stories: 17
🤖 Enhanced with AI: No


📋 STORIES SUMMARY:
 1. tre volte bau
    Questions: 10
    Has passage: Yes
    Has source: No
 2. il gatto osiride
    Questions: 10
    Has passage: Yes
    Has source: No
 3. verso i mari del sud
    Questions: 10
    Has passage: Yes
    Has source: No
 4. mi piaci, sai
    Questions: 10
    Has passage: Yes
    Has source: No
 5. la testimone oculare
    Questions: 10
    Has passage: Yes
    Has source: No
    ...
17. l’ultima estate, berlino 1961
    Questions: 10

🔢 TOTAL QUESTIONS: 170
✅ Questions with correct answers: 170
❌ Questions missing answers: 0
📊 Completion rate: 100.0%


## 4. Check for Missing Data

In [12]:
# Create a comprehensive missing data analysis.
# Each key collects the stories/questions affected by one kind of gap.
missing_data_report = {
    'stories_missing_title': [],
    'stories_missing_passage': [],
    'stories_missing_source': [],
    'stories_missing_questions': [],
    'questions_missing_text': [],
    'questions_missing_options': [],
    'questions_missing_correct_answer': [],
    'questions_incomplete_options': []
}

# Option count the check below treats as complete; any other non-zero count
# is recorded under 'questions_incomplete_options'.
EXPECTED_OPTION_COUNT = 3

print("🔍 MISSING DATA ANALYSIS")
print("=" * 50)

# Analyze each story
for story_idx, story in enumerate(stories):
    story_num = story_idx + 1
    # `or` (not a .get default) so a null/empty title also gets a readable label.
    story_title = story.get('title') or f'Story {story_num}'

    # Check story-level missing data
    if not story.get('title'):
        missing_data_report['stories_missing_title'].append(story_num)

    if not story.get('passage'):
        missing_data_report['stories_missing_passage'].append((story_num, story_title))

    if not story.get('source'):
        missing_data_report['stories_missing_source'].append((story_num, story_title))

    # A null `questions` value is normalised to an empty list so the story is
    # reported as missing questions instead of crashing the loop below.
    questions = story.get('questions') or []
    if not questions:
        missing_data_report['stories_missing_questions'].append((story_num, story_title))

    # Check question-level missing data
    for q_idx, question in enumerate(questions):
        q_id = f"Story {story_num}, Q{q_idx + 1}"

        if not question.get('question'):
            missing_data_report['questions_missing_text'].append(q_id)

        options = question.get('options') or []
        if not options:
            missing_data_report['questions_missing_options'].append(q_id)
        elif len(options) != EXPECTED_OPTION_COUNT:
            missing_data_report['questions_incomplete_options'].append((q_id, len(options)))

        # Missing means: key absent, null, or empty string. A value of 0 is
        # kept as a real answer, so plain truthiness is not used here.
        if question.get('correct_answer') in (None, ''):
            missing_data_report['questions_missing_correct_answer'].append(q_id)

# Print missing data summary (label -> report key, in display order)
summary_rows = [
    ("Stories missing title", 'stories_missing_title'),
    ("Stories missing passage", 'stories_missing_passage'),
    ("Stories missing source", 'stories_missing_source'),
    ("Stories missing questions", 'stories_missing_questions'),
    ("Questions missing text", 'questions_missing_text'),
    ("Questions missing options", 'questions_missing_options'),
    ("Questions with incomplete options", 'questions_incomplete_options'),
    ("Questions missing correct answer", 'questions_missing_correct_answer'),
]

print("📊 MISSING DATA SUMMARY:")
for label, report_key in summary_rows:
    print(f"{label}: {len(missing_data_report[report_key])}")

# Show details for significant issues
if missing_data_report['stories_missing_passage']:
    print(f"\n⚠️  Stories missing passage:")
    for story_num, title in missing_data_report['stories_missing_passage']:
        print(f"   Story {story_num}: {title}")

unanswered = missing_data_report['questions_missing_correct_answer']
if unanswered:
    print(f"\n⚠️  Questions missing correct answer (showing first 10):")
    for q_id in unanswered[:10]:
        print(f"   {q_id}")
    if len(unanswered) > 10:
        print(f"   ... and {len(unanswered) - 10} more")

🔍 MISSING DATA ANALYSIS
📊 MISSING DATA SUMMARY:
Stories missing title: 0
Stories missing passage: 0
Stories missing source: 17
Stories missing questions: 0
Questions missing text: 0
Questions missing options: 0
Questions with incomplete options: 0
Questions missing correct answer: 0
