In [None]:
#  QA Dataset Analysis - Readability, Length Distribution & Vocabulary Diversity



import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Jupyter-specific settings
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

import textstat
print("All libraries imported successfully!")


class FocusedQAAnalyzer:
    """Load QA datasets and provide readability, length and vocabulary helpers.

    Each JSON file is expected to be an object with a ``"queries"`` list whose
    items carry ``id``, ``question_type``, ``question`` and ``answer`` keys.
    The combined records are exposed as ``self.df`` together with the distinct
    ``shot_types`` and ``question_types`` found in the data.
    """

    # Question types kept while loading; records of any other type are skipped.
    _TARGET_QUESTION_TYPES = frozenset(
        {"factual", "relationship", "comparative", "inferential"}
    )
    # Fixed column layout so the combined frame has a schema even when empty.
    _COLUMNS = ("qa_id", "question_type", "shot_type", "question", "answer")

    def __init__(self, json_file_paths):
        """Load every dataset and derive per-record word counts.

        Args:
            json_file_paths: Mapping of shot-type label to JSON file path.

        Raises:
            ValueError: If no QA pair could be loaded from any of the files.
        """
        self.json_files = json_file_paths
        self.df = self._load_all_datasets()
        if self.df.empty:
            # Fail with an actionable message instead of a KeyError on a
            # missing column further down.
            raise ValueError(
                "No QA pairs could be loaded; check the dataset paths and contents."
            )
        self.shot_types = self.df['shot_type'].unique()
        self.question_types = self.df['question_type'].unique()

        print(f" Loaded datasets with {len(self.df)} QA pairs")
        print(f" Shot types: {list(self.shot_types)}")
        print(f" Question types: {list(self.question_types)}")

        # Lengths are whitespace-delimited word counts; missing text counts as 0.
        for source, target in (('question', 'question_length'), ('answer', 'answer_length')):
            self.df[target] = (
                self.df[source].fillna('').astype(str).apply(lambda x: len(x.split()))
            )

        # Display distribution
        print(f"\n Dataset Distribution:")
        distribution = self.df.groupby(['shot_type', 'question_type']).size().unstack(fill_value=0)
        print(distribution)

    def _load_all_datasets(self):
        """Load and combine all JSON datasets.

        Files that are missing, are not valid JSON, or do not hold a JSON
        object are reported and skipped. Records whose question type is not
        one of the target types (after lower-casing and stripping) are dropped.

        Returns:
            A DataFrame with the columns ``qa_id``, ``question_type``,
            ``shot_type``, ``question`` and ``answer``; it has zero rows (but
            still those columns) when nothing could be loaded.
        """
        all_data = []

        for shot_type, file_path in self.json_files.items():
            print(f" Loading {shot_type} dataset from: {file_path}")

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except FileNotFoundError:
                print(f" File not found: {file_path}")
                continue
            except json.JSONDecodeError:
                print(f" Invalid JSON in file: {file_path}")
                continue

            if not isinstance(data, dict):
                print(f" Unexpected JSON structure in file: {file_path}")
                continue

            # ``or []`` also covers an explicit ``"queries": null``.
            queries = data.get("queries") or []
            print(f" Found {len(queries)} QA pairs")

            for item in queries:
                if not isinstance(item, dict):
                    continue
                # A null or non-string question type must not crash the load.
                qtype = str(item.get("question_type") or "").lower().strip()
                if qtype not in self._TARGET_QUESTION_TYPES:
                    continue

                all_data.append({
                    "qa_id": item.get("id", ""),
                    "question_type": qtype,
                    "shot_type": shot_type,
                    "question": item.get("question", ""),
                    "answer": item.get("answer", "")
                })

        df = pd.DataFrame(all_data, columns=list(self._COLUMNS))
        print(f"\n Combined dataset: {len(df)} total QA pairs")
        return df

    def type_token_ratio(self, text):
        """Return the Type-Token Ratio (unique tokens / total tokens) of *text*.

        Tokens are lower-cased, whitespace-delimited words. Empty or missing
        text yields 0.
        """
        if not text or pd.isna(text):
            return 0
        tokens = str(text).lower().split()
        if not tokens:
            return 0
        return len(set(tokens)) / len(tokens)

    def readability_scores(self, text):
        """Return Flesch-Kincaid and Gunning Fog scores for *text* via textstat.

        Returns:
            A dict with the keys ``"Flesch-Kincaid"`` and ``"Gunning Fog"``,
            each rounded to two decimals. Both are 0 for empty/missing text or
            when scoring fails.
        """
        if not text or pd.isna(text):
            return {"Flesch-Kincaid": 0, "Gunning Fog": 0}

        text = str(text)
        try:
            fk = textstat.flesch_kincaid_grade(text)
            gf = textstat.gunning_fog(text)
            return {"Flesch-Kincaid": round(fk, 2), "Gunning Fog": round(gf, 2)}
        except Exception:
            # Best effort: a scoring failure degrades to zeros, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return {"Flesch-Kincaid": 0, "Gunning Fog": 0}

# Map each shot-type label to the JSON file holding its QA pairs.
json_file_paths = {
    "zero_shot": "Zero-Shot_qa_dataset.json",
    "one_shot": "One-Shot_qa_dataset.json",
    "few_shot": "Few-Shot_qa_dataset.json",
}

try:
    analyzer = FocusedQAAnalyzer(json_file_paths)
except Exception as e:
    print(f" Error loading datasets: {e}")
    print("Please update the json_file_paths with correct paths to your JSON files.")
    # Re-raise: continuing would leave `analyzer` undefined and the analysis
    # calls further down would fail with an unrelated NameError.
    raise

def analyze_readability(analyzer):
    """Score question and answer readability overall, per shot type and per question type.

    The questions (and, separately, the answers) of each group are joined with
    single spaces and passed to ``analyzer.readability_scores``.

    Args:
        analyzer: Object exposing ``df`` (with ``question``, ``answer``,
            ``shot_type`` and ``question_type`` columns), ``shot_types``,
            ``question_types`` and ``readability_scores(text)``.

    Returns:
        A ``(results, readability_df)`` tuple: ``results`` is a nested dict
        keyed by ``'overall'``, ``'by_shot_type'`` and ``'by_question_type'``;
        ``readability_df`` holds one Questions row and one Answers row per group.
    """
    print(" READABILITY ANALYSIS")
    print("=" * 60)

    frame = analyzer.df
    table_rows = []

    def score_both(part):
        """Return (question scores, answer scores) for the rows in *part*."""
        joined_q = " ".join(part['question'].fillna('').astype(str))
        joined_a = " ".join(part['answer'].fillna('').astype(str))
        return analyzer.readability_scores(joined_q), analyzer.readability_scores(joined_a)

    def add_rows(label, part, q_scores, a_scores):
        """Append the Questions and Answers table rows for one group."""
        for kind, scores in (('Questions', q_scores), ('Answers', a_scores)):
            table_rows.append({
                'Category': label,
                'Type': kind,
                'Sample Count': f"{len(part):,}",
                'Flesch-Kincaid': scores['Flesch-Kincaid'],
                'Gunning Fog': scores['Gunning Fog'],
            })

    # Whole-dataset scores come first, both in the dict and in the table.
    whole_q, whole_a = score_both(frame)
    results = {'overall': {'questions': whole_q, 'answers': whole_a}}

    print(f" Overall Readability:")
    print(f"   Questions: FK={whole_q['Flesch-Kincaid']}, GF={whole_q['Gunning Fog']}")
    print(f"   Answers: FK={whole_a['Flesch-Kincaid']}, GF={whole_a['Gunning Fog']}")
    add_rows('Overall', frame, whole_q, whole_a)

    # The two breakdowns differ only in the grouping column and the heading.
    breakdowns = (
        ('by_shot_type', 'shot_type', analyzer.shot_types, f"\n Readability by Shot Type:"),
        ('by_question_type', 'question_type', analyzer.question_types, f"\n Readability by Question Type:"),
    )
    for result_key, column, group_values, heading in breakdowns:
        print(heading)
        results[result_key] = {}

        for group in group_values:
            part = frame[frame[column] == group]
            q_scores, a_scores = score_both(part)

            results[result_key][group] = {'questions': q_scores, 'answers': a_scores}
            add_rows(group, part, q_scores, a_scores)

            print(f"   {group} (n={len(part):,}):")
            print(f"     Questions: FK={q_scores['Flesch-Kincaid']}, GF={q_scores['Gunning Fog']}")
            print(f"     Answers: FK={a_scores['Flesch-Kincaid']}, GF={a_scores['Gunning Fog']}")

    # Display comprehensive readability table
    readability_df = pd.DataFrame(table_rows)
    print(f"\n Comprehensive Readability Summary:")
    print(readability_df.to_string(index=False))

    return results, readability_df

#  Answer Length Distribution Analysis
def analyze_answer_length_distribution(analyzer):
    """Bucket answers into Short/Medium/Long and tabulate the buckets per group.

    Adds a ``length_category`` column to ``analyzer.df`` (derived from the
    existing ``answer_length`` column) and counts the categories overall, per
    shot type and per question type.

    Args:
        analyzer: Object exposing ``df`` (with ``answer_length``, ``shot_type``
            and ``question_type`` columns), ``shot_types`` and
            ``question_types``.

    Returns:
        A ``(results, length_df)`` tuple. ``results`` maps ``'overall'``,
        ``'by_shot_type'`` and ``'by_question_type'`` to dicts of
        ``short``/``medium``/``long``/``total`` counts. Counts are plain
        ``int`` so they serialize to JSON as numbers rather than being
        stringified by a ``default=str`` fallback. ``length_df`` is the
        printable summary table.
    """
    print("\n ANSWER LENGTH DISTRIBUTION ANALYSIS")
    print("=" * 60)

    def categorize_length(length):
        """Categorize answer length: short (<30), medium (30-60), long (>60)."""
        if length < 30:
            return "Short"
        if length <= 60:
            return "Medium"
        return "Long"

    def summarize(frame):
        """Count the rows of *frame* per length category as plain ints."""
        counts = frame['length_category'].value_counts()
        return {
            'short': int(counts.get('Short', 0)),
            'medium': int(counts.get('Medium', 0)),
            'long': int(counts.get('Long', 0)),
            'total': len(frame),
        }

    def share(count, total):
        """Format *count* with its percentage of *total*; 0.0% when total is 0."""
        pct = count / total * 100 if total else 0.0
        return f"{count:,} ({pct:.1f}%)"

    def table_row(label, kind, stats):
        """Build one summary-table row from a counts dict."""
        total = stats['total']
        return {
            'Category': label,
            'Type': kind,
            'Sample Count': f"{total:,}",
            'Short (<30)': share(stats['short'], total),
            'Medium (30-60)': share(stats['medium'], total),
            'Long (>60)': share(stats['long'], total),
        }

    # Add length categories to dataframe
    analyzer.df['length_category'] = analyzer.df['answer_length'].apply(categorize_length)

    results = {}
    length_data = []

    # Overall length distribution
    overall = summarize(analyzer.df)
    results['overall'] = overall

    print(f" Overall Answer Length Distribution:")
    print(f"   Short (<30 words): {share(overall['short'], overall['total'])}")
    print(f"   Medium (30-60 words): {share(overall['medium'], overall['total'])}")
    print(f"   Long (>60 words): {share(overall['long'], overall['total'])}")
    length_data.append(table_row('Overall', 'All', overall))

    # The two breakdowns differ only in grouping column, labels and heading.
    breakdowns = (
        ('by_shot_type', 'shot_type', analyzer.shot_types, 'Shot Type',
         "\n Answer Length Distribution by Shot Type:"),
        ('by_question_type', 'question_type', analyzer.question_types, 'Question Type',
         "\n Answer Length Distribution by Question Type:"),
    )
    for result_key, column, group_values, kind, heading in breakdowns:
        print(heading)
        results[result_key] = {}

        for group in group_values:
            stats = summarize(analyzer.df[analyzer.df[column] == group])
            total = stats['total']
            results[result_key][group] = stats
            length_data.append(table_row(group, kind, stats))

            print(f"   {group} (n={total:,}):")
            print(f"     Short: {share(stats['short'], total)}")
            print(f"     Medium: {share(stats['medium'], total)}")
            print(f"     Long: {share(stats['long'], total)}")

    # Display comprehensive length table
    length_df = pd.DataFrame(length_data)
    print(f"\n Comprehensive Answer Length Summary:")
    print(length_df.to_string(index=False))

    return results, length_df

#  Vocabulary Diversity Analysis
def analyze_vocabulary_diversity(analyzer):
    """Measure Type-Token Ratio and vocabulary size overall and per group.

    The questions (and, separately, the answers) of each group are joined with
    single spaces; TTR comes from ``analyzer.type_token_ratio`` and vocabulary
    size is the number of distinct lower-cased whitespace tokens.

    Args:
        analyzer: Object exposing ``df`` (with ``question``, ``answer``,
            ``shot_type`` and ``question_type`` columns), ``shot_types``,
            ``question_types`` and ``type_token_ratio(text)``.

    Returns:
        A ``(results, diversity_df)`` tuple: ``results`` holds TTRs rounded to
        four decimals under ``'overall'``, ``'by_shot_type'`` and
        ``'by_question_type'`` (the per-group entries also carry
        ``sample_count``); ``diversity_df`` is the printable summary table.
    """
    print("\n VOCABULARY DIVERSITY ANALYSIS")
    print("=" * 60)

    frame = analyzer.df
    table_rows = []

    def joined_texts(part):
        """Return the space-joined questions and answers of *part*."""
        return (
            " ".join(part['question'].fillna('').astype(str)),
            " ".join(part['answer'].fillna('').astype(str)),
        )

    def add_rows(label, part, texts, ratios):
        """Append the Questions and Answers table rows for one group."""
        for kind, text, ratio in zip(('Questions', 'Answers'), texts, ratios):
            table_rows.append({
                'Category': label,
                'Type': kind,
                'Sample Count': f"{len(part):,}",
                'TTR': round(ratio, 4),
                'Vocab Size': len(set(text.lower().split())),
            })

    # Whole-dataset figures first.
    whole_texts = joined_texts(frame)
    whole_q = analyzer.type_token_ratio(whole_texts[0])
    whole_a = analyzer.type_token_ratio(whole_texts[1])

    results = {
        'overall': {
            'questions_ttr': round(whole_q, 4),
            'answers_ttr': round(whole_a, 4),
        }
    }

    print(f" Overall Vocabulary Diversity:")
    print(f"   Questions TTR: {whole_q:.4f}")
    print(f"   Answers TTR: {whole_a:.4f}")
    add_rows('Overall', frame, whole_texts, (whole_q, whole_a))

    # The two breakdowns differ only in the grouping column and the heading.
    breakdowns = (
        ('by_shot_type', 'shot_type', analyzer.shot_types, f"\n Diversity by Shot Type:"),
        ('by_question_type', 'question_type', analyzer.question_types, f"\n❓ Diversity by Question Type:"),
    )
    for result_key, column, group_values, heading in breakdowns:
        print(heading)
        results[result_key] = {}

        for group in group_values:
            part = frame[frame[column] == group]
            texts = joined_texts(part)
            ratio_q = analyzer.type_token_ratio(texts[0])
            ratio_a = analyzer.type_token_ratio(texts[1])

            results[result_key][group] = {
                'questions_ttr': round(ratio_q, 4),
                'answers_ttr': round(ratio_a, 4),
                'sample_count': len(part),
            }
            add_rows(group, part, texts, (ratio_q, ratio_a))

            print(f"   {group} (n={len(part):,}): Q_TTR={ratio_q:.4f}, A_TTR={ratio_a:.4f}")

    # Display comprehensive diversity table
    diversity_df = pd.DataFrame(table_rows)
    print(f"\n Comprehensive Vocabulary Diversity Summary:")
    print(diversity_df.to_string(index=False))

    return results, diversity_df

# Complete Comprehensive Visualization
def _paired_bar_panel(position, labels, question_values, answer_values, colors,
                      xlabel, ylabel, title, rotation=0):
    """Draw a Questions-vs-Answers grouped bar chart in cell *position* of the 3x4 grid."""
    plt.subplot(3, 4, position)
    x = np.arange(len(labels))
    width = 0.35
    plt.bar(x - width / 2, question_values, width, label='Questions', alpha=0.8, color=colors[0])
    plt.bar(x + width / 2, answer_values, width, label='Answers', alpha=0.8, color=colors[1])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title, fontweight='bold', fontsize=12)
    plt.xticks(x, labels, rotation=rotation)
    plt.legend()


def _length_histogram_panel(position, df, column, groups, palette, bins, title):
    """Overlay one density histogram of answer length per group in cell *position*."""
    plt.subplot(3, 4, position)
    for i, group in enumerate(groups):
        subset = df[df[column] == group]
        # Cycle the palette so extra groups cannot raise an IndexError.
        plt.hist(subset['answer_length'], alpha=0.6, label=group, bins=bins,
                 density=True, color=palette[i % len(palette)])
    plt.xlabel('Answer Length (words)')
    plt.ylabel('Density')
    plt.title(title, fontweight='bold', fontsize=12)
    plt.legend()


def _length_category_panel(position, groups, stats_by_group, palette, max_width, title):
    """Plot the Short/Medium/Long percentage per group as grouped bars in cell *position*."""
    plt.subplot(3, 4, position)
    categories = ['Short', 'Medium', 'Long']
    x_pos = np.arange(len(categories))
    # Keep the usual bar width, but shrink it when many groups would overlap
    # the neighbouring category.
    width = min(max_width, 0.8 / max(len(groups), 1))

    for i, group in enumerate(groups):
        stats = stats_by_group[group]
        total = stats['total']
        percentages = [
            stats[cat.lower()] / total * 100 if total else 0.0
            for cat in categories
        ]
        plt.bar(x_pos + i * width, percentages, width, label=group, alpha=0.8,
                color=palette[i % len(palette)])

    plt.xlabel('Answer Length Category')
    plt.ylabel('Percentage (%)')
    plt.title(title, fontweight='bold', fontsize=12)
    # Centre each tick under its cluster regardless of the group count.
    plt.xticks(x_pos + width * (len(groups) - 1) / 2, categories)
    plt.legend()


def _ttr_panel(position, labels, values, palette, title, rotation=None):
    """Draw a value-labelled Type-Token Ratio bar chart in cell *position*."""
    plt.subplot(3, 4, position)
    colors = [palette[i % len(palette)] for i in range(len(labels))]
    bars = plt.bar(labels, values, color=colors, alpha=0.8)
    plt.ylabel('Type-Token Ratio')
    plt.title(title, fontweight='bold', fontsize=12)
    if rotation is not None:
        plt.xticks(rotation=rotation)
    for bar, val in zip(bars, values):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.002,
                 f'{val:.3f}', ha='center', va='bottom', fontweight='bold')


def create_comprehensive_visualization(analyzer, readability_results, length_results, diversity_results):
    """Render all analyses as one 3x4 figure and save it as QA_Analysis_Complete.png.

    Row 1 shows readability, row 2 answer length and row 3 vocabulary
    diversity, each split by shot type and by question type.

    Args:
        analyzer: Object exposing ``df`` (with ``answer_length``, ``shot_type``
            and ``question_type`` columns), ``shot_types`` and ``question_types``.
        readability_results: ``(results, table)`` tuple as returned by
            ``analyze_readability``.
        length_results: ``(results, table)`` tuple as returned by
            ``analyze_answer_length_distribution``. The category panels read
            their counts from here, so they do not depend on a
            ``length_category`` column being present on ``analyzer.df``.
        diversity_results: ``(results, table)`` tuple as returned by
            ``analyze_vocabulary_diversity``.
    """
    print(" CREATING COMPREHENSIVE VISUALIZATION")
    print("=" * 60)

    shot_types = list(analyzer.shot_types)
    question_types = list(analyzer.question_types)
    shot_labels = [st.replace('_', '-').title() for st in shot_types]

    readability = readability_results[0]
    lengths = length_results[0]
    diversity = diversity_results[0]

    palette_shot = ['#FF6B6B', '#4ECDC4', '#45B7D1']
    palette_q = ['#96CEB4', '#FECA57', '#FF9FF3', '#54A0FF']

    def scores(section, groups, side, metric):
        """Collect one readability metric for every group, in display order."""
        return [readability[section][g][side][metric] for g in groups]

    # Create figure with 12 subplots (3 rows x 4 columns)
    fig = plt.figure(figsize=(24, 18))

    # 1-4. Readability: Flesch-Kincaid / Gunning Fog by shot type, then by question type
    _paired_bar_panel(
        1, shot_labels,
        scores('by_shot_type', shot_types, 'questions', 'Flesch-Kincaid'),
        scores('by_shot_type', shot_types, 'answers', 'Flesch-Kincaid'),
        ('#FF6B6B', '#4ECDC4'), 'Shot Type', 'Flesch-Kincaid Grade Level',
        'Readability (Flesch-Kincaid)\nby Shot Type')
    _paired_bar_panel(
        2, shot_labels,
        scores('by_shot_type', shot_types, 'questions', 'Gunning Fog'),
        scores('by_shot_type', shot_types, 'answers', 'Gunning Fog'),
        ('#FF9FF3', '#54A0FF'), 'Shot Type', 'Gunning Fog Index',
        'Readability (Gunning Fog)\nby Shot Type')
    _paired_bar_panel(
        3, question_types,
        scores('by_question_type', question_types, 'questions', 'Flesch-Kincaid'),
        scores('by_question_type', question_types, 'answers', 'Flesch-Kincaid'),
        ('#96CEB4', '#FECA57'), 'Question Type', 'Flesch-Kincaid Grade Level',
        'Readability (Flesch-Kincaid)\nby Question Type', rotation=45)
    _paired_bar_panel(
        4, question_types,
        scores('by_question_type', question_types, 'questions', 'Gunning Fog'),
        scores('by_question_type', question_types, 'answers', 'Gunning Fog'),
        ('#FF6B6B', '#4ECDC4'), 'Question Type', 'Gunning Fog Index',
        'Readability (Gunning Fog)\nby Question Type', rotation=45)

    # 5-6. Answer length histograms
    _length_histogram_panel(5, analyzer.df, 'shot_type', shot_types, palette_shot, 30,
                            'Answer Length Distribution\nby Shot Type')
    _length_histogram_panel(6, analyzer.df, 'question_type', question_types, palette_q, 25,
                            'Answer Length Distribution\nby Question Type')

    # 7-8. Answer length categories (percentages)
    _length_category_panel(7, shot_types, lengths['by_shot_type'], palette_shot, 0.25,
                           'Answer Length Categories\nby Shot Type')
    _length_category_panel(8, question_types, lengths['by_question_type'], palette_q, 0.25 * 0.7,
                           'Answer Length Categories\nby Question Type')

    # 9-12. Vocabulary diversity (TTR)
    _ttr_panel(9, shot_labels,
               [diversity['by_shot_type'][st]['questions_ttr'] for st in shot_types],
               palette_shot, 'Question Vocabulary Diversity\nby Shot Type')
    _ttr_panel(10, shot_labels,
               [diversity['by_shot_type'][st]['answers_ttr'] for st in shot_types],
               ['#96CEB4', '#FECA57', '#FF9FF3'], 'Answer Vocabulary Diversity\nby Shot Type')
    _ttr_panel(11, question_types,
               [diversity['by_question_type'][qt]['questions_ttr'] for qt in question_types],
               palette_q, 'Question Vocabulary Diversity\nby Question Type', rotation=45)
    _ttr_panel(12, question_types,
               [diversity['by_question_type'][qt]['answers_ttr'] for qt in question_types],
               palette_q, 'Answer Vocabulary Diversity\nby Question Type', rotation=45)

    plt.tight_layout()
    plt.savefig('QA_Analysis_Complete.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)
    print(f" Complete analysis visualization saved to: QA_Analysis_Complete.png")

#  Generate Complete Statistical Report
def _grade_band(grade):
    """Map a Flesch-Kincaid grade level to a coarse schooling band label."""
    if grade < 6:
        return "Elementary-level"
    if grade < 9:
        return "Middle-school-level"
    if grade < 13:
        return "High-school-level"
    if grade < 17:
        return "College-level"
    return "Graduate-level"


def _percent(count, total):
    """Return *count* as a percentage of *total*, or 0.0 when *total* is zero."""
    return count / total * 100 if total else 0.0


def generate_statistical_report(analyzer, readability_results, length_results, diversity_results):
    """Print a combined statistical report and return its headline findings.

    Args:
        analyzer: Object exposing ``df`` (with ``shot_type`` and
            ``question_type`` columns), ``shot_types`` and ``question_types``.
        readability_results: ``(results, table)`` tuple as returned by
            ``analyze_readability``.
        length_results: ``(results, table)`` tuple as returned by
            ``analyze_answer_length_distribution``.
        diversity_results: ``(results, table)`` tuple as returned by
            ``analyze_vocabulary_diversity``.

    Returns:
        A dict with ``'dataset_overview'`` (total and per-group sample counts
        as plain ints, so they serialize to JSON as numbers) and
        ``'key_insights'`` (the best-scoring group for each metric).
    """
    print("\n COMPLETE STATISTICAL REPORT")
    print("=" * 80)

    readability = readability_results[0]
    lengths = length_results[0]
    diversity = diversity_results[0]

    # Dataset Overview
    total_samples = len(analyzer.df)
    # Cast to int: value_counts yields numpy integers, which are not
    # JSON-serializable as numbers.
    shot_distribution = {k: int(v) for k, v in analyzer.df['shot_type'].value_counts().items()}
    question_distribution = {k: int(v) for k, v in analyzer.df['question_type'].value_counts().items()}

    print(f" DATASET OVERVIEW:")
    print(f"   • Total QA pairs: {total_samples:,}")
    print(f"   • Shot types: {', '.join(analyzer.shot_types)}")
    print(f"   • Question types: {', '.join(analyzer.question_types)}")

    print(f"\n Distribution by Shot Type:")
    for shot_type, count in shot_distribution.items():
        print(f"   • {shot_type}: {count:,} ({_percent(count, total_samples):.1f}%)")

    print(f"\n Distribution by Question Type:")
    for q_type, count in question_distribution.items():
        print(f"   • {q_type}: {count:,} ({_percent(count, total_samples):.1f}%)")

    # Readability Summary
    print(f"\n READABILITY ANALYSIS SUMMARY:")
    print(f"   Overall Readability (Flesch-Kincaid):")
    print(f"     • Questions: {readability['overall']['questions']['Flesch-Kincaid']}")
    print(f"     • Answers: {readability['overall']['answers']['Flesch-Kincaid']}")

    print(f"\n   Overall Readability (Gunning Fog):")
    print(f"     • Questions: {readability['overall']['questions']['Gunning Fog']}")
    print(f"     • Answers: {readability['overall']['answers']['Gunning Fog']}")

    # Best/worst readability by shot type (a lower grade level is easier to read)
    shot_fk_q = {st: readability['by_shot_type'][st]['questions']['Flesch-Kincaid'] for st in analyzer.shot_types}
    shot_fk_a = {st: readability['by_shot_type'][st]['answers']['Flesch-Kincaid'] for st in analyzer.shot_types}

    best_fk_q = min(shot_fk_q, key=shot_fk_q.get)
    worst_fk_q = max(shot_fk_q, key=shot_fk_q.get)
    best_fk_a = min(shot_fk_a, key=shot_fk_a.get)
    worst_fk_a = max(shot_fk_a, key=shot_fk_a.get)

    print(f"\n   Readability by Shot Type (Flesch-Kincaid - lower is easier):")
    print(f"     • Most readable questions: {best_fk_q} ({shot_fk_q[best_fk_q]})")
    print(f"     • Least readable questions: {worst_fk_q} ({shot_fk_q[worst_fk_q]})")
    print(f"     • Most readable answers: {best_fk_a} ({shot_fk_a[best_fk_a]})")
    print(f"     • Least readable answers: {worst_fk_a} ({shot_fk_a[worst_fk_a]})")

    # Length Distribution Summary
    print(f"\n ANSWER LENGTH ANALYSIS SUMMARY:")
    overall_length = lengths['overall']
    overall_total = overall_length['total']
    long_percentage = _percent(overall_length['long'], overall_total)
    medium_percentage = _percent(overall_length['medium'], overall_total)
    short_percentage = _percent(overall_length['short'], overall_total)
    print(f"   Overall Answer Length Distribution:")
    print(f"     • Short (<30 words): {overall_length['short']:,} ({short_percentage:.1f}%)")
    print(f"     • Medium (30-60 words): {overall_length['medium']:,} ({medium_percentage:.1f}%)")
    print(f"     • Long (>60 words): {overall_length['long']:,} ({long_percentage:.1f}%)")

    # Share of long answers per group
    shot_long_pcts = {
        st: _percent(lengths['by_shot_type'][st]['long'], lengths['by_shot_type'][st]['total'])
        for st in analyzer.shot_types
    }
    q_type_long_pcts = {
        qt: _percent(lengths['by_question_type'][qt]['long'], lengths['by_question_type'][qt]['total'])
        for qt in analyzer.question_types
    }

    highest_long_shot = max(shot_long_pcts, key=shot_long_pcts.get)
    lowest_long_shot = min(shot_long_pcts, key=shot_long_pcts.get)
    highest_long_qtype = max(q_type_long_pcts, key=q_type_long_pcts.get)
    lowest_long_qtype = min(q_type_long_pcts, key=q_type_long_pcts.get)

    print(f"\n   Length Category Comparison:")
    print(f"     • Highest % long answers by shot type: {highest_long_shot} ({shot_long_pcts[highest_long_shot]:.1f}%)")
    print(f"     • Lowest % long answers by shot type: {lowest_long_shot} ({shot_long_pcts[lowest_long_shot]:.1f}%)")
    print(f"     • Highest % long answers by question type: {highest_long_qtype} ({q_type_long_pcts[highest_long_qtype]:.1f}%)")
    print(f"     • Lowest % long answers by question type: {lowest_long_qtype} ({q_type_long_pcts[lowest_long_qtype]:.1f}%)")

    # Vocabulary Diversity Summary
    print(f"\n VOCABULARY DIVERSITY ANALYSIS SUMMARY:")
    overall_diversity = diversity['overall']
    print(f"   Overall Vocabulary Diversity (TTR):")
    print(f"     • Questions: {overall_diversity['questions_ttr']:.4f}")
    print(f"     • Answers: {overall_diversity['answers_ttr']:.4f}")

    # Highest diversity per grouping
    shot_div_q = {st: diversity['by_shot_type'][st]['questions_ttr'] for st in analyzer.shot_types}
    shot_div_a = {st: diversity['by_shot_type'][st]['answers_ttr'] for st in analyzer.shot_types}
    q_type_div_q = {qt: diversity['by_question_type'][qt]['questions_ttr'] for qt in analyzer.question_types}
    q_type_div_a = {qt: diversity['by_question_type'][qt]['answers_ttr'] for qt in analyzer.question_types}

    best_div_shot_q = max(shot_div_q, key=shot_div_q.get)
    best_div_shot_a = max(shot_div_a, key=shot_div_a.get)
    best_div_qtype_q = max(q_type_div_q, key=q_type_div_q.get)
    best_div_qtype_a = max(q_type_div_a, key=q_type_div_a.get)

    print(f"\n   Highest Vocabulary Diversity:")
    print(f"     • Questions by shot type: {best_div_shot_q} ({shot_div_q[best_div_shot_q]:.4f})")
    print(f"     • Answers by shot type: {best_div_shot_a} ({shot_div_a[best_div_shot_a]:.4f})")
    print(f"     • Questions by question type: {best_div_qtype_q} ({q_type_div_q[best_div_qtype_q]:.4f})")
    print(f"     • Answers by question type: {best_div_qtype_a} ({q_type_div_a[best_div_qtype_a]:.4f})")

    # Key Insights
    print(f"\n KEY INSIGHTS:")
    print(f"   1. Shot Type Analysis:")
    print(f"      • Most readable: {best_fk_q} (questions), {best_fk_a} (answers)")
    print(f"      • Highest % long answers: {highest_long_shot}")
    print(f"      • Highest vocabulary diversity: {best_div_shot_q} (questions), {best_div_shot_a} (answers)")

    print(f"\n   2. Question Type Analysis:")
    print(f"      • Highest % long answers: {highest_long_qtype}")
    print(f"      • Lowest % long answers: {lowest_long_qtype}")
    print(f"      • Highest vocabulary diversity: {best_div_qtype_q} (questions), {best_div_qtype_a} (answers)")

    print(f"\n   3. Overall Patterns:")
    print(f"      • Answer length distribution: {long_percentage:.1f}% long, {short_percentage:.1f}% short")
    print(f"      • Vocabulary diversity: {'High' if overall_diversity['answers_ttr'] > 0.15 else 'Moderate'} (TTR={overall_diversity['answers_ttr']:.4f})")
    # Derive the band from the measured grade rather than asserting a fixed one.
    answers_grade = readability['overall']['answers']['Flesch-Kincaid']
    print(f"      • Readability level: {_grade_band(answers_grade)} complexity (answers Flesch-Kincaid grade {answers_grade})")

    return {
        'dataset_overview': {
            'total_samples': total_samples,
            'shot_distribution': shot_distribution,
            'question_distribution': question_distribution
        },
        'key_insights': {
            'most_readable_shot_questions': best_fk_q,
            'most_readable_shot_answers': best_fk_a,
            'highest_long_answers_shot': highest_long_shot,
            'highest_long_answers_qtype': highest_long_qtype,
            'highest_diversity_shot_q': best_div_shot_q,
            'highest_diversity_shot_a': best_div_shot_a,
            'highest_diversity_qtype_q': best_div_qtype_q,
            'highest_diversity_qtype_a': best_div_qtype_a
        }
    }

def _json_default(value):
    """Fallback encoder for objects the ``json`` module cannot serialize.

    NumPy scalars (e.g. the ``np.int64`` counts produced by pandas
    aggregations) are converted to their native Python equivalents and NumPy
    arrays to nested lists, so numeric data stays numeric in the output file.
    Anything else is stringified as a last resort.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


def save_focused_results(analyzer, readability_results, length_results, diversity_results, report_summary):
    """Write the combined analysis to one JSON file and three CSV files.

    All files are written to the current working directory.

    Args:
        analyzer: Not used by this function; kept so existing callers that
            pass it positionally keep working.
        readability_results: Indexable pair; element ``[0]`` is stored in the
            JSON file under ``'readability_analysis'`` and element ``[1]`` is
            exported through its ``to_csv`` method (presumably a DataFrame).
        length_results: Same layout as ``readability_results``; ``[0]`` is
            stored under ``'length_analysis'``.
        diversity_results: Same layout as ``readability_results``; ``[0]`` is
            stored under ``'vocabulary_diversity'``.
        report_summary: Mapping that provides the ``'dataset_overview'`` and
            ``'key_insights'`` entries.

    Returns:
        None. The function only writes files and prints progress messages.
    """
    comprehensive_results = {
        'dataset_info': report_summary['dataset_overview'],
        'readability_analysis': readability_results[0],
        'length_analysis': length_results[0],
        'vocabulary_diversity': diversity_results[0],
        'key_insights': report_summary['key_insights'],
    }

    # Explicit UTF-8 mirrors how the input datasets are opened, and the custom
    # fallback keeps NumPy numbers as JSON numbers instead of quoted strings.
    with open('QA_Analysis.json', 'w', encoding='utf-8') as f:
        json.dump(comprehensive_results, f, indent=2, default=_json_default)
    print(" Complete analysis saved to: QA_Analysis.json")

    # (table, output file, label used in the progress message)
    csv_exports = (
        (readability_results[1], 'Readability_Analysis.csv', 'Readability analysis'),
        (length_results[1], 'Length_Distribution_Analysis.csv', 'Length distribution analysis'),
        (diversity_results[1], 'Vocabulary_Diversity_Analysis.csv', 'Vocabulary diversity analysis'),
    )
    for table, filename, label in csv_exports:
        table.to_csv(filename, index=False)
        print(f" {label} saved to: {filename}")

    print("\n  ANALYSIS COMPLETE!")
    print(" Files Generated:")
    # NOTE(review): the PNG is not written here; presumably the visualization
    # step produces it — confirm before relying on this listing.
    print(" QA_Analysis_Complete.png - Complete 12-chart visualization")
    print(" QA_Analysis.json - Complete analysis results")
    print(" Readability_Analysis.csv - Flesch-Kincaid & Gunning Fog analysis")
    print(" Length_Distribution_Analysis.csv - Answer length statistics")
    print(" Vocabulary_Diversity_Analysis.csv - TTR and vocabulary size analysis")

# ---------------------------------------------------------------------------
# Pipeline driver: analyze -> visualize -> report -> save.
# Relies on `analyzer` and the analysis functions defined earlier in the file.
# ---------------------------------------------------------------------------
print(" STARTING QA DATASET ANALYSIS")
print("=" * 80)

# Step 1: compute the three analyses; the other steps consume these results.
readability_results = analyze_readability(analyzer)
length_results = analyze_answer_length_distribution(analyzer)
diversity_results = analyze_vocabulary_diversity(analyzer)

# Step 2: draw the combined figure from all three result sets.
create_comprehensive_visualization(
    analyzer,
    readability_results,
    length_results,
    diversity_results,
)

# Step 3: print the statistical report and keep its summary for saving.
report_summary = generate_statistical_report(
    analyzer,
    readability_results,
    length_results,
    diversity_results,
)

# Step 4: persist the results together with the report summary.
save_focused_results(
    analyzer,
    readability_results,
    length_results,
    diversity_results,
    report_summary,
)

# Closing summary, emitted as one newline-joined message.
print("\n".join([
    "\n ANALYSIS COMPLETE! All requested analyses have been performed:",
    "    Readability (Flesch-Kincaid & Gunning Fog) by shot types, question types, questions & answers",
    "    Answer length distribution by shot types and question types",
    "    Vocabulary diversity by shot types and question types",
    "    One complete comprehensive visualization with all analyses",
    "    Complete statistical report with key insights",
]))