In [None]:
import json
import pandas as pd 
import numpy as np

In [None]:
import os
import pandas as pd
import re

# Root directory that is walked recursively for the per-file JSON inputs.
path = "../data/SD_1_Classified_Unmerged_rw"

# Start from an empty frame; the loading loop below fills it with every file's rows.
all_data = pd.DataFrame()

def parse_filename(filename):
    """Extract ``(language, number, chapter)`` from a file stem.

    The stem is matched from its beginning against
    ``SD_<language>_<digits>_<anything>_<number>_<chapter>``, where
    ``<language>`` is a run of word characters, ``<number>`` is a run of
    digits and ``<chapter>`` is a run of word characters and/or ``+``.

    Args:
        filename: File name without its extension.

    Returns:
        A 3-tuple of strings ``(language, number, chapter)``, or
        ``(None, None, None)`` when the stem does not match the pattern.
    """
    found = re.match(r"SD_(\w+)_\d+_.*_(\d+)_([\w+]+)", filename)
    if found is None:
        return None, None, None
    # All three groups are mandatory in the pattern, so each one is a string here.
    language, number, chapter = found.groups()
    return language, number, chapter

# Read every *.json file found under `path` (recursively), tag its rows with
# metadata parsed from the file name, and combine everything into `all_data`.
#
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop copies the accumulated frame on every iteration
# (quadratic cost) and concatenates onto an empty frame, which newer pandas
# versions warn about.
frames = []
for root, _, files in os.walk(path):
    for json_file in files:
        if not json_file.endswith('.json'):
            continue
        file_path = os.path.join(root, json_file)
        try:
            data = pd.read_json(file_path)
        except ValueError as e:
            # pd.read_json signals malformed JSON with ValueError; report and skip.
            print(f"Error reading {file_path}: {e}")
            continue

        filename = os.path.splitext(json_file)[0]
        language, number, chapter = parse_filename(filename)
        if language is None:
            # Rows are still kept, but with null metadata; a later groupby on
            # 'language' drops them by default, so make this visible.
            print(f"Warning: could not parse language/number/chapter from {filename!r}")

        data['source_file'] = filename
        data['language'] = language
        data['number'] = number
        data['chapter'] = chapter
        frames.append(data)

if frames:
    all_data = pd.concat(frames, ignore_index=True)
else:
    # Nothing was loaded (wrong path or no readable files): keep an empty frame.
    print(f"Warning: no JSON files were loaded from {path!r}")
    all_data = pd.DataFrame()

In [None]:
# One row per (language, source_file) pair; 'count' is the number of rows
# that file contributed to all_data.
counts = (
    all_data
    .groupby(['language', 'source_file'])
    .size()
    .reset_index(name='count')
)

In [None]:
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt

# Draw one Gaussian kernel-density curve per language over the per-file row
# counts, all on a shared axis.
languages = counts['language'].unique()

fig, ax = plt.subplots()

for language in languages:
    lang_counts = counts.loc[counts['language'] == language, 'count']

    # gaussian_kde cannot estimate a density from a single observation or from
    # identical observations (degenerate covariance) and raises instead.
    # Skip such languages rather than aborting the whole figure.
    if lang_counts.nunique() < 2:
        print(f"Skipping {language}: need at least two distinct counts for a density estimate")
        continue

    density = gaussian_kde(lang_counts)
    # Evaluate the curve only across the observed range for this language.
    x_vals = np.linspace(lang_counts.min(), lang_counts.max(), 1000)
    ax.plot(x_vals, density(x_vals), label=language)

ax.set_title('Density Estimation for Dialogue Counts by Language')
ax.set_xlabel('Count')
ax.set_ylabel('Density')
# Fixed viewing window; anything evaluated beyond a count of 30 is not shown.
ax.set_xlim(0, 30)
ax.legend(title="Language")
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()