In [4]:

import pandas as pd
import random
from datetime import datetime, timedelta
import json

class User:
    """A learner record: profile fields plus initially empty history containers."""

    def __init__(self, user_id, country, age_band, proficiency_level):
        """Store the four profile fields and start with empty interests and histories.

        Args:
            user_id: Identifier kept as-is on the instance.
            country: Country value kept as-is.
            age_band: Age band value kept as-is.
            proficiency_level: Proficiency level value kept as-is.
        """
        # Profile fields supplied by the caller.
        self.user_id, self.country = user_id, country
        self.age_band, self.proficiency_level = age_band, proficiency_level

        # Nothing has been recorded for a new user yet.
        self.interests = []

        # One independent, empty dict per skill area.
        skill_areas = ("grammar", "vocabulary", "pronunciation", "fluency")
        self.error_history = {area: {} for area in skill_areas}

        self.exercise_history = []

class ErrorEntry:
    """One recorded error: its category, subcategory, severity and timestamp."""

    def __init__(self, category, subcategory, severity, timestamp):
        """Keep the four arguments on the instance under the same names."""
        # What kind of error it is.
        self.category, self.subcategory = category, subcategory
        # How severe it is and when it was recorded.
        self.severity, self.timestamp = severity, timestamp

class ExerciseEntry:
    """One exercise attempt: what was practised, in which format, and how it went."""

    def __init__(self, category, subcategory, format, content, user_performance, timestamp):
        """Copy every argument onto the instance under the attribute of the same name."""
        # (attribute name, value) pairs, in the order the attributes are created.
        fields = (
            ("category", category),
            ("subcategory", subcategory),
            ("format", format),
            ("content", content),
            ("user_performance", user_performance),
            ("timestamp", timestamp),
        )
        for attribute, value in fields:
            setattr(self, attribute, value)

class UtteranceAnalysis:
    """Analysis record for one utterance, with one sub-analysis object per skill area."""

    def __init__(self, user_id, utterance_id, timestamp, text, audio_file):
        """Store the utterance metadata and attach freshly constructed sub-analyses."""
        # Who produced the utterance, which utterance it is, and when.
        self.user_id, self.utterance_id, self.timestamp = user_id, utterance_id, timestamp
        # The text and the audio file reference exactly as passed in.
        self.text, self.audio_file = text, audio_file

        # Each analysis class is instantiated with no arguments, so every
        # utterance owns its own independent set of analysis objects.
        self.grammar, self.vocabulary = GrammarAnalysis(), VocabularyAnalysis()
        self.pronunciation, self.fluency = PronunciationAnalysis(), FluencyAnalysis()

class GrammarAnalysis:
    """Grammar results: an overall score, an error tally and errors grouped by type."""

    def __init__(self):
        """Start at score 0.0 with no errors and an empty list for every known error type."""
        self.overall_score = 0.0
        self.error_count = 0

        # The fixed set of error types; each gets its own empty list.
        known_types = (
            "subject_verb_agreement",
            "tense_usage",
            "article_usage",
            "preposition_usage",
            "word_order",
            "conjunction_usage",
            "pronoun_usage",
            "sentence_structure",
            "punctuation",
        )
        self.error_types = {name: [] for name in known_types}

    def add_error(self, error_type, severity, context):
        """Record one error under ``error_type`` and increment ``error_count``.

        Args:
            error_type: Must already be a key of ``self.error_types``.
            severity: Stored under the ``"severity"`` key of the new record.
            context: Stored under the ``"context"`` key of the new record.

        Raises:
            KeyError: If ``error_type`` is not a key of ``self.error_types``;
                in that case ``error_count`` is left unchanged.
        """
        record = dict(severity=severity, context=context)
        bucket = self.error_types[error_type]
        bucket.append(record)
        self.error_count += 1

class VocabularyAnalysis:
    """Vocabulary results: scores, word counts and collected word-level findings."""

    def __init__(self):
        """Initialise every metric to its zero or empty value."""
        # Numeric metrics start at zero.
        self.overall_score = 0.0
        self.unique_words_count = self.advanced_words_count = 0

        # Three separate lists, one per kind of finding.
        self.word_choice_errors, self.collocations, self.idiom_usage = [], [], []

        self.lexical_density = 0.0
        # Empty string until a level is assigned.
        self.cefr_level = ""

    def add_word_choice_error(self, incorrect_word, suggested_word, context):
        """Append one word-choice error record to ``word_choice_errors``.

        The record is a dict with the keys ``"incorrect"``, ``"suggested"``
        and ``"context"`` holding the three arguments in that order.
        """
        entry = dict(incorrect=incorrect_word, suggested=suggested_word, context=context)
        self.word_choice_errors.append(entry)

class PronunciationAnalysis:
    """Pronunciation results: scores plus collected phoneme and stress errors."""

    def __init__(self):
        """Initialise all scores to 0.0, both error lists to empty, and the accent label to ''."""
        self.overall_score = 0.0

        # Separate lists for the two kinds of error.
        self.phoneme_errors, self.stress_errors = [], []

        # The three secondary scores all start at 0.0.
        self.intonation_score = self.rhythm_score = self.fluency_score = 0.0

        self.accent_similarity = ""

    def add_phoneme_error(self, incorrect_phoneme, correct_phoneme, word):
        """Append one phoneme error record to ``phoneme_errors``.

        The record is a dict with the keys ``"incorrect"``, ``"correct"`` and
        ``"word"`` holding the three arguments in that order.
        """
        entry = dict(incorrect=incorrect_phoneme, correct=correct_phoneme, word=word)
        self.phoneme_errors.append(entry)

class FluencyAnalysis:
    """Fluency results: scores, counts and timing metrics, all starting at zero."""

    def __init__(self):
        """Create every metric attribute with its zero default (float or int as listed)."""
        # (attribute name, default) pairs, in the order the attributes are created.
        defaults = (
            ("overall_score", 0.0),
            ("words_per_minute", 0),
            ("filler_word_count", 0),
            ("pause_count", 0),
            ("avg_pause_duration", 0.0),
            ("longest_fluent_segment", 0),
            ("sentence_restarts", 0),
            ("coherence_score", 0.0),
            ("topic_relevance_score", 0.0),
        )
        for attribute, default in defaults:
            setattr(self, attribute, default)


In [7]:
import random
from datetime import datetime, timedelta
import uuid
import json

# Helper function to generate timestamps
def random_timestamp(start, end):
    """Return an ISO-format timestamp string picked at random between ``start`` and ``end``.

    The offset from ``start`` is a whole number of seconds drawn with
    ``random.randint``, so both ends of the window can be returned and any
    fractional second in the window length is truncated.
    """
    window_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, window_seconds))
    picked = start + offset
    return picked.isoformat()

# Generate dataset
def generate_granular_dataset(num_users=100, days_of_history=30):
    """Build a synthetic language-learning dataset as four lists of plain dicts.

    For every user this creates a profile, 10-50 utterances (each with an
    independent 30% chance of one error per category) and 5-30 exercises.
    All timestamps fall inside the window that ends "now" and starts
    ``days_of_history`` days earlier.

    Args:
        num_users: Number of user profiles to generate.
        days_of_history: Length, in days, of the window timestamps are drawn from.

    Returns:
        dict with the keys ``"users"``, ``"utterances"``, ``"errors"`` and
        ``"exercises"``, each mapping to a list of JSON-serialisable dicts.
        Records are linked through ``user_id`` and ``utterance_id``.
    """
    # Read the clock once so the window is exactly ``days_of_history`` long.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_of_history)

    users = []
    utterances = []
    errors = []
    exercises = []

    countries = ["Japan", "India", "USA", "Brazil", "France", "Nigeria", "China", "Australia"]
    age_bands = ["18-24", "25-34", "35-44", "45-54", "55+"]
    proficiency_levels = ["Beginner", "Intermediate", "Advanced"]
    interests = ["anime", "technology", "cooking", "sports", "music", "travel", "movies", "literature"]

    categories = ["grammar", "vocabulary", "pronunciation", "fluency"]
    subcategories = {
        "grammar": ["subject_verb_agreement", "tense_usage", "article_usage", "preposition_usage"],
        "vocabulary": ["word_choice", "idiomatic_expressions", "collocations", "academic_vocabulary"],
        "pronunciation": ["consonant_sounds", "vowel_sounds", "word_stress", "intonation"],
        "fluency": ["speaking_speed", "pausing", "coherence", "filler_words"]
    }
    exercise_formats = ["fill-in-the-blank", "multiple-choice", "audio-recording", "sentence-formation"]

    for _ in range(num_users):
        user_id = str(uuid.uuid4())
        user = {
            "user_id": user_id,
            "country": random.choice(countries),
            "age_band": random.choice(age_bands),
            "proficiency_level": random.choice(proficiency_levels),
            "interests": random.sample(interests, random.randint(2, 5))
        }
        users.append(user)

        # Generate utterances for this user; the index is embedded in the text.
        for utterance_index in range(random.randint(10, 50)):
            utterance_id = str(uuid.uuid4())
            timestamp = random_timestamp(start_date, end_date)
            utterance = {
                "utterance_id": utterance_id,
                "user_id": user_id,
                "timestamp": timestamp,
                "text": f"Sample text for utterance {utterance_index}",
                "audio_file": f"audio_{utterance_id}.wav"
            }
            utterances.append(utterance)

            # Generate errors for this utterance; they share its timestamp.
            for category in categories:
                if random.random() < 0.3:  # 30% chance of error in each category
                    subcategory = random.choice(subcategories[category])
                    error = {
                        "error_id": str(uuid.uuid4()),
                        "utterance_id": utterance_id,
                        "user_id": user_id,
                        "timestamp": timestamp,
                        "category": category,
                        "subcategory": subcategory,
                        "severity": random.uniform(0.3, 1.0)
                    }
                    errors.append(error)

        # Generate exercises for this user, themed on one of the user's interests.
        for _ in range(random.randint(5, 30)):
            category = random.choice(categories)
            subcategory = random.choice(subcategories[category])
            exercise = {
                "exercise_id": str(uuid.uuid4()),
                "user_id": user_id,
                "timestamp": random_timestamp(start_date, end_date),
                "category": category,
                "subcategory": subcategory,
                "format": random.choice(exercise_formats),
                "content_theme": random.choice(user["interests"]),
                "difficulty": random.uniform(0.1, 1.0),
                "user_performance": random.uniform(0.5, 1.0)
            }
            exercises.append(exercise)

    return {
        "users": users,
        "utterances": utterances,
        "errors": errors,
        "exercises": exercises
    }

# Build the four tables in memory.
dataset = generate_granular_dataset()

# Write every table to "<table name>.json" in the working directory.
for key in dataset:
    with open(f'{key}.json', 'w') as f:
        json.dump(dataset[key], f, indent=2)

print("Dataset generated and saved to separate JSON files:")
print("users.json, utterances.json, errors.json, exercises.json")

# Show the first record of each table under its own heading.
sample_sections = (
    ("\nSample User:", "users"),
    ("\nSample Utterance:", "utterances"),
    ("\nSample Error:", "errors"),
    ("\nSample Exercise:", "exercises"),
)
for heading, table_name in sample_sections:
    print(heading)
    print(json.dumps(dataset[table_name][0], indent=2))

# Convert to CSV: re-read each JSON file with pandas and write it next to it.
csv_conversions = (
    ('users.json', 'users.csv'),
    ('utterances.json', 'utterances.csv'),
    ('errors.json', 'errors.csv'),
    ('exercises.json', 'exercises.csv'),
)
for json_path, csv_path in csv_conversions:
    df = pd.read_json(json_path)
    df.to_csv(csv_path, index=False)



Dataset generated and saved to separate JSON files:
users.json, utterances.json, errors.json, exercises.json

Sample User:
{
  "user_id": "4c67606a-5443-4ed9-9257-37aa4082de07",
  "country": "Japan",
  "age_band": "35-44",
  "proficiency_level": "Intermediate",
  "interests": [
    "anime",
    "sports",
    "music",
    "cooking",
    "literature"
  ]
}

Sample Utterance:
{
  "utterance_id": "054e3c91-d6ce-4083-bac9-e90c6f87876a",
  "user_id": "4c67606a-5443-4ed9-9257-37aa4082de07",
  "timestamp": "2024-09-26T06:30:04.585674",
  "text": "Sample text for utterance 0",
  "audio_file": "audio_054e3c91-d6ce-4083-bac9-e90c6f87876a.wav"
}

Sample Error:
{
  "error_id": "2ac32f5d-1d43-4b49-9ca5-67c9cee95e59",
  "utterance_id": "054e3c91-d6ce-4083-bac9-e90c6f87876a",
  "user_id": "4c67606a-5443-4ed9-9257-37aa4082de07",
  "timestamp": "2024-09-26T06:30:04.585674",
  "category": "fluency",
  "subcategory": "coherence",
  "severity": 0.46422879981624215
}

Sample Exercise:
{
  "exercise_id": "a2

In [12]:
import random
from datetime import datetime, timedelta
import uuid
import json

def random_timestamp(start, end):
    """Pick a random moment between ``start`` and ``end`` and return it as an ISO string.

    The moment is ``start`` plus a whole number of seconds chosen by
    ``random.randint`` from 0 up to the truncated length of the window, so
    either boundary may be returned.
    """
    span = end - start
    max_offset = int(span.total_seconds())
    chosen_offset = random.randint(0, max_offset)
    return (start + timedelta(seconds=chosen_offset)).isoformat()

def generate_comprehensive_dataset(num_users=100, days_of_history=30):
    """Build a synthetic language-learning dataset as five lists of plain dicts.

    For every user this creates an extended profile (demographics plus
    activity and engagement figures), 10-50 scored utterances (each with an
    independent 30% chance of one error per category), 5-30 exercises and a
    single progress snapshot stamped with the end of the window. Timestamps
    fall inside the window that ends "now" and starts ``days_of_history``
    days earlier. Every figure is drawn independently at random, so related
    fields (for example ``grammar_error_count`` and ``grammar_error_types``)
    are not guaranteed to agree with each other.

    Args:
        num_users: Number of user profiles to generate.
        days_of_history: Length, in days, of the window timestamps are drawn
            from; also scales the upper bounds of the per-user activity counts.

    Returns:
        dict with the keys ``"users"``, ``"utterances"``, ``"errors"``,
        ``"exercises"`` and ``"user_progress"``, each mapping to a list of
        JSON-serialisable dicts linked through ``user_id`` / ``utterance_id``.
    """
    # Read the clock once so the window is exactly ``days_of_history`` long.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_of_history)

    users = []
    utterances = []
    errors = []
    exercises = []
    user_progress = []

    countries = ["Japan", "India", "USA", "Brazil", "France", "Nigeria", "China", "Australia"]
    age_bands = ["18-24", "25-34", "35-44", "45-54", "55+"]
    proficiency_levels = ["Beginner", "Intermediate", "Advanced"]
    interests = ["anime", "technology", "cooking", "sports", "music", "travel", "movies", "literature"]

    categories = ["grammar", "vocabulary", "pronunciation", "fluency"]
    subcategories = {
        "grammar": ["subject_verb_agreement", "tense_usage", "article_usage", "preposition_usage"],
        "vocabulary": ["word_choice", "idiomatic_expressions", "collocations", "academic_vocabulary"],
        "pronunciation": ["consonant_sounds", "vowel_sounds", "word_stress", "intonation"],
        "fluency": ["speaking_speed", "pausing", "coherence", "filler_words"]
    }
    exercise_formats = ["fill-in-the-blank", "multiple-choice", "audio-recording", "sentence-formation"]
    cefr_levels = ["A1", "A2", "B1", "B2", "C1", "C2"]

    for _ in range(num_users):
        user_id = str(uuid.uuid4())
        user = {
            "user_id": user_id,
            "country": random.choice(countries),
            "age_band": random.choice(age_bands),
            "proficiency_level": random.choice(proficiency_levels),
            "interests": random.sample(interests, random.randint(2, 5)),
            "cefr_level": random.choice(cefr_levels),
            "overall_progress": random.uniform(0, 1),
            "total_practice_time": random.randint(0, 100 * days_of_history),
            "exercises_completed": random.randint(0, 20 * days_of_history),
            "daily_active_days": random.randint(0, days_of_history),
            "weekly_active_days": random.randint(0, min(7, days_of_history)),
            "total_sessions": random.randint(0, 3 * days_of_history),
            "avg_session_duration": random.randint(5, 30),
            "current_focus": random.choice(categories),
            "adaptive_difficulty": random.uniform(0.1, 1.0),
            "personalized_goals": random.sample(categories, random.randint(1, 3)),
            "engagement_score": random.uniform(0, 1)
        }
        users.append(user)

        # Generate utterances for this user; the index is embedded in the text.
        for utterance_index in range(random.randint(10, 50)):
            utterance_id = str(uuid.uuid4())
            timestamp = random_timestamp(start_date, end_date)
            utterance = {
                "utterance_id": utterance_id,
                "user_id": user_id,
                "timestamp": timestamp,
                "uttered_text": f"Sample text for utterance {utterance_index}",
                "audio_file": f"audio_{utterance_id}.wav",
                "grammar_overall_score": random.uniform(0, 1),
                "grammar_error_count": random.randint(0, 5),
                "grammar_error_types": random.sample(subcategories["grammar"], random.randint(0, 2)),
                "vocabulary_overall_score": random.uniform(0, 1),
                "unique_words_count": random.randint(5, 20),
                "advanced_words_count": random.randint(0, 5),
                "lexical_density": random.uniform(0.4, 0.8),
                "pronunciation_overall_score": random.uniform(0, 1),
                "phoneme_error_count": random.randint(0, 5),
                "intonation_score": random.uniform(0, 1),
                "rhythm_score": random.uniform(0, 1),
                "accent_similarity": random.uniform(0, 1),
                "fluency_overall_score": random.uniform(0, 1),
                "words_per_minute": random.randint(60, 180),
                "filler_word_count": random.randint(0, 10),
                "pause_count": random.randint(0, 5),
                "avg_pause_duration": random.uniform(0.2, 2.0),
                "longest_fluent_segment": random.randint(5, 30),
                "coherence_score": random.uniform(0, 1),
                "topic_relevance_score": random.uniform(0, 1)
            }
            utterances.append(utterance)

            # Generate errors for this utterance; they share its timestamp.
            for category in categories:
                if random.random() < 0.3:  # 30% chance of error in each category
                    subcategory = random.choice(subcategories[category])
                    error = {
                        "error_id": str(uuid.uuid4()),
                        "utterance_id": utterance_id,
                        "user_id": user_id,
                        "timestamp": timestamp,
                        "category": category,
                        "subcategory": subcategory,
                        "severity": random.uniform(0.3, 1.0)
                    }
                    errors.append(error)

        # Generate exercises for this user, themed on one of the user's interests.
        for _ in range(random.randint(5, 30)):
            exercise_id = str(uuid.uuid4())
            category = random.choice(categories)
            subcategory = random.choice(subcategories[category])
            exercise = {
                "exercise_id": exercise_id,
                "user_id": user_id,
                "timestamp": random_timestamp(start_date, end_date),
                "category": category,
                "subcategory": subcategory,
                "format": random.choice(exercise_formats),
                "content_theme": random.choice(user["interests"]),
                "difficulty": random.uniform(0.1, 1.0),
                "performance_score": random.uniform(0.5, 1.0),
                "time_taken": random.randint(30, 300)
            }
            exercises.append(exercise)

        # Generate user progress: one snapshot per user, stamped with the window end.
        progress = {
            "user_id": user_id,
            "timestamp": end_date.isoformat(),
            "recent_grammar_errors": random.sample(subcategories["grammar"], random.randint(0, 2)),
            "vocab_improvement": random.uniform(-0.1, 0.2),
            "fluency_trend": random.uniform(-0.1, 0.2),
            "engagement_score": random.uniform(0, 1)
        }
        user_progress.append(progress)

    return {
        "users": users,
        "utterances": utterances,
        "errors": errors,
        "exercises": exercises,
        "user_progress": user_progress
    }

import os

# Generate the dataset
dataset = generate_comprehensive_dataset()

# Save the dataset to separate JSON files.
# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists first; exist_ok keeps re-runs from failing.
os.makedirs('final_data', exist_ok=True)
for key in dataset:
    with open(f'final_data/{key}.json', 'w') as f:
        json.dump(dataset[key], f, indent=2)

print("Comprehensive dataset generated and saved to separate JSON files:")
print("users.json, utterances.json, errors.json, exercises.json, user_progress.json")

# Print sample data: the first record of each table under its own heading.
sample_sections = (
    ("\nSample User:", "users"),
    ("\nSample Utterance:", "utterances"),
    ("\nSample Error:", "errors"),
    ("\nSample Exercise:", "exercises"),
    ("\nSample User Progress:", "user_progress"),
)
for heading, table_name in sample_sections:
    print(heading)
    print(json.dumps(dataset[table_name][0], indent=2))

# Convert to CSV by re-reading each JSON file with pandas.
# NOTE(review): the CSV files are written to the working directory, not to
# final_data/, and replace any same-named CSV already there - confirm this
# is the intended destination.
csv_conversions = (
    ('final_data/users.json', 'users.csv'),
    ('final_data/utterances.json', 'utterances.csv'),
    ('final_data/errors.json', 'errors.csv'),
    ('final_data/exercises.json', 'exercises.csv'),
    ('final_data/user_progress.json', 'user_progress.csv'),
)
for json_path, csv_path in csv_conversions:
    df = pd.read_json(json_path)
    df.to_csv(csv_path, index=False)



Comprehensive dataset generated and saved to separate JSON files:
users.json, utterances.json, errors.json, exercises.json, user_progress.json

Sample User:
{
  "user_id": "e0fa472b-9738-4814-b366-5e0f6f668547",
  "country": "France",
  "age_band": "55+",
  "proficiency_level": "Intermediate",
  "interests": [
    "cooking",
    "music",
    "literature",
    "anime",
    "movies"
  ],
  "cefr_level": "B1",
  "overall_progress": 0.8568946970619489,
  "total_practice_time": 143,
  "exercises_completed": 362,
  "daily_active_days": 15,
  "weekly_active_days": 6,
  "total_sessions": 76,
  "avg_session_duration": 12,
  "current_focus": "fluency",
  "adaptive_difficulty": 0.8417098175214394,
  "personalized_goals": [
    "pronunciation",
    "fluency",
    "vocabulary"
  ],
  "engagement_score": 0.7072959300474474
}

Sample Utterance:
{
  "utterance_id": "8d782f91-9708-4bb0-8bc7-479510f27f89",
  "user_id": "e0fa472b-9738-4814-b366-5e0f6f668547",
  "timestamp": "2024-09-20T21:10:40.351897",
