# Data Cleaning Workflow with DataDojo

This notebook demonstrates how to create and execute a complete data cleaning pipeline using DataDojo's guided approach.

In [None]:
import numpy as np
import pandas as pd

from datadojo import create_dojo
from datadojo.contracts.dojo_interface import Domain, Difficulty, GuidanceLevel
from datadojo.contracts.dojo_interface import OperationType

# Initialize DataDojo: `dojo` is the handle later cells use to list and start
# projects and to obtain the educational interface.
dojo = create_dojo()

## Step 1: Create Sample Data

Let's create a messy dataset that needs cleaning.

In [None]:
# Build a deliberately messy customer dataset.
# The random columns are generated in a fixed order (age, income,
# purchase_amount, category) so the seeded stream gives the same values on
# every run.
np.random.seed(42)

n_rows = 101  # 100 distinct customers plus one repeated record

# customer_id 50 is appended a second time -> duplicate
customer_ids = [*range(1, n_rows), 50]

# every 10th row has no age -> missing values
ages = [
    None if row % 10 == 0 else np.random.randint(18, 80)
    for row in range(n_rows)
]

incomes = [np.random.randint(20000, 150000) for _ in range(n_rows)]

# every 15th row has no purchase amount -> missing values
purchase_amounts = [
    None if row % 15 == 0 else np.random.uniform(10, 1000)
    for row in range(n_rows)
]

# 'Electronics' and 'electronics' both occur -> inconsistent labels
categories = np.random.choice(
    ['Electronics', 'Clothing', 'Food', 'electronics'], n_rows
)

# every 20th row holds a placeholder instead of an address -> invalid values
emails = [
    'invalid' if row % 20 == 0 else f'user{row}@email.com'
    for row in range(n_rows)
]

data = {
    'customer_id': customer_ids,
    'age': ages,
    'income': incomes,
    'purchase_amount': purchase_amounts,
    'category': categories,
    'email': emails,
}

df = pd.DataFrame(data)
print(f"Dataset shape: {df.shape}")
df.head(10)

## Step 2: Start a Data Cleaning Project

In [None]:
# Look up beginner-level e-commerce projects and start the first match.
projects = dojo.list_projects(domain=Domain.ECOMMERCE, difficulty=Difficulty.BEGINNER)

if projects:
    project = dojo.start_project(projects[0].id)
    print(f"📊 Project: {project.name}")
    print(f"Description: {project.description}")
else:
    # Bind `project` even when nothing matched, so the name always exists
    # after this cell and later cells can test it instead of hitting a
    # NameError.
    project = None
    print("Creating a custom project...")
    # In production, you'd use dojo.create_project() here

## Step 3: Create a Data Cleaning Pipeline

We'll build a pipeline with multiple processing steps.

In [None]:
# Step definitions in execution order. Every step after the first names the
# previous step as its prerequisite, so the three steps form a chain.
pipeline_steps = [
    dict(
        step_id="remove_duplicates",
        name="Remove Duplicate Records",
        operation_type=OperationType.DATA_CLEANING,
        description="Identify and remove duplicate customer records",
        learned_concepts=["duplicates"],
    ),
    dict(
        step_id="handle_missing_age",
        name="Handle Missing Age Values",
        operation_type=OperationType.DATA_CLEANING,
        description="Fill or remove missing age values",
        learned_concepts=["missing_values"],
        prerequisites=["remove_duplicates"],
    ),
    dict(
        step_id="standardize_categories",
        name="Standardize Category Names",
        operation_type=OperationType.TRANSFORMATION,
        description="Make category names consistent",
        learned_concepts=["data_quality"],
        prerequisites=["handle_missing_age"],
    ),
]

# Create a pipeline with detailed guidance (only when a project was started)
if projects:
    pipeline = project.create_pipeline(
        "customer_data_cleaning",
        guidance_level=GuidanceLevel.DETAILED
    )

    # Add steps to the pipeline
    for step in pipeline_steps:
        pipeline.add_step(**step)

    # Report the real number of registered steps rather than a hard-coded count
    print(f"✅ Pipeline created with {len(pipeline_steps)} steps")

## Step 4: Learn About the Concepts

Before processing, let's understand the concepts.

In [None]:
# The educational interface is reused by the progress-tracking cell below.
educational = dojo.get_educational_interface()

# Learn about duplicates
duplicates_concept = educational.get_concept_explanation("duplicates")
if duplicates_concept:  # nothing is printed when the lookup result is falsy
    print(f"📚 {duplicates_concept.title}\n")
    print(duplicates_concept.get_summary(max_length=200))

    # Show the first example only, and only if any examples exist
    if duplicates_concept.examples:
        print(f"\n💻 Example:\n{duplicates_concept.examples[0]}")

## Step 5: Implement the Cleaning Steps

Now let's actually clean the data, step by step.

In [None]:
# Step 1: Remove duplicates
print("Step 1: Remove Duplicates")
print(f"Before: {len(df)} rows")
# Keep the first record per customer_id. The explicit .copy() gives df_clean
# its own data, so the column assignments in the following cells do not raise
# pandas' SettingWithCopyWarning; the raw `df` stays untouched.
df_clean = df.drop_duplicates(subset=['customer_id'], keep='first').copy()
assert df_clean['customer_id'].is_unique, "duplicate customer_id values remain"
print(f"After: {len(df_clean)} rows")
print(f"Removed {len(df) - len(df_clean)} duplicate(s)\n")

In [None]:
# Step 2: Handle missing age values
print("Step 2: Handle Missing Age Values")
print(f"Missing age values: {df_clean['age'].isnull().sum()}")

# Fill with median. `.assign` builds a new frame instead of writing into the
# result of drop_duplicates, which avoids SettingWithCopyWarning and keeps the
# cell safe to re-run.
median_age = df_clean['age'].median()
df_clean = df_clean.assign(age=df_clean['age'].fillna(median_age))
assert df_clean['age'].notna().all(), "age still contains missing values"
print(f"Filled with median age: {median_age}")
print(f"Missing values after: {df_clean['age'].isnull().sum()}\n")

In [None]:
# Step 3: Standardize categories
print("Step 3: Standardize Category Names")
print(f"Categories before: {df_clean['category'].unique()}")

# Capitalize the first letter and lower-case the rest. `str.capitalize`
# already lower-cases the remaining characters, so no separate `.str.lower()`
# is needed. `.assign` returns a new frame, avoiding SettingWithCopyWarning.
df_clean = df_clean.assign(category=df_clean['category'].str.capitalize())
print(f"Categories after: {df_clean['category'].unique()}\n")

## Step 6: Validate the Results

In [None]:
# Summarise the cleaned frame in a short text report, then preview it.
row_count = len(df_clean)
column_count = len(df_clean.columns)
missing_total = df_clean.isnull().sum().sum()  # nulls across all columns
duplicate_total = df_clean.duplicated().sum()  # fully identical rows

report_lines = [
    "Data Quality Report:",
    f"  Total rows: {row_count}",
    f"  Total columns: {column_count}",
    f"  Missing values: {missing_total}",
    f"  Duplicate rows: {duplicate_total}",
    "\nCleaned data preview:",
]
print("\n".join(report_lines))
df_clean.head()

## Step 7: Track Your Progress

DataDojo tracks your completed steps and learned concepts.

In [None]:
# Fetch the progress record for this learner. When no project was started,
# a placeholder project id is used instead.
project_id = project.id if projects else "custom-project"
progress = educational.get_progress("student-1", project_id)

# Record the three pipeline steps as completed, in pipeline order
for step_id in ("remove_duplicates", "handle_missing_age", "standardize_categories"):
    progress.complete_step(step_id)

# Record the concept attached to each of those steps as learned
for concept in ("duplicates", "missing_values", "data_quality"):
    progress.learn_concept(concept)

# Update skill scores
progress.update_skill_score("data_cleaning", 85.0)

completed_count = len(progress.completed_steps)
learned_count = len(progress.learned_concepts)
average_score = progress.get_average_skill_score()

print("Progress Summary:")
print(f"  Completed steps: {completed_count}")
print(f"  Learned concepts: {learned_count}")
print(f"  Average skill score: {average_score:.1f}%")

## Summary

In this notebook, you learned how to:

- ✅ Create a data cleaning pipeline
- ✅ Handle common data quality issues
- ✅ Use educational concepts to understand the process
- ✅ Track your learning progress

Next steps:
- Try the **03_progress_tracking.ipynb** notebook to visualize your learning journey
- Explore **04_custom_pipelines.ipynb** to build more advanced pipelines