In [1]:
import json

response ={
  "id": "msg_01RKSsKxVUKjhtjZd1VhvfgN",
  "type": "message",
  "role": "assistant",
  "model": "claude-opus-4-5-20251101",
  "content": [
    {
      "type": "text",
      "text": "# MLflow Dataset Versioning and Tracking Lab\n\n```python\n# %% [markdown]\n# # üß™ MLflow Dataset Versioning and Tracking Lab\n# \n# ## Learning Objectives\n# By the end of this lab, you will be able to:\n# - Understand MLflow's dataset tracking capabilities\n# - Version different types of datasets (CSV, Images, JSON, Parquet)\n# - Track dataset metadata and lineage\n# - Compare dataset versions across experiments\n# \n# ## Prerequisites\n# - Python 3.8+\n# - MLflow installed\n# - Basic understanding of machine learning workflows\n\n# %% [markdown]\n# ## üì¶ Step 1: Environment Setup\n# \n# First, let's install and import all necessary libraries.\n# Run the cell below to set up your environment.\n\n# %%\n# Install required packages (uncomment if needed)\n# !pip install mlflow pandas numpy scikit-learn pillow pyarrow boto3\n\n# %%\n# Import required libraries\nimport mlflow\nfrom mlflow.data.pandas_dataset import PandasDataset\nfrom mlflow.data.numpy_dataset import NumpyDataset\nimport mlflow.data as mlflow_data\n\nimport pandas as pd\nimport numpy as np\nimport json\nimport os\nimport shutil\nfrom datetime import datetime\nfrom PIL import Image\nimport hashlib\nfrom sklearn.datasets import load_iris, load_wine, fetch_california_housing\nfrom sklearn.model_selection import train_test_split\nimport warnings\nwarnings.filterwarnings('ignore')\n\nprint(\"‚úÖ All libraries imported successfully!\")\nprint(f\"MLflow version: {mlflow.__version__}\")\n\n# %% [markdown]\n# ## üîß Step 2: MLflow Configuration\n# \n# Configure MLflow to connect to your tracking server.\n# \n# **Important:** Make sure your MLflow server is running at the specified URI.\n# \n# To start a local MLflow server, run in terminal:\n# ```bash\n# mlflow server --host 0.0.0.0 --port 5000\n# ```\n\n# %%\n# MLflow Configuration\nMLFLOW_TRACKING_URI = \"http://localhost:5000\"\n\n# Set the tracking URI\nmlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n\n# Verify connection\ntry:\n    client 
= mlflow.tracking.MlflowClient()\n    experiments = client.search_experiments()\n    print(f\"‚úÖ Successfully connected to MLflow at: {MLFLOW_TRACKING_URI}\")\n    print(f\"üìä Number of existing experiments: {len(experiments)}\")\nexcept Exception as e:\n    print(f\"‚ùå Failed to connect to MLflow: {e}\")\n    print(\"üí° Tip: Make sure MLflow server is running!\")\n\n# %% [markdown]\n# ## üìÅ Step 3: Create Working Directories\n# \n# We'll create directories to store our sample datasets.\n\n# %%\n# Create directories for datasets\nDATASET_DIR = \"./mlflow_datasets_lab\"\nCSV_DIR = os.path.join(DATASET_DIR, \"csv\")\nIMAGE_DIR = os.path.join(DATASET_DIR, \"images\")\nJSON_DIR = os.path.join(DATASET_DIR, \"json\")\nPARQUET_DIR = os.path.join(DATASET_DIR, \"parquet\")\n\n# Create all directories\nfor directory in [DATASET_DIR, CSV_DIR, IMAGE_DIR, JSON_DIR, PARQUET_DIR]:\n    os.makedirs(directory, exist_ok=True)\n    print(f\"üìÇ Created directory: {directory}\")\n\nprint(\"\\n‚úÖ All directories created successfully!\")\n\n# %% [markdown]\n# ---\n# # üìä PART 1: CSV Dataset Versioning\n# ---\n# \n# ## Step 4: Create and Version CSV Datasets\n# \n# In this section, we'll learn how to:\n# 1. Create sample CSV datasets\n# 2. Track them with MLflow\n# 3. Create multiple versions\n# 4. 
Compare versions\n\n# %% [markdown]\n# ### 4.1 Create Sample CSV Datasets\n# \n# We'll create a customer dataset with multiple versions simulating\n# data updates over time.\n\n# %%\ndef create_customer_dataset_v1():\n    \"\"\"\n    Create Version 1 of customer dataset.\n    This represents our initial data collection.\n    \"\"\"\n    np.random.seed(42)\n    n_samples = 1000\n    \n    data = {\n        'customer_id': range(1, n_samples + 1),\n        'age': np.random.randint(18, 70, n_samples),\n        'income': np.random.normal(50000, 15000, n_samples).astype(int),\n        'spending_score': np.random.randint(1, 100, n_samples),\n        'membership_years': np.random.randint(0, 10, n_samples),\n        'churn': np.random.choice([0, 1], n_samples, p=[0.8, 0.2])\n    }\n    \n    df = pd.DataFrame(data)\n    return df\n\ndef create_customer_dataset_v2():\n    \"\"\"\n    Create Version 2 of customer dataset.\n    Simulates data update with:\n    - More samples\n    - New feature (region)\n    - Updated values\n    \"\"\"\n    np.random.seed(43)\n    n_samples = 1500  # More data collected\n    \n    data = {\n        'customer_id': range(1, n_samples + 1),\n        'age': np.random.randint(18, 75, n_samples),\n        'income': np.random.normal(55000, 18000, n_samples).astype(int),\n        'spending_score': np.random.randint(1, 100, n_samples),\n        'membership_years': np.random.randint(0, 15, n_samples),\n        'region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),\n        'churn': np.random.choice([0, 1], n_samples, p=[0.75, 0.25])\n    }\n    \n    df = pd.DataFrame(data)\n    return df\n\n# Create datasets\ndf_v1 = create_customer_dataset_v1()\ndf_v2 = create_customer_dataset_v2()\n\n# Save to CSV files\ncsv_path_v1 = os.path.join(CSV_DIR, \"customers_v1.csv\")\ncsv_path_v2 = os.path.join(CSV_DIR, \"customers_v2.csv\")\n\ndf_v1.to_csv(csv_path_v1, index=False)\ndf_v2.to_csv(csv_path_v2, index=False)\n\nprint(\"üìä Customer 
Dataset Version 1:\")\nprint(f\"   Shape: {df_v1.shape}\")\nprint(f\"   Columns: {list(df_v1.columns)}\")\nprint(f\"   Saved to: {csv_path_v1}\")\n\nprint(\"\\nüìä Customer Dataset Version 2:\")\nprint(f\"   Shape: {df_v2.shape}\")\nprint(f\"   Columns: {list(df_v2.columns)}\")\nprint(f\"   Saved to: {csv_path_v2}\")\n\n# %% [markdown]\n# ### 4.2 Track CSV Dataset with MLflow\n# \n# Now let's create an MLflow experiment and log our CSV datasets\n# with proper versioning and metadata.\n\n# %%\ndef calculate_file_hash(filepath):\n    \"\"\"Calculate MD5 hash of a file for versioning.\"\"\"\n    hash_md5 = hashlib.md5()\n    with open(filepath, \"rb\") as f:\n        for chunk in iter(lambda: f.read(4096), b\"\"):\n            hash_md5.update(chunk)\n    return hash_md5.hexdigest()\n\ndef get_dataset_stats(df):\n    \"\"\"Generate statistics for a DataFrame.\"\"\"\n    stats = {\n        \"num_rows\": len(df),\n        \"num_columns\": len(df.columns),\n        \"columns\": list(df.columns),\n        \"dtypes\": {col: str(dtype) for col, dtype in df.dtypes.items()},\n        \"missing_values\": df.isnull().sum().to_dict(),\n        \"memory_usage_mb\": df.memory_usage(deep=True).sum() / (1024 * 1024)\n    }\n    \n    # Add numeric column statistics\n    numeric_cols = df.select_dtypes(include=[np.number]).columns\n    if len(numeric_cols) > 0:\n        stats[\"numeric_stats\"] = df[numeric_cols].describe().to_dict()\n    \n    return stats\n\nprint(\"‚úÖ Helper functions created!\")\n\n# %%\n# Create experiment for CSV datasets\nEXPERIMENT_NAME_CSV = \"CSV_Dataset_Versioning_Lab\"\n\n# Set or create experiment\nmlflow.set_experiment(EXPERIMENT_NAME_CSV)\nexperiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME_CSV)\nprint(f\"üìÅ Experiment: {EXPERIMENT_NAME_CSV}\")\nprint(f\"   Experiment ID: {experiment.experiment_id}\")\n\n# %%\n# Log CSV Dataset Version 1\nprint(\"üîÑ Logging CSV Dataset Version 1...\")\n\nwith 
mlflow.start_run(run_name=\"customer_dataset_v1\") as run:\n    # Load the dataset for MLflow\n    df = pd.read_csv(csv_path_v1)\n    \n    # Create MLflow dataset\n    dataset = mlflow.data.from_pandas(\n        df, \n        source=csv_path_v1,\n        name=\"customer_churn_dataset\",\n        targets=\"churn\"\n    )\n    \n    # Log the dataset\n    mlflow.log_input(dataset, context=\"training\")\n    \n    # Log dataset metadata as parameters\n    mlflow.log_param(\"dataset_version\", \"1.0.0\")\n    mlflow.log_param(\"dataset_name\", \"customer_churn\")\n    mlflow.log_param(\"data_source\", \"synthetic\")\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    \n    # Log dataset statistics as metrics\n    stats = get_dataset_stats(df)\n    mlflow.log_metric(\"num_rows\", stats[\"num_rows\"])\n    mlflow.log_metric(\"num_columns\", stats[\"num_columns\"])\n    mlflow.log_metric(\"memory_mb\", stats[\"memory_usage_mb\"])\n    mlflow.log_metric(\"missing_total\", sum(stats[\"missing_values\"].values()))\n    mlflow.log_metric(\"churn_rate\", df['churn'].mean())\n    \n    # Log the actual CSV file as artifact\n    mlflow.log_artifact(csv_path_v1, artifact_path=\"datasets\")\n    \n    # Log schema information\n    schema_info = {\n        \"columns\": stats[\"columns\"],\n        \"dtypes\": stats[\"dtypes\"],\n        \"version\": \"1.0.0\"\n    }\n    schema_path = os.path.join(CSV_DIR, \"schema_v1.json\")\n    with open(schema_path, 'w') as f:\n        json.dump(schema_info, f, indent=2)\n    mlflow.log_artifact(schema_path, artifact_path=\"schema\")\n    \n    # Log file hash for integrity\n    file_hash = calculate_file_hash(csv_path_v1)\n    mlflow.log_param(\"file_hash_md5\", file_hash)\n    \n    run_id_v1 = run.info.run_id\n    print(f\"‚úÖ Dataset v1 logged successfully!\")\n    print(f\"   Run ID: {run_id_v1}\")\n    print(f\"   File Hash: {file_hash}\")\n\n# %%\n# Log CSV Dataset Version 2\nprint(\"üîÑ Logging CSV Dataset 
Version 2...\")\n\nwith mlflow.start_run(run_name=\"customer_dataset_v2\") as run:\n    # Load the dataset\n    df = pd.read_csv(csv_path_v2)\n    \n    # Create MLflow dataset\n    dataset = mlflow.data.from_pandas(\n        df, \n        source=csv_path_v2,\n        name=\"customer_churn_dataset\",\n        targets=\"churn\"\n    )\n    \n    # Log the dataset\n    mlflow.log_input(dataset, context=\"training\")\n    \n    # Log dataset metadata\n    mlflow.log_param(\"dataset_version\", \"2.0.0\")\n    mlflow.log_param(\"dataset_name\", \"customer_churn\")\n    mlflow.log_param(\"data_source\", \"synthetic\")\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    mlflow.log_param(\"changes_from_v1\", \"Added region column, more samples\")\n    \n    # Log statistics\n    stats = get_dataset_stats(df)\n    mlflow.log_metric(\"num_rows\", stats[\"num_rows\"])\n    mlflow.log_metric(\"num_columns\", stats[\"num_columns\"])\n    mlflow.log_metric(\"memory_mb\", stats[\"memory_usage_mb\"])\n    mlflow.log_metric(\"missing_total\", sum(stats[\"missing_values\"].values()))\n    mlflow.log_metric(\"churn_rate\", df['churn'].mean())\n    \n    # Log artifact\n    mlflow.log_artifact(csv_path_v2, artifact_path=\"datasets\")\n    \n    # Log schema\n    schema_info = {\n        \"columns\": stats[\"columns\"],\n        \"dtypes\": stats[\"dtypes\"],\n        \"version\": \"2.0.0\"\n    }\n    schema_path = os.path.join(CSV_DIR, \"schema_v2.json\")\n    with open(schema_path, 'w') as f:\n        json.dump(schema_info, f, indent=2)\n    mlflow.log_artifact(schema_path, artifact_path=\"schema\")\n    \n    file_hash = calculate_file_hash(csv_path_v2)\n    mlflow.log_param(\"file_hash_md5\", file_hash)\n    \n    run_id_v2 = run.info.run_id\n    print(f\"‚úÖ Dataset v2 logged successfully!\")\n    print(f\"   Run ID: {run_id_v2}\")\n    print(f\"   File Hash: {file_hash}\")\n\n# %% [markdown]\n# ### 4.3 Compare CSV Dataset Versions\n# \n# Let's retrieve and 
compare the two versions we logged.\n\n# %%\n# Compare dataset versions\nprint(\"üìä Comparing Dataset Versions:\")\nprint(\"=\" * 60)\n\nclient = mlflow.tracking.MlflowClient()\n\n# Get runs from the experiment\nruns = client.search_runs(\n    experiment_ids=[experiment.experiment_id],\n    order_by=[\"start_time DESC\"]\n)\n\ncomparison_data = []\nfor run in runs:\n    if \"customer_dataset\" in run.info.run_name:\n        version_info = {\n            \"Run Name\": run.info.run_name,\n            \"Version\": run.data.params.get(\"dataset_version\", \"N/A\"),\n            \"Rows\": run.data.metrics.get(\"num_rows\", \"N/A\"),\n            \"Columns\": run.data.metrics.get(\"num_columns\", \"N/A\"),\n            \"Churn Rate\": f\"{run.data.metrics.get('churn_rate', 0):.2%}\",\n            \"Memory (MB)\": f\"{run.data.metrics.get('memory_mb', 0):.2f}\"\n        }\n        comparison_data.append(version_info)\n\ncomparison_df = pd.DataFrame(comparison_data)\nprint(comparison_df.to_string(index=False))\n\n# %% [markdown]\n# ---\n# # üñºÔ∏è PART 2: Image Dataset Versioning\n# ---\n# \n# ## Step 5: Create and Version Image Datasets\n# \n# In this section, we'll learn how to:\n# 1. Generate synthetic image datasets\n# 2. Track image datasets with metadata\n# 3. 
Log image statistics and samples\n\n# %% [markdown]\n# ### 5.1 Create Synthetic Image Dataset\n# \n# We'll create a simple image classification dataset with\n# geometric shapes (circles, squares, triangles).\n\n# %%\ndef create_shape_image(shape, size=64, color=None):\n    \"\"\"Create an image with a specific shape.\"\"\"\n    if color is None:\n        color = tuple(np.random.randint(100, 255, 3))\n    \n    # Create a white background\n    img = Image.new('RGB', (size, size), color=(255, 255, 255))\n    pixels = img.load()\n    \n    center = size // 2\n    radius = size // 3\n    \n    if shape == 'circle':\n        for x in range(size):\n            for y in range(size):\n                if (x - center) ** 2 + (y - center) ** 2 <= radius ** 2:\n                    pixels[x, y] = color\n                    \n    elif shape == 'square':\n        for x in range(center - radius, center + radius):\n            for y in range(center - radius, center + radius):\n                if 0 <= x < size and 0 <= y < size:\n                    pixels[x, y] = color\n                    \n    elif shape == 'triangle':\n        for y in range(center - radius, center + radius):\n            width = int((y - (center - radius)) * radius / radius)\n            for x in range(center - width, center + width):\n                if 0 <= x < size and 0 <= y < size:\n                    pixels[x, y] = color\n    \n    return img\n\ndef create_image_dataset(version, num_per_class=50):\n    \"\"\"Create a complete image dataset with multiple classes.\"\"\"\n    shapes = ['circle', 'square', 'triangle']\n    dataset_info = {\n        'version': version,\n        'classes': shapes,\n        'images_per_class': num_per_class,\n        'total_images': num_per_class * len(shapes),\n        'image_size': 64,\n        'files': []\n    }\n    \n    version_dir = os.path.join(IMAGE_DIR, f\"v{version}\")\n    os.makedirs(version_dir, exist_ok=True)\n    \n    for shape in shapes:\n        class_dir = 
os.path.join(version_dir, shape)\n        os.makedirs(class_dir, exist_ok=True)\n        \n        for i in range(num_per_class):\n            img = create_shape_image(shape)\n            filename = f\"{shape}_{i:04d}.png\"\n            filepath = os.path.join(class_dir, filename)\n            img.save(filepath)\n            dataset_info['files'].append({\n                'path': filepath,\n                'class': shape,\n                'filename': filename\n            })\n    \n    return dataset_info, version_dir\n\nprint(\"‚úÖ Image generation functions created!\")\n\n# %%\n# Create Image Dataset Version 1 (smaller dataset)\nprint(\"üñºÔ∏è Creating Image Dataset Version 1...\")\nimage_info_v1, image_dir_v1 = create_image_dataset(version=1, num_per_class=30)\nprint(f\"   Total images: {image_info_v1['total_images']}\")\nprint(f\"   Classes: {image_info_v1['classes']}\")\nprint(f\"   Location: {image_dir_v1}\")\n\n# Create Image Dataset Version 2 (larger dataset)\nprint(\"\\nüñºÔ∏è Creating Image Dataset Version 2...\")\nimage_info_v2, image_dir_v2 = create_image_dataset(version=2, num_per_class=50)\nprint(f\"   Total images: {image_info_v2['total_images']}\")\nprint(f\"   Classes: {image_info_v2['classes']}\")\nprint(f\"   Location: {image_dir_v2}\")\n\n# %% [markdown]\n# ### 5.2 Track Image Dataset with MLflow\n# \n# Now let's log our image datasets with proper tracking and metadata.\n\n# %%\ndef get_image_dataset_stats(dataset_dir):\n    \"\"\"Calculate statistics for an image dataset.\"\"\"\n    stats = {\n        'total_images': 0,\n        'total_size_mb': 0,\n        'classes': {},\n        'image_dimensions': set()\n    }\n    \n    for class_name in os.listdir(dataset_dir):\n        class_path = os.path.join(dataset_dir, class_name)\n        if os.path.isdir(class_path):\n            class_images = [f for f in os.listdir(class_path) if f.endswith(('.png', '.jpg', '.jpeg'))]\n            stats['classes'][class_name] = len(class_images)\n            
stats['total_images'] += len(class_images)\n            \n            for img_file in class_images[:5]:  # Check first 5 images\n                img_path = os.path.join(class_path, img_file)\n                stats['total_size_mb'] += os.path.getsize(img_path) / (1024 * 1024)\n                \n                with Image.open(img_path) as img:\n                    stats['image_dimensions'].add(img.size)\n    \n    stats['image_dimensions'] = list(stats['image_dimensions'])\n    return stats\n\n# Create experiment for image datasets\nEXPERIMENT_NAME_IMAGES = \"Image_Dataset_Versioning_Lab\"\nmlflow.set_experiment(EXPERIMENT_NAME_IMAGES)\n\nprint(f\"üìÅ Experiment: {EXPERIMENT_NAME_IMAGES}\")\n\n# %%\n# Log Image Dataset Version 1\nprint(\"üîÑ Logging Image Dataset Version 1...\")\n\nwith mlflow.start_run(run_name=\"shapes_dataset_v1\") as run:\n    # Calculate statistics\n    stats = get_image_dataset_stats(image_dir_v1)\n    \n    # Log parameters\n    mlflow.log_param(\"dataset_version\", \"1.0.0\")\n    mlflow.log_param(\"dataset_name\", \"geometric_shapes\")\n    mlflow.log_param(\"image_format\", \"PNG\")\n    mlflow.log_param(\"image_size\", \"64x64\")\n    mlflow.log_param(\"num_classes\", len(stats['classes']))\n    mlflow.log_param(\"class_names\", list(stats['classes'].keys()))\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    \n    # Log metrics\n    mlflow.log_metric(\"total_images\", stats['total_images'])\n    mlflow.log_metric(\"total_size_mb\", stats['total_size_mb'])\n    \n    for class_name, count in stats['classes'].items():\n        mlflow.log_metric(f\"class_{class_name}_count\", count)\n    \n    # Log sample images from each class\n    for class_name in stats['classes'].keys():\n        class_path = os.path.join(image_dir_v1, class_name)\n        sample_images = os.listdir(class_path)[:3]  # Log 3 samples per class\n        \n        for img_name in sample_images:\n            img_path = os.path.join(class_path, 
img_name)\n            mlflow.log_artifact(img_path, artifact_path=f\"samples/{class_name}\")\n    \n    # Log dataset info as JSON\n    info_path = os.path.join(image_dir_v1, \"dataset_info.json\")\n    with open(info_path, 'w') as f:\n        json.dump({\n            **image_info_v1,\n            'stats': {k: v for k, v in stats.items() if k != 'image_dimensions'}\n        }, f, indent=2, default=str)\n    mlflow.log_artifact(info_path, artifact_path=\"metadata\")\n    \n    run_id_img_v1 = run.info.run_id\n    print(f\"‚úÖ Image Dataset v1 logged!\")\n    print(f\"   Run ID: {run_id_img_v1}\")\n    print(f\"   Total Images: {stats['total_images']}\")\n\n# %%\n# Log Image Dataset Version 2\nprint(\"üîÑ Logging Image Dataset Version 2...\")\n\nwith mlflow.start_run(run_name=\"shapes_dataset_v2\") as run:\n    # Calculate statistics\n    stats = get_image_dataset_stats(image_dir_v2)\n    \n    # Log parameters\n    mlflow.log_param(\"dataset_version\", \"2.0.0\")\n    mlflow.log_param(\"dataset_name\", \"geometric_shapes\")\n    mlflow.log_param(\"image_format\", \"PNG\")\n    mlflow.log_param(\"image_size\", \"64x64\")\n    mlflow.log_param(\"num_classes\", len(stats['classes']))\n    mlflow.log_param(\"class_names\", list(stats['classes'].keys()))\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    mlflow.log_param(\"changes_from_v1\", \"Increased samples per class\")\n    \n    # Log metrics\n    mlflow.log_metric(\"total_images\", stats['total_images'])\n    mlflow.log_metric(\"total_size_mb\", stats['total_size_mb'])\n    \n    for class_name, count in stats['classes'].items():\n        mlflow.log_metric(f\"class_{class_name}_count\", count)\n    \n    # Log sample images\n    for class_name in stats['classes'].keys():\n        class_path = os.path.join(image_dir_v2, class_name)\n        sample_images = os.listdir(class_path)[:3]\n        \n        for img_name in sample_images:\n            img_path = os.path.join(class_path, 
img_name)\n            mlflow.log_artifact(img_path, artifact_path=f\"samples/{class_name}\")\n    \n    # Log dataset info\n    info_path = os.path.join(image_dir_v2, \"dataset_info.json\")\n    with open(info_path, 'w') as f:\n        json.dump({\n            **image_info_v2,\n            'stats': {k: v for k, v in stats.items() if k != 'image_dimensions'}\n        }, f, indent=2, default=str)\n    mlflow.log_artifact(info_path, artifact_path=\"metadata\")\n    \n    run_id_img_v2 = run.info.run_id\n    print(f\"‚úÖ Image Dataset v2 logged!\")\n    print(f\"   Run ID: {run_id_img_v2}\")\n    print(f\"   Total Images: {stats['total_images']}\")\n\n# %% [markdown]\n# ---\n# # üìã PART 3: JSON Dataset Versioning\n# ---\n# \n# ## Step 6: Create and Version JSON Datasets\n# \n# JSON datasets are common for NLP tasks, configuration data,\n# and semi-structured data. Let's learn how to version them.\n\n# %% [markdown]\n# ### 6.1 Create Sample JSON Datasets\n\n# %%\ndef create_text_classification_dataset(version, num_samples=100):\n    \"\"\"Create a text classification dataset in JSON format.\"\"\"\n    \n    categories = ['sports', 'technology', 'politics', 'entertainment']\n    \n    sample_texts = {\n        'sports': [\n            \"The team won the championship game last night\",\n            \"Athletes prepare for the upcoming Olympics\",\n            \"Football season starts next month\",\n            \"The basketball player scored 30 points\",\n            \"Tennis tournament reaches final stages\"\n        ],\n        'technology': [\n            \"New smartphone features advanced AI capabilities\",\n            \"Tech company releases latest software update\",\n            \"Scientists develop revolutionary battery technology\",\n            \"Cloud computing transforms business operations\",\n            \"Artificial intelligence continues to evolve\"\n        ],\n        'politics': [\n            \"Government announces new economic policies\",\n           
 \"Elections scheduled for next year\",\n            \"International summit addresses climate change\",\n            \"New legislation passes in parliament\",\n            \"Diplomatic relations improve between nations\"\n        ],\n        'entertainment': [\n            \"Movie breaks box office records\",\n            \"Popular singer announces world tour\",\n            \"Streaming service adds new content\",\n            \"Award show celebrates best performances\",\n            \"New video game receives critical acclaim\"\n        ]\n    }\n    \n    data = []\n    for i in range(num_samples):\n        category = np.random.choice(categories)\n        text = np.random.choice(sample_texts[category])\n        \n        # Add some variation\n        text = text + f\" (Sample {i})\"\n        \n        data.append({\n            'id': i,\n            'text': text,\n            'label': category,\n            'timestamp': datetime.now().isoformat(),\n            'metadata': {\n                'source': 'synthetic',\n                'version': version,\n                'confidence': round(np.random.uniform(0.7, 1.0), 2)\n            }\n        })\n    \n    return data\n\n# Create JSON datasets\njson_data_v1 = create_text_classification_dataset(version=1, num_samples=100)\njson_data_v2 = create_text_classification_dataset(version=2, num_samples=200)\n\n# Save to files\njson_path_v1 = os.path.join(JSON_DIR, \"text_classification_v1.json\")\njson_path_v2 = os.path.join(JSON_DIR, \"text_classification_v2.json\")\n\nwith open(json_path_v1, 'w') as f:\n    json.dump(json_data_v1, f, indent=2)\n\nwith open(json_path_v2, 'w') as f:\n    json.dump(json_data_v2, f, indent=2)\n\nprint(\"üìã JSON Dataset Version 1:\")\nprint(f\"   Samples: {len(json_data_v1)}\")\nprint(f\"   Saved to: {json_path_v1}\")\n\nprint(\"\\nüìã JSON Dataset Version 2:\")\nprint(f\"   Samples: {len(json_data_v2)}\")\nprint(f\"   Saved to: {json_path_v2}\")\n\n# %% [markdown]\n# ### 6.2 Track JSON 
Dataset with MLflow\n\n# %%\ndef get_json_dataset_stats(json_data):\n    \"\"\"Calculate statistics for JSON dataset.\"\"\"\n    df = pd.DataFrame(json_data)\n    \n    stats = {\n        'total_samples': len(json_data),\n        'label_distribution': df['label'].value_counts().to_dict(),\n        'avg_text_length': df['text'].str.len().mean(),\n        'unique_labels': df['label'].nunique()\n    }\n    \n    return stats\n\n# Create experiment for JSON datasets\nEXPERIMENT_NAME_JSON = \"JSON_Dataset_Versioning_Lab\"\nmlflow.set_experiment(EXPERIMENT_NAME_JSON)\n\nprint(f\"üìÅ Experiment: {EXPERIMENT_NAME_JSON}\")\n\n# %%\n# Log JSON Dataset Version 1\nprint(\"üîÑ Logging JSON Dataset Version 1...\")\n\nwith mlflow.start_run(run_name=\"text_classification_v1\") as run:\n    # Load and analyze\n    with open(json_path_v1, 'r') as f:\n        data = json.load(f)\n    \n    stats = get_json_dataset_stats(data)\n    \n    # Log parameters\n    mlflow.log_param(\"dataset_version\", \"1.0.0\")\n    mlflow.log_param(\"dataset_name\", \"text_classification\")\n    mlflow.log_param(\"data_format\", \"JSON\")\n    mlflow.log_param(\"task_type\", \"multi-class classification\")\n    mlflow.log_param(\"num_classes\", stats['unique_labels'])\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    \n    # Log metrics\n    mlflow.log_metric(\"total_samples\", stats['total_samples'])\n    mlflow.log_metric(\"avg_text_length\", stats['avg_text_length'])\n    mlflow.log_metric(\"num_classes\", stats['unique_labels'])\n    \n    for label, count in stats['label_distribution'].items():\n        mlflow.log_metric(f\"label_{label}_count\", count)\n    \n    # Log artifact\n    mlflow.log_artifact(json_path_v1, artifact_path=\"datasets\")\n    \n    # Log file hash\n    file_hash = calculate_file_hash(json_path_v1)\n    mlflow.log_param(\"file_hash_md5\", file_hash)\n    \n    print(f\"‚úÖ JSON Dataset v1 logged!\")\n    print(f\"   Run ID: {run.info.run_id}\")\n\n# 
%%\n# Log JSON Dataset Version 2\nprint(\"üîÑ Logging JSON Dataset Version 2...\")\n\nwith mlflow.start_run(run_name=\"text_classification_v2\") as run:\n    with open(json_path_v2, 'r') as f:\n        data = json.load(f)\n    \n    stats = get_json_dataset_stats(data)\n    \n    mlflow.log_param(\"dataset_version\", \"2.0.0\")\n    mlflow.log_param(\"dataset_name\", \"text_classification\")\n    mlflow.log_param(\"data_format\", \"JSON\")\n    mlflow.log_param(\"task_type\", \"multi-class classification\")\n    mlflow.log_param(\"num_classes\", stats['unique_labels'])\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    mlflow.log_param(\"changes_from_v1\", \"Doubled sample size\")\n    \n    mlflow.log_metric(\"total_samples\", stats['total_samples'])\n    mlflow.log_metric(\"avg_text_length\", stats['avg_text_length'])\n    mlflow.log_metric(\"num_classes\", stats['unique_labels'])\n    \n    for label, count in stats['label_distribution'].items():\n        mlflow.log_metric(f\"label_{label}_count\", count)\n    \n    mlflow.log_artifact(json_path_v2, artifact_path=\"datasets\")\n    \n    file_hash = calculate_file_hash(json_path_v2)\n    mlflow.log_param(\"file_hash_md5\", file_hash)\n    \n    print(f\"‚úÖ JSON Dataset v2 logged!\")\n    print(f\"   Run ID: {run.info.run_id}\")\n\n# %% [markdown]\n# ---\n# # üì¶ PART 4: Parquet Dataset Versioning\n# ---\n# \n# ## Step 7: Create and Version Parquet Datasets\n# \n# Parquet is a columnar storage format that's efficient for\n# large datasets. 
It's commonly used in big data pipelines.\n\n# %% [markdown]\n# ### 7.1 Create Parquet Datasets\n\n# %%\ndef create_sales_dataset(version, num_records=10000):\n    \"\"\"Create a sales dataset and save as Parquet.\"\"\"\n    np.random.seed(40 + version)\n    \n    products = ['Laptop', 'Phone', 'Tablet', 'Monitor', 'Keyboard', 'Mouse']\n    regions = ['North America', 'Europe', 'Asia', 'South America', 'Africa']\n    \n    data = {\n        'transaction_id': range(1, num_records + 1),\n        'date': pd.date_range(start='2023-01-01', periods=num_records, freq='H'),\n        'product': np.random.choice(products, num_records),\n        'region': np.random.choice(regions, num_records),\n        'quantity': np.random.randint(1, 20, num_records),\n        'unit_price': np.random.uniform(50, 2000, num_records).round(2),\n        'discount': np.random.uniform(0, 0.3, num_records).round(2)\n    }\n    \n    df = pd.DataFrame(data)\n    df['total_amount'] = (df['quantity'] * df['unit_price'] * (1 - df['discount'])).round(2)\n    \n    return df\n\n# Create Parquet datasets\nparquet_df_v1 = create_sales_dataset(version=1, num_records=10000)\nparquet_df_v2 = create_sales_dataset(version=2, num_records=25000)\n\n# Save to Parquet\nparquet_path_v1 = os.path.join(PARQUET_DIR, \"sales_v1.parquet\")\nparquet_path_v2 = os.path.join(PARQUET_DIR, \"sales_v2.parquet\")\n\nparquet_df_v1.to_parquet(parquet_path_v1, index=False)\nparquet_df_v2.to_parquet(parquet_path_v2, index=False)\n\nprint(\"üì¶ Parquet Dataset Version 1:\")\nprint(f\"   Records: {len(parquet_df_v1)}\")\nprint(f\"   Columns: {list(parquet_df_v1.columns)}\")\nprint(f\"   File Size: {os.path.getsize(parquet_path_v1) / 1024:.2f} KB\")\n\nprint(\"\\nüì¶ Parquet Dataset Version 2:\")\nprint(f\"   Records: {len(parquet_df_v2)}\")\nprint(f\"   Columns: {list(parquet_df_v2.columns)}\")\nprint(f\"   File Size: {os.path.getsize(parquet_path_v2) / 1024:.2f} KB\")\n\n# %% [markdown]\n# ### 7.2 Track Parquet Dataset with 
MLflow\n\n# %%\n# Create experiment for Parquet datasets\nEXPERIMENT_NAME_PARQUET = \"Parquet_Dataset_Versioning_Lab\"\nmlflow.set_experiment(EXPERIMENT_NAME_PARQUET)\n\nprint(f\"üìÅ Experiment: {EXPERIMENT_NAME_PARQUET}\")\n\n# %%\n# Log Parquet Dataset Version 1\nprint(\"üîÑ Logging Parquet Dataset Version 1...\")\n\nwith mlflow.start_run(run_name=\"sales_parquet_v1\") as run:\n    df = pd.read_parquet(parquet_path_v1)\n    \n    # Create MLflow dataset from parquet\n    dataset = mlflow.data.from_pandas(\n        df,\n        source=parquet_path_v1,\n        name=\"sales_transactions\"\n    )\n    \n    mlflow.log_input(dataset, context=\"training\")\n    \n    # Log parameters\n    mlflow.log_param(\"dataset_version\", \"1.0.0\")\n    mlflow.log_param(\"dataset_name\", \"sales_transactions\")\n    mlflow.log_param(\"data_format\", \"Parquet\")\n    mlflow.log_param(\"compression\", \"snappy\")\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    \n    # Log metrics\n    mlflow.log_metric(\"num_records\", len(df))\n    mlflow.log_metric(\"num_columns\", len(df.columns))\n    mlflow.log_metric(\"file_size_kb\", os.path.getsize(parquet_path_v1) / 1024)\n    mlflow.log_metric(\"total_revenue\", df['total_amount'].sum())\n    mlflow.log_metric(\"avg_transaction\", df['total_amount'].mean())\n    mlflow.log_metric(\"unique_products\", df['product'].nunique())\n    mlflow.log_metric(\"unique_regions\", df['region'].nunique())\n    \n    # Log artifact\n    mlflow.log_artifact(parquet_path_v1, artifact_path=\"datasets\")\n    \n    print(f\"‚úÖ Parquet Dataset v1 logged!\")\n    print(f\"   Run ID: {run.info.run_id}\")\n\n# %%\n# Log Parquet Dataset Version 2\nprint(\"üîÑ Logging Parquet Dataset Version 2...\")\n\nwith mlflow.start_run(run_name=\"sales_parquet_v2\") as run:\n    df = pd.read_parquet(parquet_path_v2)\n    \n    dataset = mlflow.data.from_pandas(\n        df,\n        source=parquet_path_v2,\n        name=\"sales_transactions\"\n  
  )\n    \n    mlflow.log_input(dataset, context=\"training\")\n    \n    mlflow.log_param(\"dataset_version\", \"2.0.0\")\n    mlflow.log_param(\"dataset_name\", \"sales_transactions\")\n    mlflow.log_param(\"data_format\", \"Parquet\")\n    mlflow.log_param(\"compression\", \"snappy\")\n    mlflow.log_param(\"creation_date\", datetime.now().isoformat())\n    mlflow.log_param(\"changes_from_v1\", \"2.5x more records\")\n    \n    mlflow.log_metric(\"num_records\", len(df))\n    mlflow.log_metric(\"num_columns\", len(df.columns))\n    mlflow.log_metric(\"file_size_kb\", os.path.getsize(parquet_path_v2) / 1024)\n    mlflow.log_metric(\"total_revenue\", df['total_amount'].sum())\n    mlflow.log_metric(\"avg_transaction\", df['total_amount'].mean())\n    mlflow.log_metric(\"unique_products\", df['product'].nunique())\n    mlflow.log_metric(\"unique_regions\", df['region'].nunique())\n    \n    mlflow.log_artifact(parquet_path_v2, artifact_path=\"datasets\")\n    \n    print(f\"‚úÖ Parquet Dataset v2 logged!\")\n    print(f\"   Run ID: {run.info.run_id}\")\n\n# %% [markdown]\n# ---\n# # üî¨ PART 5: Using Scikit-learn Built-in Datasets\n# ---\n# \n# ## Step 8: Track Standard ML Datasets\n# \n# MLflow can also track popular datasets from scikit-learn.\n\n# %%\n# Create experiment for sklearn datasets\nEXPERIMENT_NAME_SKLEARN = \"Sklearn_Dataset_Versioning_Lab\"\nmlflow.set_experiment(EXPERIMENT_NAME_SKLEARN)\n\nprint(f\"üìÅ Experiment: {EXPERIMENT_NAME_SKLEARN}\")\n\n# %%\n# Log Iris Dataset\nprint(\"üå∏ Logging Iris Dataset...\")\n\nwith mlflow.start_run(run_name=\"iris_dataset\") as run:\n    # Load dataset\n    iris = load_iris()\n    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)\n    df['target'] = iris.target\n    df['target_name'] = df['target'].map({i: name for i, name in enumerate(iris.target_names)})\n    \n    # Create MLflow dataset\n    dataset = mlflow.data.from_pandas(\n        df,\n        source=\"sklearn.datasets.load_iris\",\n        
name=\"iris_dataset\",\n        targets=\"target\"\n    )\n    \n    mlflow.log_input(dataset, context=\"training\")\n    \n    # Log parameters\n    mlflow.log_param(\"dataset_name\", \"iris\")\n    mlflow.log_param(\"source\", \"sklearn.datasets\")\n    mlflow.log_param(\"task_type\", \"classification\")\n    mlflow.log_param(\"num_classes\", len(iris.target_names))\n    mlflow.log_param(\"class_names\", list(iris.target_names))\n    mlflow.log_param(\"feature_names\", list(iris.feature_names))\n    \n    # Log metrics\n    mlflow.log_metric(\"num_samples\", len(df))\n    mlflow.log_metric(\"num_features\", len(iris.feature_names))\n    mlflow.log_metric(\"num_classes\", len(iris.target_names))\n    \n    # Save and log artifact\n    iris_path = os.path.join(CSV_DIR, \"iris.csv\")\n    df.to_csv(iris_path, index=False)\n    mlflow.log_artifact(iris_path, artifact_path=\"datasets\")\n    \n    print(f\"‚úÖ Iris Dataset logged!\")\n    print(f\"   Samples: {len(df)}, Features: {len(iris.feature_names)}\")\n\n# %%\n# Log Wine Dataset\nprint(\"üç∑ Logging Wine Dataset...\")\n\nwith mlflow.start_run(run_name=\"wine_dataset\") as run:\n    wine = load_wine()\n    df = pd.DataFrame(data=wine.data, columns=wine.feature_names)\n    df['target'] = wine.target\n    \n    dataset = mlflow.data.from_pandas(\n        df,\n        source=\"sklearn.datasets.load_wine\",\n        name=\"wine_dataset\",\n        targets=\"target\"\n    )\n    \n    mlflow.log_input(dataset, context=\"training\")\n    \n    mlflow.log_param(\"dataset_name\", \"wine\")\n    mlflow.log_param(\"source\", \"sklearn.datasets\")\n    mlflow.log_param(\"task_type\", \"classification\")\n    mlflow.log_param(\"num_classes\", len(wine.target_names))\n    \n    mlflow.log_metric(\"num_samples\", len(df))\n    mlflow.log_metric(\"num_features\", len(wine.feature_names))\n    mlflow.log_metric(\"num_classes\", len(wine.target_names))\n    \n    wine_path = os.path.join(CSV_DIR, \"wine.csv\")\n    
df.to_csv(wine_path, index=False)\n    mlflow.log_artifact(wine_path, artifact_path=\"datasets\")\n    \n    print(f\"‚úÖ Wine Dataset logged!\")\n    print(f\"   Samples: {len(df)}, Features: {len(wine.feature_names)}\")\n\n# %%\n# Log California Housing Dataset\nprint(\"üè† Logging California Housing Dataset...\")\n\nwith mlflow.start_run(run_name=\"california_housing_dataset\") as run:\n    housing = fetch_california_housing()\n    df = pd.DataFrame(data=housing.data, columns=housing.feature_names)\n    df['target'] = housing.target\n    \n    dataset = mlflow.data.from_pandas(\n        df,\n        source=\"sklearn.datasets.fetch_california_housing\",\n        name=\"california_housing\",\n        targets=\"target\"\n    )\n    \n    mlflow.log_input(dataset, context=\"training\")\n    \n    mlflow.log_param(\"dataset_name\", \"california_housing\")\n    mlflow.log_param(\"source\", \"sklearn.datasets\")\n    mlflow.log_param(\"task_type\", \"regression\")\n    \n    mlflow.log_metric(\"num_samples\", len(df))\n    mlflow.log_metric(\"num_features\", len(housing.feature_names))\n    mlflow.log_metric(\"target_mean\", df['target'].mean())\n    mlflow.log_metric(\"target_std\", df['target'].std())\n    \n    housing_path = os.path.join(CSV_DIR, \"california_housing.csv\")\n    df.to_csv(housing_path, index=False)\n    mlflow.log_artifact(housing_path, artifact_path=\"datasets\")\n    \n    print(f\"‚úÖ California Housing Dataset logged!\")\n    print(f\"   Samples: {len(df)}, Features: {len(housing.feature_names)}\")\n\n# %% [markdown]\n# ---\n# # üìä PART 6: Dataset Comparison Dashboard\n# ---\n# \n# ## Step 9: Create a Summary of All Tracked Datasets\n\n# %%\ndef get_all_dataset_runs():\n    \"\"\"Retrieve all dataset runs across experiments.\"\"\"\n    client = mlflow.tracking.MlflowClient()\n    experiments = client.search_experiments()\n    \n    all_runs = []\n    for exp in experiments:\n        if \"Lab\" in exp.name:  # Filter our lab experiments\n     
       runs = client.search_runs(experiment_ids=[exp.experiment_id])\n            for run in runs:\n                run_info = {\n                    'experiment': exp.name,\n                    'run_name': run.info.run_name,\n                    'run_id': run.info.run_id[:8],  # Shortened\n                    'status': run.info.status,\n                    'dataset_version': run.data.params.get('dataset_version', 'N/A'),\n                    'dataset_name': run.data.params.get('dataset_name', 'N/A'),\n                    'num_samples': run.data.metrics.get('num_samples', \n                                   run.data.metrics.get('num_rows',\n                                   run.data.metrics.get('total_samples',\n                                   run.data.metrics.get('num_records',\n                                   run.data.metrics.get('total_images', 'N/A'))))),\n                }\n                all_runs.append(run_info)\n    \n    return pd.DataFrame(all_runs)\n\n# Get summary\nprint(\"üìä Dataset Tracking Summary\")\nprint(\"=\" * 80)\n\nsummary_df = get_all_dataset_runs()\nif len(summary_df) > 0:\n    print(summary_df.to_string(index=False))\nelse:\n    print(\"No runs found. Make sure MLflow server is running and experiments were logged.\")\n\n# %% [markdown]\n# ---\n# # üéØ PART 7: Best Practices and Tips\n# ---\n# \n# ## Dataset Versioning Best Practices\n# \n# 1. **Always include version numbers** in your dataset parameters\n# 2. **Calculate and log file hashes** for data integrity verification\n# 3. **Log comprehensive metadata** including creation date, source, and changes\n# 4. **Use meaningful run names** that reflect the dataset version\n# 5. **Log sample artifacts** so you can quickly preview data in the UI\n# 6. **Track data statistics** as metrics for easy comparison\n# 7. 
**Document schema changes** between versions\n\n# %% [markdown]\n# ## üßπ Step 10: Cleanup (Optional)\n# \n# Run this cell to clean up the created directories and files.\n\n# %%\n# Cleanup function (uncomment to use)\ndef cleanup():\n    \"\"\"Remove all created directories and files.\"\"\"\n    if os.path.exists(DATASET_DIR):\n        shutil.rmtree(DATASET_DIR)\n        print(f\"üóëÔ∏è Removed directory: {DATASET_DIR}\")\n    print(\"‚úÖ Cleanup complete!\")\n\n# Uncomment the line below to clean up\n# cleanup()\n\nprint(\"üí° To clean up, uncomment and run the cleanup() function above.\")\n\n# %% [markdown]\n# ---\n# # üìù Lab Exercises\n# ---\n# \n# ## Exercise 1: Create Your Own Dataset Version\n# Create a new version (v3) of the customer dataset with additional features\n# and log it to MLflow.\n# \n# ## Exercise 2: Image Dataset Augmentation\n# Create a v3 of the image dataset with augmented images (rotations, flips)\n# and track the augmentation parameters.\n# \n# ## Exercise 3: Compare Metrics\n# Write code to compare the metrics between two dataset versions\n# and highlight the differences.\n# \n# ## Exercise 4: Dataset Lineage\n# Create a run that logs the relationship between a raw dataset and\n# a processed/cleaned version.\n\n# %% [markdown]\n# ---\n# # üéâ Congratulations!\n# \n# You have completed the MLflow Dataset Versioning and Tracking Lab!\n# \n# ## What You Learned:\n# - ‚úÖ Setting up MLflow tracking\n# - ‚úÖ Versioning CSV datasets\n# - ‚úÖ Tracking image datasets\n# - ‚úÖ Managing JSON datasets\n# - ‚úÖ Working with Parquet files\n# - ‚úÖ Using sklearn built-in datasets\n# - ‚úÖ Comparing dataset versions\n# \n# ## Next Steps:\n# 1. Explore the MLflow UI at http://localhost:5000\n# 2. Try connecting datasets to model training runs\n# 3. Implement automated dataset validation\n# 4. Set up dataset alerts for drift detection\n# \n# ---\n# **Happy Tracking! üöÄ**\n```\n\n## How to Use This Lab\n\n### Prerequisites\n\n1. 
**Start MLflow Server:**\n```bash\nmlflow server --host 0.0.0.0 --port 5000\n```\n\n2. **Install Required Packages:**\n```bash\npip install mlflow pandas numpy scikit-learn pillow pyarrow\n```\n\n### Running the Lab\n\n1. Save the code as `mlflow_dataset_lab.py`\n2. Open in VS Code or JupyterLab\n3. The `# %%` markers create executable cells\n4. Run cells sequentially\n\n### Lab Structure\n\n| Part | Content | Dataset Types |\n|------|---------|---------------|\n| 1 | CSV Dataset Versioning | Customer churn data |\n| 2 | Image Dataset Versioning | Geometric shapes |\n| 3 | JSON Dataset Versioning | Text classification |\n| 4 | Parquet Dataset Versioning | Sales transactions |\n| 5 | Sklearn Datasets | Iris, Wine, Housing |\n| 6 | Comparison Dashboard | All datasets |\n| 7 | Best Practices | Tips & Exercises |"
    }
  ],
    "output_tokens": 13290
  
}

In [2]:
# Extract the generated text: take the first entry of the response's
# "content" list and read its "text" field, then echo it to the cell output.
first_block = response["content"][0]
text = first_block["text"]
print(text)

# Write the same text to output.txt in the current working directory.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
with open("output.txt", mode="w", encoding="utf-8") as f:
    f.write(text)

# MLflow Dataset Versioning and Tracking Lab

```python
# %% [markdown]
# # üß™ MLflow Dataset Versioning and Tracking Lab
# 
# ## Learning Objectives
# By the end of this lab, you will be able to:
# - Understand MLflow's dataset tracking capabilities
# - Version different types of datasets (CSV, Images, JSON, Parquet)
# - Track dataset metadata and lineage
# - Compare dataset versions across experiments
# 
# ## Prerequisites
# - Python 3.8+
# - MLflow installed
# - Basic understanding of machine learning workflows

# %% [markdown]
# ## üì¶ Step 1: Environment Setup
# 
# First, let's install and import all necessary libraries.
# Run the cell below to set up your environment.

# %%
# Install required packages (uncomment if needed)
# !pip install mlflow pandas numpy scikit-learn pillow pyarrow boto3

# %%
# Import required libraries
import mlflow
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.data.numpy_dataset import NumpyDataset
import mlflow.data as mlflow_data

im