# In [None]:
# breast_cancer_dataset_form.ipynb
# This notebook documents the Breast Cancer Wisconsin dataset structure

import json
import yaml
import pandas as pd
import numpy as np
from pathlib import Path

# Load the dataset
df = pd.read_csv('data.csv')

# When every row of the CSV ends with a trailing comma, pandas parses an extra
# column that is entirely NaN (named like "Unnamed: 32").  Such a column holds
# no measurements, so drop it before it is counted or summarised as a feature.
df = df.dropna(axis=1, how='all')

# Derived once from the data so the metadata below cannot drift away from it.
_feature_columns = [col for col in df.columns if col not in ('id', 'diagnosis')]
_diagnosis_counts = df['diagnosis'].value_counts()
_missing_cells = int(df.isna().sum().sum())

# Create the form structure in Python dictionary
breast_cancer_form = {
    "form_name": "Breast Cancer Diagnosis Prediction",
    "form_description": "Predict whether a breast tumor is malignant or benign based on cell nucleus measurements from digitized images.",
    "dataset_info": {
        "source": "UCI Machine Learning Repository - Breast Cancer Wisconsin (Diagnostic) Dataset",
        "samples": len(df),
        "features": len(_feature_columns),  # excluding id and diagnosis
        "target": "diagnosis (M/B)",
        # Checked against the data: stays the string "None" when nothing is
        # missing, otherwise reports the number of missing cells.
        "missing_values": "None" if _missing_cells == 0 else _missing_cells,
        "class_distribution": {
            # int(): pandas hands back NumPy integers, which json cannot serialise.
            "Benign (B)": int(_diagnosis_counts.get('B', 0)),
            "Malignant (M)": int(_diagnosis_counts.get('M', 0))
        }
    },
    # Only the two non-feature columns are listed here; the feature fields are
    # appended further down in the file.
    "fields": [
        {
            "name": "id",
            "type": "identifier",
            "description": "Unique ID for each sample",
            "dtype": "int64"
        },
        {
            "name": "diagnosis",
            "type": "target",
            "description": "Diagnosis result (M = Malignant, B = Benign)",
            "dtype": "object",
            "values": ["M", "B"]
        }
    ]
}

# Human-readable description for every feature column, keyed by the exact
# column name used in the CSV (note the space in the "concave points_*" names).
# Columns missing from this mapping are described as "Not specified" by the
# loop that builds the feature fields.
# NOTE(review): the form description says these are cell-nucleus measurements,
# while a few descriptions below say "tumor" - confirm the intended wording.
feature_descriptions = {
    # Mean Features
    "radius_mean": "Mean of distances from center to points on the perimeter",
    "texture_mean": "Standard deviation of gray-scale values",
    "perimeter_mean": "Mean size of the core tumor",
    "area_mean": "Mean area of the tumor",
    "smoothness_mean": "Mean of local variation in radius lengths",
    # \u00b2 is SUPERSCRIPT TWO; written as an escape so the text cannot be
    # corrupted again if this file is re-saved with the wrong encoding.
    "compactness_mean": "Mean of perimeter\u00b2 / area - 1.0",
    "concavity_mean": "Mean of severity of concave portions of the contour",
    "concave points_mean": "Mean number of concave portions of the contour",
    "symmetry_mean": "Mean symmetry of the tumor",
    "fractal_dimension_mean": "Mean 'coastline approximation' - 1",

    # Standard Error Features
    "radius_se": "Standard error of radius measurements",
    "texture_se": "Standard error of texture measurements",
    "perimeter_se": "Standard error of perimeter measurements",
    "area_se": "Standard error of area measurements",
    "smoothness_se": "Standard error of smoothness measurements",
    "compactness_se": "Standard error of compactness measurements",
    "concavity_se": "Standard error of concavity measurements",
    "concave points_se": "Standard error of concave points measurements",
    "symmetry_se": "Standard error of symmetry measurements",
    "fractal_dimension_se": "Standard error of fractal dimension measurements",

    # Worst Features
    "radius_worst": "Worst (largest) radius value",
    "texture_worst": "Worst (largest) texture value",
    "perimeter_worst": "Worst (largest) perimeter value",
    "area_worst": "Worst (largest) area value",
    "smoothness_worst": "Worst (largest) smoothness value",
    "compactness_worst": "Worst (largest) compactness value",
    "concavity_worst": "Worst (largest) concavity value",
    "concave points_worst": "Worst (largest) concave points value",
    "symmetry_worst": "Worst (largest) symmetry value",
    "fractal_dimension_worst": "Worst (largest) fractal dimension value"
}

# Add all feature fields to the form
def _feature_statistics(column):
    """Return min/max/mean/std of *column* as plain floats, or None.

    None is returned when the column is not numeric or holds no non-missing
    value, because no meaningful statistics exist in that case.  A single
    statistic that pandas reports as NaN is stored as None so the form still
    serialises to valid JSON (``null`` rather than ``NaN``).
    """
    if not pd.api.types.is_numeric_dtype(column) or not column.notna().any():
        return None
    raw = {
        "min": column.min(),
        "max": column.max(),
        "mean": column.mean(),
        "std": column.std(),
    }
    # float(): convert NumPy scalars to builtin floats for json/yaml.
    return {name: None if pd.isna(value) else float(value)
            for name, value in raw.items()}


for feature in df.columns:
    if feature in ('id', 'diagnosis'):
        continue  # already described as identifier / target fields

    field = {
        "name": feature,
        "type": "feature",
        "description": feature_descriptions.get(feature, "Not specified"),
        "dtype": str(df[feature].dtype),
    }

    # "statistics" is optional: save_form_data reads it with
    # field.get("statistics", {}) and falls back to "N/A" per value.
    statistics = _feature_statistics(df[feature])
    if statistics is not None:
        field["statistics"] = statistics

    breast_cancer_form["fields"].append(field)

# Function to save in different formats
def save_form_data(form_data, base_filename="breast_cancer_dataset_form"):
    """Save the form data in multiple formats.

    Four files are written, all named after *base_filename*:

    * ``<base>.json`` - the form as JSON
    * ``<base>.yaml`` - the form as block-style YAML
    * ``<base>.py`` - a Python module assigning the form to
      ``breast_cancer_form``
    * ``<base>_summary.csv`` - one row per field whose ``"type"`` is
      ``"feature"``, with its description, dtype and statistics

    Args:
        form_data: Form dictionary.  Must contain a ``"fields"`` list whose
            items have ``"type"``; feature items also need ``"name"``,
            ``"description"`` and ``"dtype"``.  ``"statistics"`` is optional.
        base_filename: Path prefix (without extension) for the output files.

    Returns:
        True once all four files have been written.
    """
    # Local import: only the .py export below needs it.
    from pprint import pformat

    # Save as JSON
    with open(f"{base_filename}.json", 'w', encoding='utf-8') as f:
        json.dump(form_data, f, indent=2)

    # Save as YAML
    with open(f"{base_filename}.yaml", 'w', encoding='utf-8') as f:
        yaml.dump(form_data, f, default_flow_style=False)

    # Save as Python dictionary.  pformat emits Python literals
    # (None/True/False); JSON text would contain null/true/false, which makes
    # the generated module fail on import.  It may also keep non-ASCII
    # characters verbatim, hence the explicit UTF-8 encoding.
    with open(f"{base_filename}.py", 'w', encoding='utf-8') as f:
        literal = pformat(form_data, indent=2, sort_dicts=False)
        f.write(f"breast_cancer_form = {literal}\n")

    # Create a summary CSV
    summary_columns = [
        "feature_name",
        "description",
        "dtype",
        "min_value",
        "max_value",
        "mean_value",
        "std_value",
    ]
    summary_data = []
    for field in form_data["fields"]:
        if field["type"] != "feature":
            continue
        stats = field.get("statistics", {})
        summary_data.append({
            "feature_name": field["name"],
            "description": field["description"],
            "dtype": field["dtype"],
            "min_value": stats.get("min", "N/A"),
            "max_value": stats.get("max", "N/A"),
            "mean_value": stats.get("mean", "N/A"),
            "std_value": stats.get("std", "N/A")
        })

    # Passing the columns explicitly keeps the header row even when the form
    # contains no feature fields.
    summary_df = pd.DataFrame(summary_data, columns=summary_columns)
    summary_df.to_csv(f"{base_filename}_summary.csv", index=False)

    return True

# Write the JSON, YAML, Python and CSV exports under the default base filename
save_form_data(form_data=breast_cancer_form)

# Create a Jupyter Notebook cell content
# The dictionary follows the nbformat 4 layout: a list of cells plus kernel
# metadata.  The markdown cell is rendered now (its f-strings embed the
# current dataset counts); the code cells are stored as source text and only
# run when the generated notebook is executed.
notebook_content = {
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# Breast Cancer Wisconsin Dataset Form\n",
                "## Overview\n",
                f"This dataset contains {len(df)} samples with {len(df.columns) - 2} features for breast cancer diagnosis prediction.\n",
                "\n",
                "### Target Variable\n",
                "- **diagnosis**: M (Malignant) or B (Benign)\n",
                f"- Class distribution: {len(df[df['diagnosis'] == 'B'])} Benign, {len(df[df['diagnosis'] == 'M'])} Malignant\n",
                "\n",
                "### Feature Categories\n",
                "1. **Mean Features** (10 features): Mean values of various measurements\n",
                "2. **Standard Error Features** (10 features): Standard error of measurements\n",
                "3. **Worst Features** (10 features): Worst (largest) values observed\n"
            ]
        },
        {
            # Setup cell: the cells below use df, np and breast_cancer_form.
            # Without this cell the generated notebook fails with NameError,
            # because a fresh kernel does not share this script's variables.
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Setup: imports and data used by the cells below\n",
                "import json\n",
                "\n",
                "import numpy as np\n",
                "import pandas as pd\n",
                "\n",
                "df = pd.read_csv('data.csv')\n",
                "\n",
                "# Form structure exported next to this notebook\n",
                "with open('breast_cancer_dataset_form.json') as f:\n",
                "    breast_cancer_form = json.load(f)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Display dataset info\n",
                "print(f'Dataset shape: {df.shape}')\n",
                "print(f'\\nFirst few rows:')\n",
                "print(df.head())\n",
                "\n",
                "print(f'\\nData types:')\n",
                "print(df.dtypes)\n",
                "\n",
                "print(f'\\nMissing values:')\n",
                "print(df.isnull().sum())\n",
                "\n",
                "print(f'\\nClass distribution:')\n",
                "print(df['diagnosis'].value_counts())"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Feature statistics\n",
                "print('Feature Statistics:\\n')\n",
                "numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()\n",
                "stats_df = df[numeric_features].describe().transpose()\n",
                "print(stats_df.round(4))"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Export the form structure\n",
                "import json\n",
                "\n",
                "with open('breast_cancer_form.json', 'w') as f:\n",
                "    json.dump(breast_cancer_form, f, indent=2)\n",
                "    \n",
                "print('Form structure saved to breast_cancer_form.json')"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 4
}

# Serialise the notebook structure to disk as indented JSON.
# NOTE(review): the header comment names this file
# breast_cancer_dataset_form.ipynb; if this code runs from that notebook, the
# write below replaces it - confirm the intended output path.
notebook_path = Path('breast_cancer_dataset_form.ipynb')
with notebook_path.open('w') as notebook_file:
    json.dump(notebook_content, notebook_file, indent=2)

# Tell the user which artifacts were produced, one line per message.
for message in (
    "Form documentation created successfully!",
    "Files saved:",
    "1. breast_cancer_dataset_form.json",
    "2. breast_cancer_dataset_form.yaml",
    "3. breast_cancer_dataset_form.py",
    "4. breast_cancer_dataset_form_summary.csv",
    "5. breast_cancer_dataset_form.ipynb (Jupyter Notebook)",
):
    print(message)