# Notebook 0 — Introduction & Goal

**DIQ Project**: Metropolitan Museum of Art - Department Classification

**Team**: 3-person Data Science pipeline  
**Date**: January 2026

## 0.1 — Project Overview

In [None]:
# Cell 1: Project Overview
# Prints a static banner summarising the project: dataset, team size,
# ML task, target variable and goal. Nothing is computed here.
# The emoji and bullet glyphs are written as real UTF-8 characters so the
# banner renders correctly instead of as mis-decoded byte sequences.
print("="*80)
print("DIQ PROJECT: Metropolitan Museum of Art - Department Classification")
print("="*80)
print("\n📊 Project Details:")
print("  • Dataset: MET Museum Objects (met_museum_objects.csv)")
print("  • Team Size: 3-person Data Science pipeline")
print("  • ML Task: Multi-class Classification")
print("  • Target Variable: Department")
print("  • Goal: Demonstrate impact of Data Quality on ML performance")
print("\n" + "="*80)

## 0.2 — Why Department as Target?

In [None]:
# Cell 2: Load and examine target variable
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style for all plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Load dataset
# low_memory=False: parse the file in a single pass rather than in internal
# chunks, so each column gets one consistently inferred dtype.
print("Loading dataset...")
df = pd.read_csv('met_museum_objects.csv', low_memory=False)
print(f"✓ Dataset loaded: {df.shape[0]:,} records, {df.shape[1]} columns\n")

# Analyze Department column
print("="*80)
print("TARGET VARIABLE ANALYSIS: Department")
print("="*80)

# 1. Missing values
# missing_count / missing_pct are reused by later cells, so keep the names.
missing_count = df['Department'].isnull().sum()
missing_pct = (missing_count / len(df)) * 100
print("\n1. Completeness:")
print(f"   • Missing values: {missing_count} ({missing_pct:.2f}%)")
# Only claim a perfect target when the data actually supports it; the
# verdict used to be printed unconditionally.
if missing_count == 0:
    print("   ✓ Perfect for ML target (0% missing)")
else:
    print("   ⚠ Target has missing values — unlabeled rows must be handled before training")

# 2. Number of classes
# nunique() ignores NaN, so this counts real department labels only.
n_classes = df['Department'].nunique()
print("\n2. Number of Classes:")
print(f"   • Unique departments: {n_classes}")
print("   ✓ Manageable for multi-class classification")

# 3. Class distribution
# value_counts() sorts descending, so position 0 is the largest class and
# position -1 the smallest.
print("\n3. Class Distribution:")
dept_counts = df['Department'].value_counts()
largest_class = dept_counts.iloc[0]
smallest_class = dept_counts.iloc[-1]
imbalance_ratio = largest_class / smallest_class
print(f"   • Largest class: {largest_class:,} records ({largest_class/len(df)*100:.1f}%)")
print(f"   • Smallest class: {smallest_class:,} records ({smallest_class/len(df)*100:.1f}%)")
print(f"   • Imbalance ratio: {imbalance_ratio:.0f}:1")

# 4. Semantic meaning
# Qualitative arguments only; nothing below is derived from the data.
print("\n4. Semantic Meaningfulness:")
print("   ✓ Clear, interpretable labels")
print("   ✓ Aligns with museum organizational structure")
print("   ✓ Useful for real-world applications (object categorization)")

print("\n" + "="*80)

## 0.3 — Visualize Target Distribution

In [None]:
# Cell 3: Visualize Department distribution
# Draws two side-by-side panels and saves them to 'department_distribution.png':
#   left  - horizontal bar chart of object counts for every department
#   right - pie chart of the largest departments against all the rest
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Bar chart of all departments
# value_counts() sorts descending, so the largest department comes first.
dept_counts = df['Department'].value_counts()
colors = sns.color_palette("viridis", len(dept_counts))

ax1.barh(range(len(dept_counts)), dept_counts.values, color=colors)
ax1.set_yticks(range(len(dept_counts)))
ax1.set_yticklabels(dept_counts.index, fontsize=9)
ax1.set_xlabel('Number of Objects', fontsize=12, fontweight='bold')
ax1.set_title('Distribution of Objects Across Departments', fontsize=14, fontweight='bold')
# Put the largest department at the top of the chart.
ax1.invert_yaxis()

# Add value labels
# The gap between bar end and label is 1% of the longest bar, so it scales
# with the data instead of relying on a fixed count offset.
label_offset = dept_counts.max() * 0.01
for i, v in enumerate(dept_counts.values):
    ax1.text(v + label_offset, i, f'{v:,}', va='center', fontsize=8)

# Plot 2: Pie chart showing top 5 vs others
TOP_N = 5
top5 = dept_counts.head(TOP_N)
# .iloc makes the positional slice explicit (the index holds department names).
remaining = dept_counts.iloc[TOP_N:]
if len(remaining) > 0:
    # The label reports the real number of grouped departments rather than
    # a hard-coded figure.
    others = pd.Series({f'Other ({len(remaining)} depts)': remaining.sum()})
    pie_data = pd.concat([top5, others])
else:
    # TOP_N or fewer departments: there is nothing to group as "Other".
    pie_data = top5

colors_pie = sns.color_palette("Set2", len(pie_data))
wedges, texts, autotexts = ax2.pie(pie_data.values,
                                   labels=pie_data.index,
                                   autopct='%1.1f%%',
                                   colors=colors_pie,
                                   startangle=90)

# Slice labels sit outside the pie, percentage labels sit on the wedges.
for text in texts:
    text.set_fontsize(9)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')
    autotext.set_fontsize(9)

ax2.set_title(f'Top {TOP_N} Departments vs Others', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('department_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved as 'department_distribution.png'")

## 0.4 — Justification Summary

In [None]:
# Cell 4: Summary table for slides
# Builds a six-row justification table for using 'Department' as the target,
# prints it as text, then renders it as a styled matplotlib table saved to
# 'target_justification.png'.
#
# The data-dependent Status entries (missing share, class count, imbalance
# ratio) are computed from `df` so the table cannot drift from the dataset
# that is actually loaded.
target_col = df['Department']
has_missing = bool(target_col.isnull().any())
missing_share = target_col.isnull().mean() * 100
class_total = target_col.nunique()
# value_counts() sorts descending: first entry is the largest class,
# last entry the smallest.
class_sizes = target_col.value_counts()
size_ratio = class_sizes.iloc[0] / class_sizes.iloc[-1]

justification_data = {
    'Criterion': [
        'Completeness',
        'Number of Classes',
        'Class Balance',
        'Interpretability',
        'Business Relevance',
        'ML Suitability'
    ],
    'Status': [
        f'{missing_share:.2f}% missing' if has_missing else '0% missing',
        f'{class_total} classes',
        f'Imbalanced ({size_ratio:.0f}:1)',
        'Clear labels',
        'High',
        'Excellent'
    ],
    'Assessment': [
        # Completeness is only "Perfect" when no label is missing.
        '⚠ Needs handling' if has_missing else '✓ Perfect',
        '✓ Manageable',
        '⚠ Needs handling',
        '✓ Good',
        '✓ Good',
        '✓ Good'
    ]
}

justification_df = pd.DataFrame(justification_data)

print("\n" + "="*80)
print("JUSTIFICATION FOR CHOOSING DEPARTMENT AS TARGET")
print("="*80)
print(justification_df.to_string(index=False))
print("="*80)

# Create a styled table for slides
# The axes only act as a canvas for the table, so their frame and ticks
# are switched off.
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('tight')
ax.axis('off')

table = ax.table(cellText=justification_df.values,
                 colLabels=justification_df.columns,
                 cellLoc='left',
                 loc='center',
                 colWidths=[0.3, 0.35, 0.35])

# Fix the font size manually and double the row height for readability.
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2)

n_cols = len(justification_df.columns)

# Style header (row 0 of the table holds the column labels)
for col in range(n_cols):
    table[(0, col)].set_facecolor('#4472C4')
    table[(0, col)].set_text_props(weight='bold', color='white')

# Alternate row colors (data rows start at table row 1)
for row in range(1, len(justification_df) + 1):
    row_color = '#E7E6E6' if row % 2 == 0 else '#FFFFFF'
    for col in range(n_cols):
        table[(row, col)].set_facecolor(row_color)

ax.set_title('Department as Target Variable - Justification',
             fontsize=14, fontweight='bold', pad=20)
plt.savefig('target_justification.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Justification table saved as 'target_justification.png'")

## 0.5 — Project Roadmap

In [None]:
# Cell 5: Show project roadmap
# Prints the static notebook-by-notebook roadmap, then a short list of key
# metrics. The metrics read df, missing_pct, n_classes and dept_counts,
# which must already exist from the earlier cells.
# The emoji and tree glyphs are written as real UTF-8 characters so the
# output renders correctly instead of as mis-decoded byte sequences.
print("\n" + "="*80)
print("PROJECT ROADMAP")
print("="*80)

roadmap = """
📍 Notebook 0: Introduction & Goal
   └─ Define project objectives and target variable

📍 Notebook 1: Data Profiling & DQ Assessment
   └─ Understand data structure and quality issues
   └─ Assess DQ dimensions (completeness, consistency, validity)
   └─ Identify problems to address

📍 Notebook 2: Data Cleaning
   └─ Handle missing values
   └─ Detect and remove outliers
   └─ Remove duplicates
   └─ Validate dependencies and associations
   └─ Address class imbalance

📍 Notebook 3: ML Pipeline
   └─ Validate spatial and temporal data quality
   └─ Build baseline model (dirty data)
   └─ Build improved model (clean data)
   └─ Compare performance

📍 Notebook 4: Conclusions
   └─ Summarize DQ improvements
   └─ Demonstrate ML performance gains
   └─ Provide recommendations
"""

print(roadmap)
print("="*80)

print("\n🎯 Key Metrics to Track:")
print(f"   • Total records: {len(df):,}")
print("   • Target variable: Department")
print(f"   • Missing in target: {missing_pct:.2f}%")
print(f"   • Number of classes: {n_classes}")
# dept_counts is sorted descending, so first/last are largest/smallest class.
print(f"   • Imbalance ratio: {dept_counts.iloc[0]/dept_counts.iloc[-1]:.0f}:1")

print("\n✅ Notebook 0 Complete!")
print("➡️  Next: Proceed to Notebook 1 for Data Profiling & DQ Assessment")