### Implement Data Quality Controls in a Machine Learning Pipeline
**Description**: Integrate data quality checks directly into a machine learning pipeline, guided by a data management framework such as DAMA-DMBOK.

In [None]:
# Write your code from here
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Simulated data ingestion step: a small hand-written employee dataset.
# None entries are missing values that the later quality checks report
# and fill.
data = dict(
    age=[25, 32, 47, None, 51, 62, None, 45],
    salary=[50000, 60000, 80000, 75000, None, 120000, 90000, 95000],
    department=[
        "HR", "Engineering", "Engineering", "HR",
        "Marketing", None, "Marketing", "HR",
    ],
    left_company=[0, 1, 0, 0, 1, 0, 1, 0],
)

df = pd.DataFrame(data=data)

# === Step 1: Data Governance - Define quality rules ===
def check_missing_values(df):
    """Count and print the missing entries in each column of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        Series indexed by column name holding the number of null entries
        found in that column.
    """
    null_counts = df.isna().sum()
    print("Missing values per column:\n", null_counts)
    return null_counts

def check_data_types(df, expected_types):
    """Compare each expected column dtype against the DataFrame's actual dtype.

    Args:
        df: DataFrame to inspect.
        expected_types: Mapping of column name to the dtype that column is
            expected to have (for example the string ``"float64"``).

    Returns:
        Dict mapping every offending column name to an
        ``(actual, expected)`` tuple. A column listed in ``expected_types``
        but absent from ``df`` is reported with ``actual`` set to ``None``.
        An empty dict means every expected column exists with the expected
        dtype.
    """
    mismatches = {}
    for col, expected in expected_types.items():
        if col not in df.columns:
            # A missing column is a schema violation too; skipping it
            # silently would let an incomplete frame pass the check.
            mismatches[col] = (None, expected)
            continue
        actual = df[col].dtype
        if actual != expected:
            mismatches[col] = (actual, expected)
    return mismatches

# Reference schema: the dtype name each column is expected to have when
# passed to the type check.
expected_types = dict(
    age="float64",
    salary="float64",
    department="object",
    left_company="int64",
)

# Run governance checks on the data before any cleaning is applied.
type_mismatches = check_data_types(df, expected_types)
missing_values = check_missing_values(df)

# A non-zero total of the per-column counts means there are gaps to fill.
if missing_values.sum() > 0:
    print("Data contains missing values, proceeding to clean...")

# === Step 2: Data Cleansing ===
# Assign each filled column back to df rather than calling
# fillna(inplace=True) on df[col]: the chained in-place form acts on an
# intermediate object, is deprecated in pandas 2.x, and leaves df
# unchanged when Copy-on-Write is enabled.

# Fill numeric missing values with median
df['age'] = df['age'].fillna(df['age'].median())
df['salary'] = df['salary'].fillna(df['salary'].median())

# Fill categorical missing with mode (first mode if there is a tie)
df['department'] = df['department'].fillna(df['department'].mode()[0])

print("\nAfter cleaning missing values:")
print(df)

# === Step 3: Data Validation before training ===
# Gate training: stop if any null value is still present in df or if the
# earlier type check recorded at least one mismatch.
if df.isnull().values.any() or type_mismatches:
    raise ValueError("Data quality checks failed. Fix issues before training.")

print("\nData quality checks passed. Proceeding with model training...")

# === Step 4: Train simple ML model ===
# The label column is kept out of the feature matrix; get_dummies expands
# the categorical feature columns into indicator columns.
y = df["left_company"]
X = pd.get_dummies(df.drop(columns="left_company"))

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the fitted estimator, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
preds = model.predict(X_test)

print("\nModel Accuracy:", accuracy_score(y_test, preds))
