## Install Libraries

In [None]:
# !pip install pandas scikit-learn pyarrow pandera ydata-profiling fastapi uvicorn joblib wandb alibi-detect



## Import Libraries

In [11]:
import joblib
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import wandb

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    train_test_split
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from ydata_profiling import ProfileReport


import pandera as pa
from pandera import Column, DataFrameSchema, Check


## 1. Load Dataset


In [12]:
# Load the dataset from UCI Repository
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Define column names as per UCI documentation
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
]

# Load dataset into a pandas DataFrame
df = pd.read_csv(url, header=None, names=columns, skipinitialspace=True)

# Save the original dataset in csv format
df.to_csv('datasets/adult_income.csv', index=False)

In [13]:
# Drop the first row
df = df.drop(index=0).reset_index(drop=True)

In [14]:
# display the first 5 rows
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [15]:
# Check for null values in the DataFrame
null_values = df.isnull().sum()
print(null_values)

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [16]:
categorical_columns = df.select_dtypes(include=['object']).columns

for col in categorical_columns:
    print(f"Categorical Feature - {col}:")
    print(df[col].unique())
    print("\n")

Categorical Feature - workclass:
['Self-emp-not-inc' 'Private' 'State-gov' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']


Categorical Feature - education:
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']


Categorical Feature - marital_status:
['Married-civ-spouse' 'Divorced' 'Married-spouse-absent' 'Never-married'
 'Separated' 'Married-AF-spouse' 'Widowed']


Categorical Feature - occupation:
['Exec-managerial' 'Handlers-cleaners' 'Prof-specialty' 'Other-service'
 'Adm-clerical' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']


Categorical Feature - relationship:
['Husband' 'Not-in-family' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']


Categorical Feature - race:
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other

In [17]:
# Replace "?" with None in categorical features
df[categorical_columns] = df[categorical_columns].replace('?', 'None')

## 2. Dataset Schema and Storage


In [None]:
# Define the dataset schema using Pandera
schema = DataFrameSchema({
    "age": Column(int, checks=Check.ge(17), nullable=False),
    "workclass": Column(str, checks=Check.isin([
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov", "Local-gov", 
        "Self-emp-inc", "Without-pay", "Never-worked", "None"
    ]), nullable=False),  
    "fnlwgt": Column(int, nullable=False),
    "education": Column(str, checks=Check.isin([
        "Bachelors", "HS-grad", "11th", "Masters", "9th", "Some-college", 
        "Assoc-acdm", "Assoc-voc", "7th-8th", "Doctorate", "Prof-school", 
        "5th-6th", "10th", "1st-4th", "Preschool", "12th", "None"
    ]), nullable=False),
    "education_num": Column(int, checks=[Check.ge(1), Check.le(16)], nullable=False),
    "marital_status": Column(str, checks=Check.isin([
        "Married-civ-spouse", "Divorced", "Married-spouse-absent", "Never-married", 
        "Separated", "Married-AF-spouse", "Widowed", "None"
    ]), nullable=False),
    "occupation": Column(str, checks=Check.isin([
        "Exec-managerial", "Handlers-cleaners", "Prof-specialty", "Other-service", 
        "Adm-clerical", "Sales", "Craft-repair", "Transport-moving", 
        "Farming-fishing", "Machine-op-inspct", "Tech-support", 
        "Protective-serv", "Armed-Forces", "Priv-house-serv", "None"
    ]), nullable=False),
    "relationship": Column(str, checks=Check.isin([
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried", "Other-relative", "None"
    ]), nullable=False),
    "race": Column(str, checks=Check.isin([
        "White", "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "None"
    ]), nullable=False),
    "sex": Column(str, checks=Check.isin(["Male", "Female"]), nullable=False),
    "capital_gain": Column(int, checks=Check.ge(0), nullable=False),
    "capital_loss": Column(int, checks=Check.ge(0), nullable=False),
    "hours_per_week": Column(int, checks=[Check.ge(1), Check.le(99)], nullable=False),
    "native_country": Column(str, checks=Check.isin([
        "United-States", "Cuba", "Jamaica", "India", "Mexico", "South", 
        "Puerto-Rico", "Honduras", "England", "Canada", "Germany", "Iran", 
        "Philippines", "Italy", "Poland", "Columbia", "Cambodia", "Thailand", 
        "Ecuador", "Laos", "Taiwan", "Haiti", "Portugal", "Dominican-Republic", 
        "El-Salvador", "France", "Guatemala", "China", "Japan", "Yugoslavia", 
        "Peru", "Outlying-US(Guam-USVI-etc)", "Scotland", "Trinadad&Tobago", 
        "Greece", "Nicaragua", "Vietnam", "Hong", "Ireland", "Hungary", 
        "Holand-Netherlands", "None"
    ]), nullable=False),
    "income": Column(str, checks=Check.isin(["<=50K", ">50K"]), nullable=False)  
})

# Validate the DataFrame against the schema
validated_df = schema.validate(df)

# Save validated dataset as Parquet
validated_df.to_parquet("datasets/adult_income.parquet", engine="pyarrow", index=False)

print("Schema validation passed! Data stored in 'datasets/adult_income.parquet'.")


Schema validation passed! Data stored in 'datasets/adult_income.parquet'.


## 3. Profiling the Dataset

In [19]:
#Load the Parquet file
parquet_file = 'datasets/adult_income.parquet'
df = pd.read_parquet(parquet_file)

In [20]:
#Generate the profile report
profile = ProfileReport(df, title="Adult Income Dataset Profile", explorative=True)

In [21]:
#Save the profile report as an HTML file
profile_file = "profiling_report/adult_income_profile.html"
profile.to_file(profile_file)
print(f"✅ Profile report saved as '{profile_file}'.")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Profile report saved as 'profiling_report/adult_income_profile.html'.


In [22]:
from IPython.core.display import HTML

HTML(profile.to_notebook_iframe())

<IPython.core.display.HTML object>

## 4. Train-Test Split


In [23]:
#Set a random seed for reproducibility
RANDOM_SEED = 42

In [24]:
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=RANDOM_SEED, stratify=df['income'])

In [25]:
test_df, prod_df = train_test_split(temp_df, test_size=0.5, random_state=RANDOM_SEED, stratify=temp_df['income'])

In [26]:
#Display the size of each split
print(f"Train Set Shape: {train_df.shape}")
print(f"Test Set Shape: {test_df.shape}")
print(f"Production Set Shape: {prod_df.shape}")

Train Set Shape: (19536, 15)
Test Set Shape: (6512, 15)
Production Set Shape: (6512, 15)


In [27]:
#Save the datasets as Parquet files
train_file = 'datasets/adult_income_train.parquet'
test_file = 'datasets/adult_income_test.parquet'
prod_file = 'datasets/adult_income_prod.parquet'

In [28]:
train_df.to_parquet(train_file, engine='pyarrow', index=False)
test_df.to_parquet(test_file, engine='pyarrow', index=False)
prod_df.to_parquet(prod_file, engine='pyarrow', index=False)

In [29]:
print("Datasets saved as Parquet files.")

Datasets saved as Parquet files.


## 5. Data Version Control


In [30]:
# git initialization and setup remote repository
!git init
!git remote add origin https://github.com/PatelNisarg28/MLOps_Adult_Income.git

# Add the dataset files and profiling report
!git add datasets/*
!git add profiling_report/*
!git commit -m "Added dataset files and profiling report"

# Push the changes to the remote repository
!git branch -M main
!git push -u origin main

Reinitialized existing Git repository in /Users/nisarg/Documents/JIO INSTITUTE/Quarter4/ML Engineering/mlops-adult-income/.git/
error: remote origin already exists.
[main b0c2ccc] Added dataset files and profiling report
 5 files changed, 1028 insertions(+), 1062 deletions(-)
Enumerating objects: 17, done.
Counting objects: 100% (17/17), done.
Delta compression using up to 8 threads
Compressing objects: 100% (9/9), done.
Writing objects: 100% (9/9), 205.04 KiB | 5.70 MiB/s, done.
Total 9 (delta 6), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (6/6), completed with 6 local objects.[K
To https://github.com/PatelNisarg28/MLOps_Adult_Income.git
   f7b97f2..b0c2ccc  main -> main
branch 'main' set up to track 'origin/main'.


## 6. ML Pipeline with Scikit-Learn

In [31]:
#Load the dataset directly from GitHub (raw link)
github_raw_url = "https://raw.githubusercontent.com/PatelNisarg28/MLOps_Adult_Income/main/datasets/adult_income.parquet"

# Read Parquet file
df = pd.read_parquet(github_raw_url, engine="pyarrow")  # Ensure pyarrow is installed

df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [39]:
# Split features and target variable
X = df.drop(columns=["income"])  
y = df["income"]

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# Define transformers
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),  # Fill missing values with mean
    ("scaler", StandardScaler())  # Standardize numerical data
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Fill missing values with mode
    ("encoder", OneHotEncoder(handle_unknown="ignore"))  # Convert categorical data to numbers
])

# Combine into a single ColumnTransformer
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Create full ML pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))  
])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train pipeline
pipeline.fit(X_train, y_train)

In [40]:
# Evaluate model
accuracy = pipeline.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.85


## 7. ML Experimentation and Tracking with Weights and Biases


In [33]:
wandb.login(key="bc285b8a51f65e71f616ef32a79004019117db3b")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/nisarg/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnisarg-patel2815[0m ([33mnisarg-patel2815-jio-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [34]:
# GitHub raw URLs for train and test datasets
train_url = "https://raw.githubusercontent.com/PatelNisarg28/MLOps_Adult_Income/main/datasets/adult_income_train.parquet"
test_url = "https://raw.githubusercontent.com/PatelNisarg28/MLOps_Adult_Income/main/datasets/adult_income_test.parquet"

# Load datasets
train_df = pd.read_parquet(train_url)
test_df = pd.read_parquet(test_url)

# Inspect dataset
train_df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,36,Private,184655,10th,6,Divorced,Transport-moving,Unmarried,White,Male,0,0,48,United-States,<=50K
1,69,Local-gov,32287,10th,6,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,25,United-States,<=50K
2,43,Self-emp-not-inc,277647,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,35,United-States,<=50K
3,41,State-gov,210094,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,29,Private,213842,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K


In [35]:
def run_experiment(model, model_name, hyperparams={}):
    """Run a classification ML experiment with Weights & Biases tracking"""
    
    wandb.init(project="MLOps_Adult_Income", name=model_name, config=hyperparams)

    # Create pipeline
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    # Perform K-Fold Cross-Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring="accuracy")
    
    # Train model
    pipeline.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = pipeline.predict(X_test)
    
    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    
    # Log metrics to W&B
    wandb.log({
        "CV Accuracy Score": np.mean(cv_scores),
        "Test Accuracy": accuracy,
        "Test Precision": precision,
        "Test Recall": recall,
        "Test F1 Score": f1
    })
    
    # Save the model locally
    joblib.dump(pipeline, f"models/{model_name}.pkl")
    
    # Log model artifact to W&B
    artifact = wandb.Artifact(model_name, type="model")
    artifact.add_file(f"models/{model_name}.pkl")
    wandb.log_artifact(artifact)

    wandb.finish()
    
    print(f"Experiment completed for {model_name} - Test Accuracy: {accuracy:.4f}")
    
    return accuracy, pipeline  # Return Accuracy and trained pipeline


In [36]:
models = {
    "Logistic_Regression": (LogisticRegression(solver="liblinear"), {"solver": "liblinear"}),
    "Decision_Tree": (DecisionTreeClassifier(max_depth=10), {"max_depth": 10}),
    "Random_Forest": (RandomForestClassifier(n_estimators=100, max_depth=10), {"n_estimators": 100, "max_depth": 10}),
    "Support_Vector_Machine": (SVC(kernel="rbf", C=1.0), {"kernel": "rbf", "C": 1.0}),
    "Gradient_Boosting": (GradientBoostingClassifier(n_estimators=100, learning_rate=0.1), {"n_estimators": 100, "learning_rate": 0.1}),
}

best_score = -np.inf
best_model = None

for model_name, (model, params) in models.items():
    score, trained_model = run_experiment(model, model_name, params)
    
    # Store best model
    if score > best_score:
        best_score = score
        best_model = trained_model

print("All experiments completed!")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


0,1
CV Accuracy Score,▁
Test Accuracy,▁
Test F1 Score,▁
Test Precision,▁
Test Recall,▁

0,1
CV Accuracy Score,0.85204
Test Accuracy,0.85273
Test F1 Score,0.8485
Test Precision,0.84727
Test Recall,0.85273


Experiment completed for Logistic_Regression - Test Accuracy: 0.8527


0,1
CV Accuracy Score,▁
Test Accuracy,▁
Test F1 Score,▁
Test Precision,▁
Test Recall,▁

0,1
CV Accuracy Score,0.85385
Test Accuracy,0.85565
Test F1 Score,0.84861
Test Precision,0.84954
Test Recall,0.85565


Experiment completed for Decision_Tree - Test Accuracy: 0.8557


0,1
CV Accuracy Score,▁
Test Accuracy,▁
Test F1 Score,▁
Test Precision,▁
Test Recall,▁

0,1
CV Accuracy Score,0.85738
Test Accuracy,0.85412
Test F1 Score,0.84499
Test Precision,0.84826
Test Recall,0.85412


Experiment completed for Random_Forest - Test Accuracy: 0.8541


0,1
CV Accuracy Score,▁
Test Accuracy,▁
Test F1 Score,▁
Test Precision,▁
Test Recall,▁

0,1
CV Accuracy Score,0.85534
Test Accuracy,0.85627
Test F1 Score,0.85038
Test Precision,0.8503
Test Recall,0.85627


Experiment completed for Support_Vector_Machine - Test Accuracy: 0.8563


0,1
CV Accuracy Score,▁
Test Accuracy,▁
Test F1 Score,▁
Test Precision,▁
Test Recall,▁

0,1
CV Accuracy Score,0.86498
Test Accuracy,0.86824
Test F1 Score,0.86285
Test Precision,0.86345
Test Recall,0.86824


Experiment completed for Gradient_Boosting - Test Accuracy: 0.8682
All experiments completed!


In [42]:
print(best_model.named_steps['model'])

GradientBoostingClassifier()
