# Census Income Prediction (scikit-learn)

<a href="https://colab.research.google.com/github/VertaAI/examples/blob/main/deployment/sklearn/sklearn-census-income-prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Dependencies

This notebook has been tested with **Python 3.8.16** and the following package versions:

In [None]:
%%capture
!pip install scikit-learn==1.0.2
!pip install verta==0.21.1
!pip install wget==3.2

## 2. Imports

In [None]:
import cloudpickle
import itertools
import os
import pandas as pd
import warnings
import wget

from sklearn import linear_model
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from verta import Client
from verta.dataset import Path
from verta.environment import Python
from verta.registry import VertaModelBase, verify_io
from verta.utils import ModelAPI

# Ignore ConvergenceWarning, FutureWarning and UserWarning raised from sklearn modules
for _warning_category in (ConvergenceWarning, FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_warning_category, module='sklearn')

## 3. Verta Setup

In [None]:
# Use local env vars or uncomment and fill out the lines below:
# os.environ['VERTA_EMAIL'] = ''
# os.environ['VERTA_DEV_KEY'] = ''
# os.environ['VERTA_HOST'] = ''

In [None]:
# Create the Verta client.
# NOTE(review): presumably reads VERTA_EMAIL / VERTA_DEV_KEY / VERTA_HOST from the environment — confirm
client = Client()

In [None]:
# Names shared by the cells that follow
MODEL_NAME = 'Census Income Prediction (Example)'  # used for the project, experiment, dataset prefix and registered model
VERSION = 'v0'  # name passed when the registered model version is created
ENDPOINT_NAME = 'census-income-prediction'  # passed to client.get_or_create_endpoint

In [None]:
# The project and the experiment both reuse MODEL_NAME as their name
proj = client.set_project(MODEL_NAME)
expt = client.set_experiment(MODEL_NAME)

## 4. Model Training

### 4.1 Training Data

In [None]:
file_name = 'adult.data'

In [None]:
if not os.path.isfile(file_name):
    wget.download(f"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/{file_name}")

In [None]:
# Select the Verta dataset whose name is derived from MODEL_NAME
dataset = client.set_dataset(name=f"{MODEL_NAME} - Dataset")

In [None]:
# Create a dataset version from the local data file; it is attached to
# every experiment run inside run_experiment()
dataset_version = dataset.create_version(Path(file_name))

In [None]:
# adult.data has no header row: read it with header=None so the first record
# is kept as data instead of being consumed as column names.
df = pd.read_csv(file_name, header=None)

In [None]:
# Name the 15 columns explicitly; the last one is later used as the prediction target.
df.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    '<=50K',
]

In [None]:
df = df.dropna(axis=0)

In [None]:
obj_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

In [None]:
# Replace each categorical column with integer codes from a LabelEncoder
# fitted on that column. Assigning the encoded array column by column is
# positional, so the codes stay on the right rows even when df's index is no
# longer 0..n-1 (e.g. after dropna removed rows); assigning the DataFrame
# returned by DataFrame.apply would align on index instead.
for _col in obj_cols:
    df[_col] = LabelEncoder().fit_transform(df[_col])

In [None]:
feature_cols = list(df.columns[:-1])
target_col = df.columns[-1]

In [None]:
X = df.loc[:,feature_cols]
y = df.loc[:,target_col]

### 4.2 Train/Test Code

In [None]:
def run_experiment(X, y, hyperparams):
    """Train and evaluate one logistic-regression configuration as a Verta experiment run.

    The data is split 80/20 (shuffled). The model is fitted on the training
    part only and scored on the held-out part, so the logged ``val_acc`` is
    measured on rows the model has not seen.

    Args:
        X: Feature table accepted by ``train_test_split`` and ``LogisticRegression.fit``.
        y: Target labels aligned with ``X``.
        hyperparams: Keyword arguments forwarded to ``linear_model.LogisticRegression``.

    Returns:
        None. The run is created on the module-level ``client``; hyperparameters,
        the ``val_acc`` metric, the module-level ``dataset_version`` and code are
        logged to it, and the hyperparameters and accuracy are printed.
    """
    run = client.set_experiment_run()
    X_val_train, X_val_test, y_val_train, y_val_test = train_test_split(
        X, y, test_size=0.2, shuffle=True
    )

    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')

    model = linear_model.LogisticRegression(**hyperparams)
    # Fit on the training split only: fitting on the full data would let the
    # model see the validation rows and inflate val_acc.
    model.fit(X_val_train, y_val_train)

    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric('val_acc', val_acc)
    print(f"Validation accuracy: {round(val_acc, 4)}")

    run.log_dataset_version('train', dataset_version)

    run.log_code(autocapture=False)

In [None]:
hyperparam_candidates = {
    'C': [1e-6, 1e-4],
    'solver': ['lbfgs'],
    'max_iter': [15, 28],
}

In [None]:
hyperparam_sets = [
    dict(zip(hyperparam_candidates.keys(), values))
    for values in itertools.product(*hyperparam_candidates.values())
]

In [None]:
for hyperparams in hyperparam_sets:
    run_experiment(X, y, hyperparams)

In [None]:
best_run = expt.expt_runs.sort('metrics.val_acc', descending=True)[0]
best_hyperparams = best_run.get_hyperparameters()

In [None]:
print(f"Validation Accuracy: {round(best_run.get_metric('val_acc'), 4)}")
print(f"Hyperparameters: {best_hyperparams}")

In [None]:
model = linear_model.LogisticRegression(multi_class='auto', **best_hyperparams)

In [None]:
model.fit(X, y)

In [None]:
train_acc = model.score(X, y)

In [None]:
print(f"Training accuracy: {round(train_acc, 4)}")

## 5. Model Class

In [None]:
class Predictor(VertaModelBase):
    """Verta model class that serves predictions from a cloudpickled model."""

    def __init__(self, artifacts):
        """Load the serialized model.

        Args:
            artifacts: Mapping whose ``'serialized_model'`` entry is the path
                of the cloudpickle file holding the trained model.
        """
        # NOTE: unpickling executes arbitrary code; only load artifacts from a
        # trusted source. The context manager closes the file handle as soon
        # as the model has been loaded.
        with open(artifacts['serialized_model'], 'rb') as model_file:
            self.model = cloudpickle.load(model_file)

    @verify_io
    def predict(self, data):
        """Predict labels for every batch in ``data``.

        Args:
            data: Iterable of batches; each batch is passed unchanged to the
                loaded model's ``predict`` (see ``example`` for a sample input).

        Returns:
            A list with one entry per batch, each being the model's
            predictions for that batch converted with ``tolist()``.
        """
        return [self.model.predict(batch).tolist() for batch in data]

    def describe(self):
        """Return a dict describing the ``predict`` method, its inputs and its output."""
        return {
            'method': 'predict',
            'args': 'age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country',
            'returns': 'income_label',
            'description': 'Predicts whether a person has >50k income based on census data.',
            'input_description': 'Batch of census information, one sample per entry.',
            'output_description': 'String indicating whether the person earns more than 50k a year.'
        }

    def example(self):
        """Return a sample ``predict`` input: a list holding one batch of five encoded rows."""
        return [[
            [49, 4, 160187, 6, 5, 3, 8, 1, 2, 0, 0, 0, 16, 23],
            [52, 6, 209642, 11, 9, 2, 4, 0, 4, 1, 0, 0, 45, 39],
            [31, 4, 45781, 12, 14, 4, 10, 1, 4, 0, 14084, 0, 50, 39],
            [42, 4, 159449, 9, 13, 2, 4, 0, 4, 1, 5178, 0, 40, 39],
            [37, 4, 280464, 15, 10, 2, 4, 0, 2, 1, 0, 0, 80, 39]
        ]]

## 6. Model Test

In [None]:
artifacts = {'serialized_model': 'model.pkl'}

# Predictor loads the model from this path, so write the trained model there
# before the predictor is instantiated; otherwise a fresh run fails with
# FileNotFoundError.
with open(artifacts['serialized_model'], 'wb') as f:
    cloudpickle.dump(model, f)

In [None]:
# Instantiate the model class locally; its constructor loads the pickled
# model from artifacts['serialized_model']
predictor = Predictor(artifacts)

In [None]:
# Smoke test: run a prediction on the class's own example input
predictor.predict(predictor.example())

## 7. Model Register

In [None]:
with open('model.pkl', 'wb') as f:
    cloudpickle.dump(model, f)

In [None]:
registered_model = client.get_or_create_registered_model(name=MODEL_NAME)

In [None]:
model = registered_model.create_standard_model(
    name = VERSION,
    model_cls = Predictor,
    environment = Python(requirements=['scikit-learn']),
    model_api = ModelAPI(X, y.to_frame(name = 'income_label')),
    artifacts = artifacts
)

## 8. Model Endpoint

In [None]:
# Look up the endpoint named ENDPOINT_NAME, creating it if needed
endpoint = client.get_or_create_endpoint(ENDPOINT_NAME)

In [None]:
# Point the endpoint at the registered model version.
# NOTE(review): wait=True presumably blocks until the update has finished — confirm
endpoint.update(model, wait=True)

## 9. Predictions

In [None]:
deployed_model = endpoint.get_deployed_model()

In [None]:
deployed_model.predict([X.values.tolist()[5:10]])

In [None]:
# Uncomment the line below if you want to delete the created endpoint:
# endpoint.delete()