In [41]:
import pandas as pd
from src.utils import load_artifact, process_data, get_cat_features
from src.model import inference
from pydantic import BaseModel
from typing_extensions import Literal
import numpy as np

# Cleaning data

In [78]:
# Load the census dataset from disk. The header row carries leading spaces
# (visible in the column listing below), which a later cell strips.
df = pd.read_csv('data/census.csv')

In [79]:
# Inspect the raw column labels exactly as read from the CSV.
df.columns

Index(['age', ' workclass', ' fnlgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' salary'],
      dtype='object')

In [80]:
# Normalise the column labels: trim surrounding whitespace and turn hyphens
# into underscores so the names match the ModelInput field names.
df.columns = df.columns.str.strip().str.replace('-', '_', regex=False)


In [81]:
# Binarise the target: 1 when the label contains '>' (i.e. '>50K'), else 0.
df['salary'] = df['salary'].map(lambda label: 1 if '>' in label else 0)

# Drop rows whose country is the '?' placeholder. Copy the result so the
# column assignment below writes to an independent frame rather than to a
# slice of the previous one (which raises SettingWithCopyWarning).
df = df[df['native_country'].map(lambda country: '?' not in country)].copy()

# Encode sex as 1 for 'Male' and 0 otherwise. Only the column *names* have been
# stripped at this point; the cell values presumably still carry the same
# leading space as the headers did, so strip each value before comparing --
# otherwise a padded ' Male' never equals 'Male' and every row becomes 0.
df['sex'] = df['sex'].map(lambda value: 1 if value.strip() == 'Male' else 0)

In [82]:
# Persist the cleaned frame without the index column.
# NOTE(review): this writes to the same path the raw data was read from, so the
# raw file is replaced and re-running the cleaning cells would then operate on
# already-cleaned data -- consider a separate output path.
df.to_csv('data/census.csv', index=False)

In [83]:
# Load the three persisted artifacts (model, encoder, label binarizer) in one
# pass. The files are .pkl, so load_artifact presumably unpickles them -- only
# point this at trusted files.
model, encoder, lb = map(
    load_artifact,
    ("model/model.pkl", "model/encoder.pkl", "model/lb.pkl"),
)

In [84]:
# Fetch the project's categorical feature list (presumably column names; it is
# passed to process_data as `categorical_features` below).
cat_features = get_cat_features()

In [85]:
# Transform the cleaned frame with the already-fitted encoder / binarizer
# (training=False); only the feature matrix is kept.
# NOTE(review): no label column is passed here. If process_data keeps every
# non-categorical column, the `salary` target may be counted as a feature --
# possibly the extra column behind the 107-vs-106 mismatch seen below; confirm
# against process_data's signature.
X, _, _, _ = process_data(
    df,
    categorical_features=cat_features,
    encoder=encoder,
    lb=lb,
    training=False,
)

In [89]:
# Check the (rows, features) shape of the processed matrix.
X.shape

(31978, 107)

In [87]:
# Pass the loaded model and the processed features to the project's inference
# helper. The recorded output shows this failing on a feature-count mismatch.
inference(model, X)

ValueError: X has 107 features, but RandomForestClassifier is expecting 106 features as input.

In [21]:
# Refit the loaded model on X.
# NOTE(review): `y` is not defined by any cell shown in this notebook (the
# process_data call above discards the labels), so this only works with
# leftover kernel state and will fail on Restart & Run All.
model.fit(X, y)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [32]:
# Allowed values for each categorical field, one alias per field.
_Workclass = Literal[
    'State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
    'Local-gov', 'Self-emp-inc', 'Without-pay',
]
_Education = Literal[
    'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
    'Assoc-acdm', '7th-8th', 'Doctorate', 'Assoc-voc', 'Prof-school',
    '5th-6th', '10th', 'Preschool', '12th', '1st-4th',
]
_MaritalStatus = Literal[
    "Never-married", "Married-civ-spouse", "Divorced",
    "Married-spouse-absent", "Separated", "Married-AF-spouse", "Widowed",
]
_Occupation = Literal[
    "Tech-support", "Craft-repair", "Other-service", "Sales",
    "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
    "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
    "Transport-moving", "Priv-house-serv", "Protective-serv",
    "Armed-Forces",
]
_Relationship = Literal[
    "Wife", "Own-child", "Husband",
    "Not-in-family", "Other-relative", "Unmarried",
]
_Race = Literal[
    "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black",
]
_Sex = Literal["Female", "Male"]
_NativeCountry = Literal[
    'United-States', 'Cuba', 'Jamaica', 'India', 'Mexico', 'Puerto-Rico',
    'Honduras', 'England', 'Canada', 'Germany', 'Iran', 'Philippines',
    'Poland', 'Columbia', 'Cambodia', 'Thailand', 'Ecuador', 'Laos',
    'Taiwan', 'Haiti', 'Portugal', 'Dominican-Republic', 'El-Salvador',
    'France', 'Guatemala', 'Italy', 'China', 'South', 'Japan',
    'Yugoslavia', 'Peru', 'Outlying-US(Guam-USVI-etc)', 'Scotland',
    'Trinadad&Tobago', 'Greece', 'Nicaragua', 'Vietnam', 'Hong',
    'Ireland', 'Hungary', 'Holand-Netherlands',
]


class ModelInput(BaseModel):
    """Validated request payload describing one census record.

    Numeric attributes are plain ints; every categorical attribute is
    restricted to a fixed set of string values via ``Literal``. Field names
    use underscores in place of the hyphens found in the dataset headers.
    """

    age: int
    workclass: _Workclass
    fnlgt: int
    education: _Education
    education_num: int
    marital_status: _MaritalStatus
    occupation: _Occupation
    relationship: _Relationship
    race: _Race
    sex: _Sex
    capital_gain: int
    capital_loss: int
    hours_per_week: int
    native_country: _NativeCountry

    class Config:
        # Sample payload attached to the generated schema.
        schema_extra = {
            "example": {
                "age": 27,
                "workclass": 'State-gov',
                "fnlgt": 77516,
                "education": 'Bachelors',
                "education_num": 13,
                "marital_status": "Never-married",
                "occupation": "Tech-support",
                "relationship": "Unmarried",
                "race": "White",
                "sex": "Female",
                "capital_gain": 2000,
                "capital_loss": 0,
                "hours_per_week": 35,
                "native_country": 'United-States',
            }
        }
        

In [33]:
def predict(input: ModelInput):
    """Predict the income class for a single validated census record.

    Relies on the notebook-level ``model``, ``encoder`` and ``lb`` artifacts.

    Args:
        input: A ``ModelInput`` instance holding one person's attributes.

    Returns:
        dict: ``{"Income prediction": label}`` where ``label`` is the first
        element of ``lb.inverse_transform`` applied to the model output.

    NOTE(review): the cleaning cells in this notebook encode ``sex`` as 1/0
    before saving, whereas this function forwards the raw 'Female'/'Male'
    string -- confirm which representation the encoder was fitted on.
    """
    # Same order as the dataset columns (minus the `salary` target). The third
    # name is "fnlgt", matching both the dataset header and the model field.
    feature_names = [
        "age",
        "workclass",
        "fnlgt",
        "education",
        "education_num",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
        "native_country",
    ]

    # Build the frame from a record dict rather than a NumPy array: a mixed
    # int/str ``np.array`` is coerced to one unicode dtype, which would turn
    # every numeric feature into a string before it reaches the model.
    record = {name: getattr(input, name) for name in feature_names}
    input_df = pd.DataFrame([record], columns=feature_names)
    cat_features = get_cat_features()

    X, _, _, _ = process_data(
        input_df, categorical_features=cat_features, encoder=encoder, lb=lb, training=False)
    y = inference(model, X)
    pred = lb.inverse_transform(y)[0]

    return {"Income prediction": pred}

In [49]:
# Feature columns in dataset order (the `salary` target is excluded). The third
# entry is spelled "fnlgt" to match the dataset header and the ModelInput field.
original_cols = [
    "age",
    "workclass",
    "fnlgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
]

In [55]:
# Confirm the number of feature columns.
len(original_cols)

14

In [54]:
# Confirm the single-row input has one value per feature column.
input_data.shape

(1, 14)

In [59]:
# Collect the validated fields as one row, in dataset column order.
# dtype=object keeps the ints as ints; without it NumPy coerces the mixed
# int/str row to a single unicode dtype and every number becomes a string.
input_data = np.array(
    [
        [
            new_ipt.age,
            new_ipt.workclass,
            new_ipt.fnlgt,
            new_ipt.education,
            new_ipt.education_num,
            new_ipt.marital_status,
            new_ipt.occupation,
            new_ipt.relationship,
            new_ipt.race,
            new_ipt.sex,
            new_ipt.capital_gain,
            new_ipt.capital_loss,
            new_ipt.hours_per_week,
            new_ipt.native_country,
        ]
    ],
    dtype=object,
)
# infer_objects() restores integer dtypes on the numeric columns.
input_df = pd.DataFrame(data=input_data, columns=original_cols).infer_objects()

In [61]:
# Run the single-row frame through the same preprocessing as the full
# dataset, reusing the fitted encoder / binarizer (training=False).
X, _, _, _ = process_data(
    input_df,
    categorical_features=cat_features,
    encoder=encoder,
    lb=lb,
    training=False,
)

In [62]:
# Check how many features the single processed row ends up with.
X.shape

(1, 107)

In [34]:
# Sample payload used to exercise ModelInput and predict().
ipt = dict(
    age=27,
    workclass="State-gov",
    fnlgt=77516,
    education="Bachelors",
    education_num=13,
    marital_status="Never-married",
    occupation="Tech-support",
    relationship="Unmarried",
    race="White",
    sex="Female",
    capital_gain=2000,
    capital_loss=0,
    hours_per_week=33,
    native_country="United-States",
)

In [39]:
# Validate the sample payload by constructing a ModelInput from it.
new_ipt = ModelInput(**ipt)

In [42]:
# Smoke-test predict() with the validated sample. The recorded output shows
# this failing on a feature-count mismatch.
predict(new_ipt)

ValueError: X has 107 features, but RandomForestClassifier is expecting 109 features as input.

In [32]:
# Trim the whitespace surrounding each column label.
df.columns = df.columns.str.strip()

In [33]:
# Trim surrounding whitespace from every string cell; other values pass through
# unchanged. isinstance() is used so that str subclasses are stripped as well.
# NOTE: DataFrame.applymap is deprecated from pandas 2.1 in favour of
# DataFrame.map -- switch once the project's pandas version allows it.
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [34]:
# Look at the class balance of the target before encoding it.
df.salary.value_counts()

<=50K    24720
>50K      7841
Name: salary, dtype: int64

In [35]:
# Binarise the target: 1 when the label contains '>' (the '>50K' class), else 0.
df['salary'] = df['salary'].map(lambda label: int('>' in label))

In [36]:
# Inspect the country distribution; '?' shows up as a placeholder value.
df['native-country'].value_counts()

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                      

In [37]:
# Keep only the rows whose country is not the '?' placeholder.
df = df.loc[df['native-country'].map(lambda country: '?' not in country)]

In [38]:
# Check the distinct values of `sex` before encoding them.
df.sex.value_counts()

Male      21370
Female    10608
Name: sex, dtype: int64

In [39]:
# Keep only rows with a known country (a no-op if already filtered). Copy the
# result so the column assignment below writes to an independent frame rather
# than to a slice, which would raise SettingWithCopyWarning.
df = df[df['native-country'].map(lambda country: '?' not in country)].copy()

# Encode sex as 1 for 'Male' and 0 otherwise, using a plain equality test.
# (`x == 'Male' in x` is a chained comparison, i.e.
# `x == 'Male' and 'Male' in x`, which only happens to give the same result.)
df['sex'] = df['sex'].map(lambda value: 1 if value == 'Male' else 0)

In [40]:
# Inspect the distribution of `race` values.
df.race.value_counts()

White                 27430
Black                  3028
Asian-Pac-Islander      956
Amer-Indian-Eskimo      311
Other                   253
Name: race, dtype: int64

In [41]:
# Save the cleaned data. index=False keeps the (now non-contiguous, because
# rows were filtered) index out of the file; otherwise it is written as an
# unnamed first column and comes back as an extra column when the CSV is read.
df.to_csv('data/clean_census.csv', index=False)