In [1]:
import os
import pandas as pd
import mlflow
from sklearn.compose import make_column_selector
from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelBinarizer
import great_expectations as gx

print("current directory:", os.getcwd())

  from .autonotebook import tqdm as notebook_tqdm


current directory: /Users/achillejuniormbogoltouye/Documents/mlops_training_esme/notebook


In [6]:
# load dataset
df = pd.read_csv(
    "/Users/achillejuniormbogoltouye/Documents/mlops_training_esme/datalake/raw_data/census.csv"
    )

In [None]:
mlflow.get_artifact_uri

In [64]:
df.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlgt           32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [66]:
cat_cols = make_column_selector(dtype_include="object")(df)
print("categorical columns:", cat_cols)

categorical columns: [' workclass', ' education', ' marital-status', ' occupation', ' relationship', ' race', ' sex', ' native-country', ' salary']


In [68]:
num_cols = make_column_selector(dtype_include="number")(df)
print("numerical columns:", num_cols)

numerical columns: ['age', ' fnlgt', ' education-num', ' capital-gain', ' capital-loss', ' hours-per-week']


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type(


In [6]:
df.isna().sum()

age                0
 workclass         0
 fnlgt             0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 salary            0
dtype: int64

In [7]:
df.describe()

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type(


Unnamed: 0,age,fnlgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


### Data quality issues

In [56]:
# create data context
data_context=gx.get_context()
data_source = data_context.data_sources.add_pandas(name="census_data_source")
data_asset = data_source.add_dataframe_asset(name="census_data_asset")
batch_definition = data_asset.add_batch_definition(name="census_data_batch")
batch_data=batch_definition.get_batch(batch_parameters={"dataframe": df})


In [61]:
# define expectations
expectation = gx.expectations.ExpectColumnMaxToBeBetween(
    column="age",
    min_value=1,
    max_value=90
)
# run expectation
validation_result = batch_data.validate(expectation)

Calculating Metrics: 100%|██████████| 4/4 [00:00<00:00, 653.37it/s] 


In [62]:
assert validation_result.success ==True, "Validation failed: Age values are not between 1 and 90."



### data drift with evidently

In [19]:
import evidently
from evidently import DataDefinition, Dataset, Report #ColumnMapping

In [None]:
DataDefinition()

[0;31mInit signature:[0m
[0mDataDefinition[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mid_column[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtimestamp[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnumerical_columns[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcategorical_columns[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_columns[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdatetime_columns[0m[0;34m:[0m 

In [130]:
df[df[" marital-status"]== " Married-civ-spouse"][" salary"].values

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' >50K', ' >50K'],
      dtype=object)

In [129]:
df[df[" marital-status"] == " Married-civ-spouse"]

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32552,43,Private,84661,Assoc-voc,11,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,<=50K
32554,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


In [4]:
df.tail()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [5]:
# Stripping whitespace from column names
df.columns = df.columns.str.strip()

In [12]:
df.columns

Index(['age', 'workclass', 'fnlgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [13]:
df.drop(columns=["salary"],)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [7]:
df.isna().sum()

age               0
workclass         0
fnlgt             0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [94]:
df.duplicated().sum()

24

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlgt           32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [9]:
df[df.columns[-1]].value_counts(normalize=True)

salary
<=50K    0.75919
>50K     0.24081
Name: proportion, dtype: float64

In [65]:
cat_features =['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country'
    ]
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, StandardScaler

def process_data(
    X,
    categorical_features=[],
    label=None,
    training=True,
    lb=None,
    encoder=None,
    scaler=None,
):

    # Stripping whitespace from column names
    X.columns = X.columns.str.strip()

    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(*[categorical_features], axis=1)
    if training:
        scaler = StandardScaler()
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        lb = LabelBinarizer()

        X_continuous = scaler.fit_transform(X_continuous)
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
        
        

    return X_categorical, X_continuous, y

In [66]:
import numpy as np

In [68]:
 x_categorical, X_continuous, y = process_data(
    df,
    categorical_features=cat_features,
    label="salary",
    training=True,
    lb=None,
    encoder=None,
    scaler=None,
)

In [69]:
x

Unnamed: 0,age,fnlgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
32556,27,257302,12,0,0,38
32557,40,154374,9,0,0,40
32558,58,151910,9,0,0,40
32559,22,201490,9,0,0,20


In [55]:
x_categorical

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [38]:
 X_continuous

array([[ 0.03067056, -1.06361075,  1.13473876,  0.1484529 , -0.21665953,
        -0.03542945],
       [ 0.83710898, -1.008707  ,  1.13473876, -0.14592048, -0.21665953,
        -2.22215312],
       [-0.04264203,  0.2450785 , -0.42005962, -0.14592048, -0.21665953,
        -0.03542945],
       ...,
       [ 1.42360965, -0.35877741, -0.42005962, -0.14592048, -0.21665953,
        -0.03542945],
       [-1.21564337,  0.11095988, -0.42005962, -0.14592048, -0.21665953,
        -1.65522476],
       [ 0.98373415,  0.92989258, -0.42005962,  1.88842434, -0.21665953,
        -0.03542945]])

In [39]:

y 

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
#profile = ProfileReport(df, title="Profiling Report", explorative=True)
#profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
categorical_columns=make_column_selector(dtype_include=object)(df)
categorical_columns

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'salary']

In [11]:
numerical_columns=make_column_selector(dtype_exclude=object)(df)
numerical_columns

['age',
 'fnlgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [113]:
from pydantic import BaseModel, Field

class Input_Features(BaseModel):
    # Define the input features with their types
    age: int
    salary: str 
    workclass: str
    fnlgt: int
    education: str
    education_num: int = Field(alias="education-num")
    marital_status: str = Field(alias="marital-status")
    occupation: str
    relationship: str
    race: str
    sex: str
    capital_gain: int   = Field(alias="capital-gain")
    capital_loss: int   = Field(alias="capital-loss")
    hours_per_week: int = Field(alias="hours-per-week")
    native_country: str = Field(alias="native-country")

    class Config:
        populate_by_name = True  # Allows aliasing (useful for column names with dashes)

In [100]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(
    df, test_size=0.20, random_state=42, shuffle=True, stratify=df["salary"]
)

In [114]:
input_data=df.iloc[0].to_dict()
input_data

{'age': 39,
 'workclass': ' State-gov',
 'fnlgt': 77516,
 'education': ' Bachelors',
 'education-num': 13,
 'marital-status': ' Never-married',
 'occupation': ' Adm-clerical',
 'relationship': ' Not-in-family',
 'race': ' White',
 'sex': ' Male',
 'capital-gain': 2174,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': ' United-States',
 'salary': ' <=50K'}

In [116]:
Input_Features(**input_data)

Input_Features(age=39, salary=' <=50K', workclass=' State-gov', fnlgt=77516, education=' Bachelors', education_num=13, marital_status=' Never-married', occupation=' Adm-clerical', relationship=' Not-in-family', race=' White', sex=' Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native_country=' United-States')

In [124]:
input_df = pd.DataFrame([Input_Features(**input_data).model_dump(by_alias=True)])

In [120]:
input_df

Unnamed: 0,age,salary,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,<=50K,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States


In [121]:
Input_Features(**input_data).model_dump(by_alias=True)

{'age': 39,
 'salary': ' <=50K',
 'workclass': ' State-gov',
 'fnlgt': 77516,
 'education': ' Bachelors',
 'education-num': 13,
 'marital-status': ' Never-married',
 'occupation': ' Adm-clerical',
 'relationship': ' Not-in-family',
 'race': ' White',
 'sex': ' Male',
 'capital-gain': 2174,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': ' United-States'}

In [111]:
pd.DataFrame.from_dict(Input_Features(**input_data).model_dump())

ValueError: If using all scalar values, you must pass an index

In [44]:
df.head(3)

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [45]:
df.iloc[0].to_dict()

{'age': 39,
 'workclass': ' State-gov',
 'fnlgt': 77516,
 'education': ' Bachelors',
 'education_num': 13,
 'marital_status': ' Never-married',
 'occupation': ' Adm-clerical',
 'relationship': ' Not-in-family',
 'race': ' White',
 'sex': ' Male',
 'capital_gain': 2174,
 'capital_loss': 0,
 'hours_per_week': 40,
 'native_country': ' United-States',
 'salary': ' <=50K'}

In [23]:
df.head(3).pop("salary")

0     <=50K
1     <=50K
2     <=50K
Name: salary, dtype: object

In [72]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown="ignore")
X = [["Male", 1], ["Female", 3], ["Female", 2]]


pd.DataFrame(X)

Unnamed: 0,0,1
0,Male,1
1,Female,3
2,Female,2


In [None]:
enc.fit(X)
enc.categories_
enc.transform([["Female", 1], ["Male", 4]]).toarray()
enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
enc.get_feature_names_out(["gender", "group"])

In [43]:
X=df.drop(labels="salary", axis=1)
y=df["salary"]

In [66]:
df["native-country"].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1, random_state=42, shuffle=True,stratify=y)

In [55]:
X_train.shape

(29304, 14)

In [None]:
categorical_features = make_column_selector(dtype_include=object)(df)

['_ProfileReport__initialize_dataframe',
 '_ProfileReport__validate_inputs',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_description_set',
 '_df_hash',
 '_html',
 '_json',
 '_render_html',
 '_render_json',
 '_render_widgets',
 '_report',
 '_repr_html_',
 '_sample',
 '_summarizer',
 '_type_schema',
 '_typeset',
 '_widgets',
 'compare',
 'config',
 'description_set',
 'df',
 'df_hash',
 'dump',
 'dumps',
 'get_description',
 'get_duplicates',
 'get_rejected_variables',
 'get_sample',
 'html',
 'invalidate_cache',
 'json',
 'load',
 'loads',
 'report',
 'summarizer',
 'to_expectation_suite',
 'to_file',
 'to_html',
 'to_json',
 '