In [2]:
# Import all the required libraries
import pureml
from pureml.decorators import load_data,transformer,dataset,model
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings

# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.simplefilter("ignore")
# One seed for the notebook: it seeds NumPy's global RNG here and is passed to
# LightGBM as "random_state" in create_model.
rand_seed = 1234
np.random.seed(rand_seed)

In [2]:
@load_data()
def load_dataset():
    """Read 'default of credit card clients.xls' into a DataFrame.

    ``header=1`` makes pandas take the column labels from the spreadsheet row
    at 0-based index 1.

    Returns:
        The DataFrame produced by ``pd.read_excel``.
    """
    return pd.read_excel(io='default of credit card clients.xls', header=1)

In [3]:
@transformer()
def remove_columns(df):
    """Return a new DataFrame equal to ``df`` without its ``ID`` column."""
    columns_to_drop = ['ID']
    return df.drop(columns=columns_to_drop)

In [4]:
@transformer()
def rename_columns(df):
    """Return ``df`` with ``PAY_0`` renamed to ``PAY_1`` and the target column renamed to ``default``."""
    new_names = {
        "PAY_0": "PAY_1",
        "default payment next month": "default",
    }
    return df.rename(columns=new_names)

In [5]:
@transformer()
def dataset_imbalances(df):
    """Split ``df`` into features, target and sensitive attribute, adding a synthetic feature.

    Steps:
      1. Cast ``SEX``, ``EDUCATION`` and ``MARRIAGE`` to the ``category`` dtype.
      2. Take ``default`` as the target ``Y`` and ``SEX`` as the sensitive
         attribute ``A_str``.
      3. One-hot encode the remaining columns with ``pd.get_dummies``.
      4. Append an ``Interest`` column drawn from a normal distribution whose
         mean is ``2 * Y`` and whose standard deviation is the row's ``SEX``
         value.

    The input frame is not modified; the casts are applied to a new frame.

    Args:
        df: DataFrame containing at least the columns ``SEX``, ``EDUCATION``,
            ``MARRIAGE`` and ``default``.

    Returns:
        dict with keys ``'X'`` (feature DataFrame including ``Interest``),
        ``'Y'`` (target Series) and ``'A_str'`` (categorical ``SEX`` Series).
    """
    categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]

    # astype() returns a new frame, so the caller's DataFrame keeps its dtypes.
    df = df.astype({col_name: "category" for col_name in categorical_features})

    Y, A_str = df.loc[:, "default"], df.loc[:, "SEX"]
    X = pd.get_dummies(df.drop(columns=["default", "SEX"]))

    # Synthetic feature: its mean depends on the label and its spread on SEX.
    # NOTE(review): SEX is presumably coded 1 = male, 2 = female — confirm
    # against the data dictionary.
    interest_values = np.random.normal(loc=2 * Y, scale=A_str)

    # assign() attaches the values positionally on X's own index, so a frame
    # whose index is not 0..n-1 does not get NaN-padded by index alignment.
    X = X.assign(Interest=interest_values)

    return {'X':X,'Y':Y,'A_str':A_str}


In [6]:
@transformer()
def resample_training_data(X_train, Y_train, A_train):
    """Down-sample the negative class so the training labels are balanced.

    All rows with ``Y_train == 1`` are kept. From the rows with
    ``Y_train == 0`` a random subset of the same size is drawn *without
    replacement*, so no negative row is selected twice. If there are fewer
    negatives than positives, every negative row is kept.

    Sampling uses NumPy's global RNG, so seed it beforehand for
    reproducible results.

    Args:
        X_train: feature DataFrame.
        Y_train: label Series with values 0/1, sharing ``X_train``'s index.
        A_train: sensitive-attribute Series sharing the same index.

    Returns:
        dict with keys ``"X_train"``, ``"Y_train"`` and ``"A_train"`` holding
        the inputs restricted to the balanced set of index labels.
    """
    negative_ids = Y_train[Y_train == 0].index
    positive_ids = Y_train[Y_train == 1].index

    # replace=False needs size <= population, hence the min().
    n_negatives = min(len(positive_ids), len(negative_ids))
    sampled_negative_ids = np.random.choice(
        a=negative_ids, size=n_negatives, replace=False
    )
    balanced_ids = positive_ids.union(sampled_negative_ids)

    return {
        "X_train": X_train.loc[balanced_ids, :],
        "Y_train": Y_train.loc[balanced_ids],
        "A_train": A_train.loc[balanced_ids],
    }

In [8]:
from pureml.decorators import dataset

@dataset(label='credit_datasetexample_final_desc:main',upload=True)
def create_dataset():
    """Run the full preparation pipeline and return the train/test splits.

    Loads the spreadsheet, drops and renames columns, builds the feature,
    target and sensitive-attribute objects, makes a stratified 65/35
    train/test split, and rebalances the training part.

    Returns:
        dict with keys ``x_train``, ``y_train``, ``x_test``, ``y_test`` and
        ``sensitive_features``; the last two are NumPy arrays taken from the
        test split.
    """
    cleaned = rename_columns(remove_columns(load_dataset()))
    prepared = dataset_imbalances(cleaned)
    features, target, sensitive = prepared['X'], prepared['Y'], prepared['A_str']

    X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
        features, target, sensitive, test_size=0.35, stratify=target
    )

    balanced = resample_training_data(X_train, y_train, A_train)

    sensitive_test = A_test.to_numpy()
    labels_test = y_test.to_numpy()
    print(sensitive_test)

    return {
        "x_train": balanced['X_train'],
        "y_train": balanced['Y_train'],
        "x_test": X_test,
        "y_test": labels_test,
        "sensitive_features": sensitive_test,
    }


# Run the pipeline; as the cell's last expression, the returned dict is displayed below.
create_dataset()

[2 1 2 ... 2 2 2]


{'x_train':        LIMIT_BAL  AGE  PAY_1  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6  BILL_AMT1  \
 0          20000   24      2      2     -1     -1     -2     -2       3913   
 1         120000   26     -1      2      0      0      0      2       2682   
 3          50000   37      0      0      0      0      0      0      46990   
 9          20000   35     -2     -2     -2     -2     -1     -1          0   
 10        200000   34      0      0      2      0      0     -1      11073   
 ...          ...  ...    ...    ...    ...    ...    ...    ...        ...   
 29983      20000   44     -2     -2     -2     -2     -2     -2       1822   
 29991     210000   34      3      2      2      2      2      2       2500   
 29994      80000   34      2      2      2      2      2      2      72557   
 29997      30000   37      4      3      2     -1      0      0       3565   
 29999      50000   46      0      0      0      0      0      0      47929   
 
        BILL_AMT2  ...  EDUCATION_2  ED

In [9]:
# Fetch version v1 of the registered dataset and pull out the test split.
data = pureml.dataset.fetch('credit_datasetexample_final_desc:main:v1')
x_test, y_test = data['x_test'], data['y_test']


In [10]:
# Sensitive attribute for the test split, stored by create_dataset under this key.
a_test = data["sensitive_features"]

In [11]:
# print(x_test.shape,y_test.shape,a_test.shape)
# Check that the sensitive-feature array came back from the fetched dataset.
print(a_test)

[2 1 2 ... 2 2 2]


In [4]:
@model(label='credit_modelexample_final_desc:main')
def create_model():
    """Fit a StandardScaler + LightGBM pipeline on the registered training split.

    The training data is fetched from dataset version
    'credit_datasetexample_final_desc:main:v1', and the LightGBM parameters
    are logged through ``pureml.log`` before fitting.

    Returns:
        The fitted ``sklearn.pipeline.Pipeline``.
    """
    fetched = pureml.dataset.fetch('credit_datasetexample_final_desc:main:v1')
    x_train, y_train = fetched['x_train'], fetched['y_train']

    # "objective" is left unset here (the "binary" entry was commented out).
    lgb_params = {
        "metric": "auc",
        "learning_rate": 0.03,
        "num_leaves": 10,
        "max_depth": 3,
        "random_state": rand_seed,
        "n_jobs": 1,
    }
    pureml.log(params=lgb_params)

    scaling_step = ("preprocessing", StandardScaler())
    classifier_step = ("classifier", lgb.LGBMClassifier(**lgb_params))
    estimator = Pipeline(steps=[scaling_step, classifier_step])

    estimator.fit(x_train, y_train)
    return estimator

# Train the model by calling the decorated function; the training log is shown below.
create_model()

[LightGBM] [Info] Number of positive: 4313, number of negative: 4313
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3514
[LightGBM] [Info] Number of data points in the train set: 8626, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


No metrics are found in config


No figures are found in config


In [5]:
# Fetch version v2 of the registered model; the returned object is not stored in a variable.
# NOTE(review): v2 is presumably the version created by the run above — confirm.
pureml.model.fetch(label='credit_modelexample_final_desc:main:v2')

In [6]:
# Register 'predict.py' under the 'predict' path for model version v2.
# NOTE(review): 'predict.py' is a relative path, presumably resolved from the working directory — confirm.
pureml.predict.add(label ='credit_modelexample_final_desc:main:v2',paths={'predict':'predict.py'})

In [7]:
# Fetch the prediction entry for model version v2 (same label as the add call).
pureml.predict.fetch(label='credit_modelexample_final_desc:main:v2')

In [1]:
import pureml
# Evaluate model v2 against dataset v1 for both the classification and fairness task types.
pureml.eval(
    task_type=['classification','fairness'],
    label_model='credit_modelexample_final_desc:main:v2',
    label_dataset='credit_datasetexample_final_desc:main:v1',
)



{'complete': {'performance': {'accuracy': {'value': 0.7839047619047619,
    'severity': 'moderate',
    'threshold': 0.8,
    'summary': 'accuracy has a value of 0.78 with moderate as Severity Index against the 0.8 as threshold ',
    'description': 'This test checks the Accuracy metric to see both if its performance on the evaluation set alone is satisfactory, as well as if performance in terms of Accuracy has degraded from the reference to evaluation set. The key detail displays whether the given performance metric has degraded beyond a defined threshold.',
    'matters': 'During production, factors like distribution shift or a change in <span>p(y|x)</span> may cause model performance to decrease significantly.'},
   'precision': {'value': 0.5073369565217392,
    'severity': 'pass',
    'threshold': 0.5,
    'summary': 'precision has a value of 0.51 with moderate as Severity Index against the 0.5 as threshold ',
    'description': 'This test checks the Precision metric to see both if

In [None]:
import pureml
# Show the installed pureml version, for reproducibility.
pureml.__version__