# New Start

In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
import ppscore as pps

dataset= 'CreditModellingTestCase.csv'

df = pd.read_csv('D:/Work/Libraries/Testing data/' + dataset, sep=';')
# Keep only rows with a known target. The explicit copy makes `df` an
# independent frame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
df = df.loc[df['default'].notnull()].copy()

# Feature types
y = 'default'
id_feats = 'uuid'
# Predictors are every column except the target and the row identifier.
x = df.columns.drop(y).drop(id_feats)
# Replace the target by its integer category codes.
df[y] = df[y].astype('category').cat.codes

# Split
# A fixed random_state makes the 80/20 split (and every score computed from
# it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df[x], df[y], test_size=0.2, random_state=1234
)

# Column groups are derived from the training frame's dtypes; they are used
# later to route columns to the numeric / categorical transformers.
numeric_features = x_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = x_train.select_dtypes(include=['object']).columns

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import category_encoders as ce # Categorical encoding methods
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

# Encode the training target. `fit` returns the encoder itself, so `le` and
# `label_encoder` are the same fitted object.
le = preprocessing.LabelEncoder()
label_encoder = le.fit(y_train)
y_train = le.transform(y_train)

# Planned pre-processing steps (only imputation, one-hot encoding and scaling
# are implemented in this cell):
#  0. split into train, validation and test
#  1. remove constant features
#  2. remove duplicated features, i.e. correlation = 1
#  3. remove all-missing features
#  4. format features, i.e. bool to int
#  5. identify features by type
#  6. date time feature encodings
#  7. imputation (all)
#  8. outliers (numeric)
#  9. transforms (numeric)
# 10. categorical interactions
# 11. categorical encodings
# 12. kmeans encodings (numeric)
# 13. polynomial features (numeric)
# 14. scaler (numeric)
# 15. pca features (numeric)

# Numeric columns: median imputation followed by standardisation.
numeric_steps = [
    ('impute', SimpleImputer(add_indicator=False, strategy='median')),
    ('scale', StandardScaler()),
]
num_transform = Pipeline(steps=numeric_steps)

# Categorical columns: most-frequent imputation followed by one-hot encoding.
categorical_steps = [
    ('impute', SimpleImputer(add_indicator=False, strategy='most_frequent')),
    ('encoder', ce.one_hot.OneHotEncoder()),
]
cat_transform = Pipeline(steps=categorical_steps)

# Route each column group to its own sub-pipeline.
column_groups = [
    ('num', num_transform, numeric_features),
    ('cat', cat_transform, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_groups)

# Currently disabled: fitting the preprocessor on its own, and a pipeline
# variant that appends a classifier step.
#   preprocessor.fit(x_train, y_train)
#   Pipeline([
#       ('preprocessor', preprocessor),
#       ('classifier', RandomForestClassifier(n_estimators=500))
#   ])

# The active pipeline contains the preprocessing stage only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Last expression of the cell: fits the pipeline and displays its repr.
pipeline.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('impute',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                              

In [83]:
# Apply the fitted preprocessing pipeline to the hold-out features.
# The training-set transform below is intentionally left disabled.
# NOTE(review): `x_test` is overwritten with its transformed version, so this
# cell is not idempotent - re-running it feeds already-transformed data back
# into `pipeline.transform`. Re-run the split cell first if needed.
#x_train = pipeline.transform(x_train)
x_test = pipeline.transform(x_test)

In [51]:
from sklearn.metrics import f1_score
# Macro-averaged F1, printed for the training partition first and the test
# partition second. `y_pred` is left holding the test-set predictions.
# NOTE(review): `model` is not defined in any cell visible in this notebook;
# it must already exist in the kernel for this cell to run - confirm where it
# is fitted.
for features, target in ((x_train, y_train), (x_test, y_test)):
    y_pred = model.predict(features)
    print(f1_score(target, y_pred, average='macro'))

1.0
0.5697170413984974


# New End

In [11]:
import pandas as pd
import numpy as np

dataset= 'CreditModellingTestCase.csv'

df=pd.read_csv('D:/Work/Libraries/Testing data/' + dataset, sep=';')

# Dataset-specific preparation: sets the target name `y`, the identifier
# columns `id_features`, and (for the credit data) adds synthetic columns used
# to exercise the pre-processing functions.
if dataset == 'CreditModellingTestCase.csv':
    # One consecutive calendar day per row, starting at 1900-01-01.
    df['datetime_feature'] = pd.date_range(start='01-01-1900', periods = df.shape[0]) # For testing
    df['time_partition_feature'] = pd.date_range(start='01-01-1900', periods = df.shape[0]) # For testing
    # Keep only rows with a known target; the explicit copy avoids pandas'
    # SettingWithCopyWarning on the column assignments that follow.
    df = df.loc[df['default'].notnull()].copy()

    # `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
    df['name_in_email'] = np.where(df['name_in_email'] == 'no_match', np.nan, df['name_in_email']) # To test imputation

    # Dummy indicator features
    df['has_paid_char'] = np.where(df['has_paid'] == 1, 'yes','NO')
    df['has_paid_logic'] = np.where(df['has_paid'] == 1, True, False)
    
    y = 'default'
    id_features = ['uuid','time_partition_feature']
    
elif dataset == 'HR_Analytics.csv':
    y = 'left'
    # This dataset gets a synthetic identifier taken from the row index.
    df['id'] = df.index
    id_features = 'id'

In [13]:
# Import all functions developed so far from this directory
import os
os.chdir("D:/Work/Libraries/yamll")
# Yet another machine learning library
from yamll import *

# Execute functions

In [None]:
# End-to-end run of the yamll pre-processing functions on `df`.
# Mapping tables (d_*) are produced by the map_* functions and then applied to
# the combined frame by the matching apply_* functions.

# 1. Perform eda on new dataset
eda = explore_df(df)

# 2. Remove problematic features
remove = eda.loc[(eda['constant'] == 1) | (eda['all_missing'] == 1)]['feature']
# Drop by column name. A bare `df.drop(remove)` targets row labels (axis 0),
# which raises KeyError whenever `remove` is non-empty.
df = df.drop(columns=list(remove))
# Features whose absolute skewness is at least 6 are candidates for the
# numeric transforms in step 6.4.
numeric_transforms = eda.loc[(eda['skewness'].abs() >= 6)]['feature']

# 2.1 Remove duplicated features
#df = drop_correlated_features(df, correlation_cutoff=1)

# 3. Format features 
df = format_features(df)

# 4. Partition data into train, valid and test
train, valid, test = partition_data(df=df, y=y)
df = combine_partitions(train, valid, test)

# 5. Detect feature types
# NOTE(review): the id list is hard-coded for the credit dataset rather than
# derived from `id_features` - confirm before running on another dataset.
numeric, categorical, datetime, flag = get_feature_types(df = df, y = y, id = ['split_ind','uuid','time_partition_feature'])
x_features = numeric + categorical + flag + datetime

# 6. Pre-processing
# 6.1 Datetime features
df = apply_datetime_encoding(df=df, x=datetime)

# 6.2 Imputation
#  - new functions created to avoid constant feature creation for tracking features
#  - the mapping is fitted on the rows where split_ind == 'train' only
d_imputation = map_imputation_encoding(df.loc[df['split_ind'] == 'train'].copy(), x = x_features)
df = apply_imputation_encoding(df=df, mapping_table=d_imputation, tracking_flags=False)

# 6.3 Outlier clipping
# NOTE(review): this mapping is fitted on the full frame, whereas 6.2 and 6.6
# fit on the train rows only - confirm whether map_outlier_encoding filters on
# split_ind internally.
d_outlier_encodings = map_outlier_encoding(df=df, x=numeric)
df = apply_outlier_encoding(df=df, mapping_table=d_outlier_encodings, method='percentile')

# 6.4 Numerical transforms
# Restrict the skewed-feature candidates to features that are still in use.
numeric_transforms = numeric_transforms[numeric_transforms.isin(x_features)]
df = apply_numeric_transforms(df=df, x=numeric_transforms)

# 6.5 Categorical interactions
#  - ToDo! Duplicate features are created in mapping table, need to create unique mappings then only apply
d_categorical_interactions = map_categorical_interactions(categorical)
df = apply_categorical_interactions(df, d_categorical_interactions)

# 6.6 Categorical encoding
cf = categorical + list('interaction_' + d_categorical_interactions['base_feature'] + '_' + d_categorical_interactions['interacted_feature']) # Include categorical interaction features
d_categorical_encoding = map_categorical_encoding(df=df.loc[df['split_ind'] == 'train'].copy(), x = cf, y=y)
df = apply_categorical_encoding(df=df, mapping_table=d_categorical_encoding, encode_mode='onehot', tracking_flags=False)

# 6.7 Kmeans features
# NOTE(review): fitted on the full frame as well (see the note on 6.3).
d_kmeans_encoding = map_kmeans_encoding(df=df, x=numeric, clusters=5, sample_size=0.3, seed=1234)
df = apply_kmeans_encoding(df=df, mapping_table=d_kmeans_encoding, encode_type='distance_to_center')

print("Done")

In [None]:
# Count missing values per column of `df`, then display only the columns that
# still contain at least one missing value.
check = df.isnull().sum().to_frame(name='missing')
check[check['missing'] > 0]

In [None]:
#df.to_csv('D:/Work/Libraries/yamll/test.csv')

# Development of new functions

In [None]:
# Scratch setup for a future single-entry `preprocess` function.
y = 'default'
pipeline = make_pipeline("test")

# Sketched signature (not implemented yet):
#   def preprocess(df, x, y=None, pipeline, mapping_list=None):
# As written it is not valid Python: `pipeline` has no default but follows
# `y=None`, so it must move before `y` or receive a default.

# Fitted mapping tables keyed by name (a dict, despite the variable's name),
# in the order the preprocessing steps produce them.
mapping_list = dict([
    ("d_imputation", d_imputation),
    ("d_outlier_encodings", d_outlier_encodings),
    ("d_categorical_interactions", d_categorical_interactions),
    ("d_categorical_encoding", d_categorical_encoding),
    ("d_kmeans_encoding", d_kmeans_encoding),
])

In [None]:
# Display whatever object is currently bound to `pipeline` in the kernel.
pipeline


