In [6]:
import pickle

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import src.config.column_names as col
import src.config.base as base
from src.infrastructure.build_dataset import DataBuilderFactory, DataMerger
from src.domain.cleaning import MissingValueTreatment
from interpretability_utilities import plot_features_importance

In [None]:
# NOTE(review): this cell was never executed and originally used bare names
# (client_file, config_client_data, ...) that no visible cell defines, so it
# raised NameError on "Restart & Run All". The names are now resolved through
# the imported config modules, mirroring the executed "Build datasets" cell
# below — this cell duplicates that one and is a candidate for removal.

# Preprocessing client data
client_builder = DataBuilderFactory(base.TRAIN_CLIENT_DATA_PATH,
                                    base.config_client_data,
                                    base.ALL_CLIENT_DATA_TRANSLATION)
client_data = client_builder.transform('client')

# Preprocessing eco data
eco_builder = DataBuilderFactory(base.TRAIN_ECO_DATA_PATH,
                                 base.config_eco_data)
eco_data = eco_builder.transform('eco')

# Merging files to final dataset
X, y = DataMerger(client_data, eco_data, col.MERGER_FIELD).transform()
# Encode the target as 1 where it equals 'Yes' and 0 otherwise.
y = y.eq('Yes').astype(int)

In [10]:
# Build datasets

# Client data: path, config and translation table all come from `base`.
client_builder = DataBuilderFactory(
    base.TRAIN_CLIENT_DATA_PATH,
    base.config_client_data,
    base.ALL_CLIENT_DATA_TRANSLATION,
)
client_data = client_builder.transform('client')

# Economic data: built from its own path and config; no translation table is passed.
eco_builder = DataBuilderFactory(
    base.TRAIN_ECO_DATA_PATH,
    base.config_eco_data,
)
eco_data = eco_builder.transform('eco')

# Combine both sources into features X and target y
# (DataMerger is given col.MERGER_FIELD as its third argument).
X, y = DataMerger(client_data, eco_data, col.MERGER_FIELD).transform()

# Binary-encode the target: 1 where the value equals 'Yes', 0 otherwise.
y = y.eq('Yes').astype(int)

- Casting types.
- Translating French words to English.
- Dropping rows with too many missing values.
- Correcting erroneous entries.
- Casting types.
- Translating French words to English.
- Dropping rows with too many missing values.
- Imputing missing data.


In [17]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both splits; base.SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=base.SEED,
)

In [20]:
# Load the previously saved model object from disk.
# SECURITY: unpickling can execute arbitrary code — only load model files that
# come from a trusted source.
# The context manager closes the file handle as soon as the model is read
# (the bare `open(...)` used before left it open).
with open(base.SAVED_MODEL_PATH, "rb") as model_file:
    clf = pickle.load(model_file)

In [23]:
# Predict on the held-out test split with the loaded model's best estimator.
# NOTE(review): `clf` exposes `best_estimator_`, so it is presumably a fitted
# hyper-parameter search object — confirm. `y_pred` is not used in the visible cells.
y_pred = clf.best_estimator_.predict(X_test)

In [30]:
# List the raw feature columns that are fed to the pipeline.
X_train.columns

Index(['DATE', 'AGE', 'JOB_TYPE', 'STATUS', 'EDUCATION', 'HAS_DEFAULT',
       'BALANCE', 'HAS_HOUSING_LOAN', 'HAS_PERSO_LOAN', 'CONTACT',
       'DURATION_CONTACT', 'NB_CONTACT', 'NB_DAY_LAST_CONTACT',
       'NB_CONTACT_LAST_CAMPAIGN', 'RESULT_LAST_CAMPAIGN',
       'EMPLOYMENT_VARIATION_RATE', 'IDX_CONSUMER_PRICE',
       'IDX_CONSUMER_CONFIDENCE'],
      dtype='object')

In [41]:
# Pull the two preprocessing steps out of the best pipeline.
# Index with [...] rather than .get(): a missing or renamed step then raises
# KeyError right here instead of silently yielding None and failing later.
imp = clf.best_estimator_.named_steps['imputation']
fe = clf.best_estimator_.named_steps['feature_engineering']

In [47]:
# Preprocessing-only pipeline: reuses the imputation and feature-engineering
# step objects extracted from the loaded model, in the same order.
process_pipeline = Pipeline(
    steps=[
        ('imputation', imp),
        ('feature_engineering', fe),
    ]
)

In [49]:
# Run the training features through imputation + feature engineering.
# NOTE(review): the recorded output is a stream of pandas SettingWithCopyWarning
# messages, which suggests one of the steps writes into the frame it receives —
# confirm in the transformer code, and consider passing a copy of X_train.
tr = process_pipeline.transform(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a Da

In [50]:
# Inspect the transformed training matrix (the recorded output is an
# object-dtype array).
tr

array([[-0.0405, 0.7333333333333333, 0.0, ..., 0.925685505956028,
        0.6923678744588491, -0.3539632583700152],
       [-0.0005, 0.06666666666666667, 0.0, ..., 0.7326111159500192,
        0.8211360521368004, 0.9248722473819695],
       [0.0, 0.3333333333333333, 0.0, ..., 0.925685505956028,
        -0.11073891790108122, 0.9857691762273014],
       ...,
       [0.000125, 0.06666666666666667, 0.0, ..., 0.7326111159500192,
        0.8211360521368004, 0.9248722473819695],
       [0.163375, 0.4, 0.0, ..., -1.9060722141321034,
        -1.3492854689878142, 1.6150374409624049],
       [0.042125, 0.06666666666666667, 0.0, ..., -1.0694165241060645,
        1.1955803582793028, 0.13321217239264646]], dtype=object)

In [64]:
# Display the structure of the best pipeline (its steps and their parameters).
clf.best_estimator_

Pipeline(steps=[('imputation', MissingValueTreatment()),
                ('feature_engineering',
                 ColumnTransformer(transformers=[('balance-clipper',
                                                  ClipTransformer(a_max=4000,
                                                                  a_min=-4000),
                                                  ['BALANCE']),
                                                 ('nb-clipper',
                                                  ClipTransformer(a_max=15,
                                                                  a_min=0),
                                                  ['NB_CONTACT',
                                                   'NB_CONTACT_LAST_CAMPAIGN']),
                                                 ('one-hot-encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['STATUS', 'EDUCATION',
                            

In [73]:
# Fit the feature-engineering step on the training split to inspect its output.
# A clone is fitted on purpose: calling fit_transform on the step held by
# clf.best_estimator_ would refit it in place and silently alter the loaded model.
# Note that X_train is passed directly, i.e. without the 'imputation' step.
fe_output = clone(
    clf.best_estimator_.named_steps['feature_engineering']
).fit_transform(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


In [77]:
# First row of the feature-engineering output, to compare against the raw row.
fe_output[0]

array([-0.0405, 0.7333333333333333, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0,
       0.0, 0, 0, 1, 1.0, 0.0, -0.5601547630055554, '7', 1,
       0.925685505956028, 0.6923678744588491, -0.3539632583700152],
      dtype=object)

In [83]:
# Second row of the feature-engineering output.
fe_output[1]

array([-0.0005, 0.06666666666666667, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0,
       0.0, 0, 0, 0, 1.0, 0.0, -0.3691103015136966, '5', 1,
       0.7326111159500192, 0.8211360521368004, 0.9248722473819695],
      dtype=object)

In [80]:
# Show the first raw training row.
# NOTE(review): the recorded output lists several rows, so it appears to be
# stale relative to this `head(1)` call — re-run the cell to refresh it.
X_train.head(1)

Unnamed: 0,DATE,AGE,JOB_TYPE,STATUS,EDUCATION,HAS_DEFAULT,BALANCE,HAS_HOUSING_LOAN,HAS_PERSO_LOAN,CONTACT,DURATION_CONTACT,NB_CONTACT,NB_DAY_LAST_CONTACT,NB_CONTACT_LAST_CAMPAIGN,RESULT_LAST_CAMPAIGN,EMPLOYMENT_VARIATION_RATE,IDX_CONSUMER_PRICE,IDX_CONSUMER_CONFIDENCE
15847,2008-07-21,35.0,Manager,Single,Graduate studies,No,-324.0,No,Yes,Portable,985,11,-1,0,Fail,1.4,93.918,-42.7
7111,2008-05-29,37.0,Blue-collar worker,Married,Secondary education,No,-4.0,Yes,Yes,Portable,381,1,-1,0,Fail,1.1,93.994,-36.4
22312,2008-08-22,52.0,Retired,Married,Secondary education,No,0.0,No,No,Portable,351,5,-1,0,Fail,1.4,93.444,-36.1
16424,2008-07-23,53.0,Technician,Divorced,Secondary education,No,2156.0,Yes,No,Portable,671,4,-1,0,Fail,1.4,93.918,-42.7
13919,2008-07-10,31.0,House keeper,Married,Secondary education,No,0.0,Yes,No,Portable,270,1,-1,0,Fail,1.4,93.918,-42.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27669,2008-11-21,31.0,Manager,Married,Graduate studies,No,1402.0,No,No,Portable,104,2,-1,0,Fail,-0.1,93.2,-42.0
4794,2008-05-21,38.0,Services,Married,Secondary education,No,-224.0,Yes,Yes,Portable,222,1,-1,0,Fail,1.1,93.994,-36.4
2843,2008-05-14,42.0,Manager,Married,Graduate studies,No,1.0,No,No,Portable,253,1,-1,0,Fail,1.1,93.994,-36.4
42522,2009-12-18,30.0,Technician,Married,Graduate studies,No,1307.0,No,No,Portable,248,6,-1,0,Fail,-3.0,92.713,-33.0


In [84]:
# Distinct EDUCATION values present in the training split.
X_train['EDUCATION'].unique()

array(['Graduate studies', 'Secondary education', 'Primary education'],
      dtype=object)

In [85]:
# Distinct STATUS values present in the training split.
X_train['STATUS'].unique()

array(['Single', 'Married', 'Divorced'], dtype=object)

In [86]:
# Distinct RESULT_LAST_CAMPAIGN values present in the training split.
X_train['RESULT_LAST_CAMPAIGN'].unique()

array(['Fail', 'Success', 'Other'], dtype=object)