################################################################################
#Licensed Materials - Property of IBM
#(C) Copyright IBM Corp. 2019
#US Government Users Restricted Rights - Use, duplication disclosure restricted
#by GSA ADP Schedule Contract with IBM Corp.
################################################################################

The auto-generated notebooks are subject to the International License Agreement for Non-Warranted Programs (or equivalent) and License Information document for Watson Studio Auto-generated Notebook ("License Terms"), such agreements located in the link below.
Specifically, the Source Components and Sample Materials clause included in the License Information document for
Watson Studio Auto-generated Notebook applies to the auto-generated notebooks. 
By downloading, copying, accessing, or otherwise using the materials, you agree to the License Terms.
http://www14.software.ibm.com/cgi-bin/weblap/lap.pl?li_formnum=L-AMCU-BHU2B7&title=IBM%20Watson%20Studio%20Auto-generated%20Notebook%20V2.1


## IBM AutoAI Auto-Generated Notebook v1.11.2
### Representing Pipeline: P4 from run f8e6ab3f-1ec7-4b03-8085-2a122a771552

**Note**: Notebook code generated using AutoAI will execute successfully.  If the code is modified or reordered, there is no guarantee that it will execute successfully. For more information, see the documentation: https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/autoai-notebook.html .

Before modifying or re-fitting the pipeline, consider the following:
- The notebook converts dataframes to numpy arrays before fitting the pipeline (a current restriction of the preprocessor pipeline).
- The known_values_list is passed by reference and populated with categorical values during fit of the preprocessing pipeline.  Delete its members before re-fitting.


### 1. Set Up

In [None]:
import sklearn
# xgboost and lightgbm are optional at this point: a failed import is reported
# and execution continues.  Exception is caught instead of using a bare
# ``except`` so that KeyboardInterrupt/SystemExit are not swallowed, while
# failures other than ImportError (e.g. a missing native library) are still
# tolerated.
try:
    import xgboost
except Exception:
    print('xgboost, if needed, will be installed and imported later')
try:
    import lightgbm
except Exception:
    print('lightgbm, if needed, will be installed and imported later')
from sklearn.cluster import FeatureAgglomeration
import numpy
from numpy import nan, dtype, mean
import autoai_libs
from autoai_libs.sklearn.custom_scorers import CustomScorers
import sklearn.ensemble
from autoai_libs.cognito.transforms.transform_utils import TExtras, FC
from autoai_libs.transformers.exportable import *
from autoai_libs.utils.exportable_utils import *
from sklearn.pipeline import Pipeline
# Shared, initially empty list; per the notebook notes it is passed by reference
# and populated with categorical values while the preprocessing pipeline is fitted.
known_values_list=[]


### 2. Compose Pipeline

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
# Run metadata that drives data retrieval and metric computation.  Adjust these values to suit your environment.
data_source = 'replace_with_path_and_csv_filename'
target_label_name = _input_metadata['target_label_name']
learning_type = _input_metadata['learning_type']
optimization_metric = _input_metadata['optimization_metric']
random_state = _input_metadata['random_state']
cv_num_folds = _input_metadata['cv_num_folds']
holdout_fraction = _input_metadata['holdout_fraction']
data_provenance = _input_metadata['data_provenance']
# A positive label is only picked up for classification runs whose metadata supplies one.
pos_label = (
    _input_metadata['pos_label']
    if learning_type == 'classification' and 'pos_label' in _input_metadata
    else None
)


In [None]:
import pandas as pd


def _read_csv_with_encodings(path, encodings):
    """Return the CSV at *path* parsed with the first encoding that succeeds.

    The encodings are tried in order and the first successful read wins; a
    later, more permissive encoding (such as Latin-1, which accepts any byte
    sequence) must not overwrite an earlier successful decode.  Each failed
    attempt is printed.  Returns None when every encoding fails.
    """
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding)
        except Exception as csv_exception:
            print(csv_exception)
    return None


csv_encodings=['UTF-8','Latin-1'] # supplement list of encodings as necessary for your data
df = None
if isinstance(data_provenance, str): # try to load file specified
    df = _read_csv_with_encodings(data_provenance, csv_encodings) # your data file name here
if df is None: # try to load file from Cloud Object Storage
    try:
        data_location = data_provenance['input_data'][0]
        # Print only the bucket/path part: the 'connection' entry carries the
        # access key and secret, which must not end up in the notebook output.
        print('data_location '+ str(data_location['location']))
        import boto3
        session = boto3.session.Session()
        cos = session.client(
            service_name='s3',
            aws_access_key_id=data_location['connection']['access_key_id'],
            aws_secret_access_key=data_location['connection']['secret_access_key'],
            endpoint_url=data_location['connection']['endpoint_url'],
            # NOTE(review): TLS certificate verification is disabled here - confirm this is intended.
            verify=False
        )
        # The object is downloaded to a local file named after its storage path.
        local_path = data_location['location']['path']
        print('local_path ' + str(local_path))
        cos.download_file(data_location['location']['bucket'],
                     data_location['location']['path'],
                     local_path)
        df = _read_csv_with_encodings(local_path, csv_encodings) # your data file name here
    except Exception as e:
        # Best effort: the failure is reported and the check below raises if no data was loaded.
        print(e)
if df is None:
    raise ValueError('Problem accessing or decoding csv data. May need csv_encoding string, or location or credential information to read dataframe from COS')


In [None]:
# Drop rows whose target is not defined
target = target_label_name # your target name here
if learning_type == 'regression':
    # Non-numeric target values are coerced to NaN so that the dropna below removes those rows.
    df[target] = pd.to_numeric(df[target], errors='coerce')
# The axis is passed by keyword: pandas 2.0 made dropna's arguments keyword-only,
# so the positional form raises TypeError there.  axis=0 selects rows.
df.dropna(axis=0, how='any', subset=[target], inplace=True)
df_y = df[target]


In [None]:
# Feature frame: every column of df except the target column.
df_X = df.drop(labels=[target], axis=1)

In [None]:
# Separate the leading steps, up to and including 'preprocessor' (which needs to see all data,
# e.g. for known categories), from the steps that follow it.
step_names = [step[0] for step in pipeline.steps]
if 'preprocessor' in step_names:
    preprocessor_index = step_names.index('preprocessor')
    preprocessing_steps = list(pipeline.steps[:preprocessor_index + 1])
    preprocessing_pipeline = Pipeline(memory=pipeline.memory, steps=preprocessing_steps)
    # From here on, `pipeline` holds only the steps after the preprocessor.
    pipeline = Pipeline(steps=pipeline.steps[preprocessor_index + 1:])
else:
    # No step named 'preprocessor': the pipeline is left as it is.
    preprocessor_index = -1
    preprocessing_steps = list(pipeline.steps)

In [None]:
#delete CONTENTS of known_values_list before refitting, cloning or cross_validate-ing the pipeline, or previous values will be used.
# The list is emptied in place (not rebound to a new list) so that every holder of a
# reference to this same list object sees it cleared.
del known_values_list[:]

In [None]:
# Preprocess X
# preprocessor should see all data for cross_validate on the remaining steps to match autoai scores
# The features are handed over as a numpy array (df_X.values); the notebook notes describe
# this as a current restriction of the preprocessor pipeline.
preprocessing_pipeline.fit(df_X.values)
# Transform the same full feature matrix with the preprocessor fitted above.
X_prep = preprocessing_pipeline.transform(df_X.values)

In [None]:
# Resolve learning_type (inferring it when the metadata does not provide one), then carve out the holdout set.
if learning_type is not None:
    print('learning_type specified as:',learning_type)
else:
    # With no problem type in the metadata, sklearn's type_of_target decides whether the holdout split is stratified.
    # Caution:  regression targets that can be expressed as integers may be reported as multiclass;
    # in that case override learning_type manually.
    from sklearn.utils.multiclass import type_of_target
    inferred_target_type = type_of_target(df_y.values)
    learning_type = 'classification' if inferred_target_type in ('multiclass', 'binary') else 'regression'
    print('learning_type determined by type_of_target as:',learning_type)

from sklearn.model_selection import train_test_split
# Only classification splits are stratified; stratify=None is train_test_split's default (no stratification).
stratify_labels = df_y.values if learning_type == 'classification' else None
X, X_holdout, y, y_holdout = train_test_split(
    X_prep,
    df_y.values,
    test_size=holdout_fraction,
    random_state=random_state,
    stratify=stratify_labels,
)


In [None]:
# Factory for a scorer that evaluates an existing scorer's metric against a chosen positive label
def pos_label_scorer(scorer, pos_label):
    """Rebuild *scorer* so that its score function receives *pos_label*.

    Parameters
    ----------
    scorer
        Scorer object exposing ``_factory_args()``, ``_sign`` and ``_score_func``.
    pos_label
        Value forwarded to the score function as the ``pos_label`` keyword.

    Returns
    -------
    The result of ``sklearn.metrics.make_scorer`` for the same score function,
    carrying over ``needs_proba`` / ``needs_threshold`` when the original
    scorer's factory arguments mention them as ``=True``, and
    ``greater_is_better=False`` when the original scorer's sign is -1.

    NOTE(review): this relies on private scorer attributes and on the
    ``needs_proba`` / ``needs_threshold`` arguments of ``make_scorer`` -
    confirm they exist in the installed scikit-learn version.
    """
    factory_args = scorer._factory_args()
    scorer_options = {'pos_label': pos_label}
    scorer_options.update(
        (flag, True)
        for flag in ('needs_proba', 'needs_threshold')
        if flag + '=True' in factory_args
    )
    if scorer._sign == -1:
        scorer_options['greater_is_better'] = False
    from sklearn.metrics import make_scorer
    return make_scorer(scorer._score_func, **scorer_options)

In [None]:
# fit the remainder of the pipeline on the training data
# X and y are the training portion produced by the holdout split; X is already preprocessed.
pipeline.fit(X,y)

In [None]:
# predict on the holdout data
# X_holdout is the preprocessed holdout portion set aside by the holdout split.
y_pred = pipeline.predict(X_holdout)

In [None]:
# Score the fitted pipeline on the holdout data with the optimization metric.
# Some scorers need a pos_label while others do not accept that parameter, so the plain scorer
# is tried first and a pos_label-aware scorer is only built when that attempt fails.
# (For simple cases, scorer(pipeline, X_holdout, y_holdout) on its own would suffice.)
from sklearn.metrics import get_scorer
scorer = get_scorer(optimization_metric)
score = None
pos_label = None  # if you want to supply the pos_label, specify it here
if pos_label is None and 'pos_label' in _input_metadata:
    pos_label=_input_metadata['pos_label']
try:
    score = scorer(pipeline, X_holdout, y_holdout)
except Exception as e1:
    # Without a usable pos_label there is nothing to retry with.
    if pos_label is None or str(pos_label)=='':
        print('You may have to provide a value for pos_label in order for a score to be calculated.')
        raise e1
    # Only failures whose message mentions pos_label are retried; anything else is re-raised.
    if 'pos_label' not in str(e1):
        raise e1
    try:
        scorer = pos_label_scorer(scorer, pos_label=pos_label)
        score = scorer(pipeline, X_holdout, y_holdout)
        print('Retry was successful with pos_label supplied to scorer')
    except Exception as e2:
        # Both attempts failed: report both exceptions and leave score as None.
        for message in (
            'Initial attempt to use scorer failed.  Exception was:',
            e1,
            '',
            'Retry with pos_label failed.  Exception was:',
            e2,
        ):
            print(message)

if score is not None:
    print(score)

In [None]:
# cross_validate pipeline using training data
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, KFold
# random_state is deliberately not passed to the fold generators: shuffle keeps its default
# (False), so a seed has no effect on the folds, and newer scikit-learn releases raise
# ValueError when random_state is set without shuffle=True.
if learning_type == 'classification':
    # Stratified folds for classification targets.
    fold_generator = StratifiedKFold(n_splits=cv_num_folds)
else:
    fold_generator = KFold(n_splits=cv_num_folds)
cv_results = cross_validate(pipeline, X, y, cv=fold_generator, scoring={optimization_metric:scorer})
import numpy as np
# Mean test score across folds; as the last expression of the cell it is shown as the cell output.
np.mean(cv_results['test_' + optimization_metric])

In [None]:
# Display the full dictionary returned by cross_validate.
cv_results