### Import Libraries

In [None]:
import numpy as np
import pandas as  pd
import os
import matplotlib.pyplot as plt

# Evaluation
from sklearn.metrics import mean_squared_error

# Pin scipy 1.2 and replace any installed statsmodels with the 0.10.0rc2 pre-release.
# NOTE(review): these commands run after the imports at the top of this cell; packages
# that are already loaded presumably keep their old version until the kernel restarts.
# NOTE(review): the next cell installs scipy==1.2.1, which replaces the pin made here.
!python3 -m pip install scipy==1.2 --upgrade
# !pip3 list
!pip3 uninstall statsmodels -y
!pip3 install statsmodels==0.10.0rc2 --pre --user

In [None]:
# Upgrade pip itself, then install the pinned library versions into the user site.
# NOTE(review): scipy==1.2.1 here differs from the scipy==1.2 pin in the first cell.
# NOTE(review): keras==1.2.2 next to tensorflow==2.0 looks like a version mismatch — confirm.
!python -m pip install --user --upgrade pip
!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22 tensorflow==2.0 keras==1.2.2 --user

### Install or update the pipelines SDK

In [None]:
# You may need to restart your notebook kernel after updating the kfp sdk
# All three commands upgrade the same package (kfp) through different pip entry
# points / install locations.
# NOTE(review): presumably only the one matching the active kernel is needed — confirm.
# NOTE(review): no version is pinned, while the code below relies on APIs such as
# kfp.gcp, kfp.notebook and dsl.VolumeOp — consider pinning a compatible kfp release.
!pip3 install kfp --upgrade
!pip3 install kfp --upgrade --user
!pip install -U kfp

`Restart the kernel before you proceed`

In [None]:
# Restart the kernel so that the packages installed above are picked up.
import IPython

kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)  # True requests a restart rather than a plain shutdown

`Check if the install was successful:`

In [None]:
# !which dsl-compile

## Setup

In [None]:
# Name of the experiment as shown in the Kubeflow Pipelines UI.
EXPERIMENT_NAME = "University Admission notebook pipeline"

# Container image used as the base for every component in the pipeline.
BASE_IMAGE = 'tensorflow/tensorflow:latest-gpu-py3'

### Build the Components

In [None]:
# Import Kubeflow SDK
import kfp # the Pipelines SDK. 
from kfp import compiler
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.components as comp
import os
import subprocess
import json

from kfp.dsl.types import Integer, GCSPath, String
import kfp.notebook

In [None]:
# Directory in which the notebook's outputs are stored.
out_dir = '/home/jovyan/01-University-Admissions/data/out/'

# Create a pipeline Function

## Preprocessing Function

In [None]:
@dsl.python_component(
    name='preprocess_op',
    description='preprocessing function for University admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def preprocess(data_path):
    """Download the admissions data, split it, run normality checks and save the splits.

    The CSV is read from the project's GitHub repository. The first 400 rows become
    the training split and the remaining rows the test split. Q-Q plots and
    Kolmogorov-Smirnov tests are produced for the numeric training columns, and the
    splits are written to ``{data_path}/preprocessed-data.npz``.

    The archive holds plain NumPy arrays, so column labels are not part of
    ``train``/``test``/``X``; they are stored separately under the ``columns`` key.

    Args:
        data_path: Directory the ``.npz`` archive is written to.
    """
    # The function must be self-contained (it is shipped to a container on its own),
    # so every module it uses is imported here rather than at notebook level.
    import subprocess
    import sys

    # Install the pinned packages *before* importing anything that depends on them:
    # a module that is already imported keeps the version it was loaded with.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])

    import numpy as np
    import pandas as pd
    import pylab
    import statsmodels.api as sm
    from scipy.stats import kstest

    # NOTE(review): statsmodels and pylab (matplotlib) are imported but never installed
    # here — presumably they must already be present in the base image; confirm.

    # Get data
    DATA_PATH = "https://raw.githubusercontent.com/HamoyeHQ/01-University-Admissions/master/data/"
    TARGET = 'Chance of Admit '

    def load_admission_data(admission_path=DATA_PATH):
        """Read the admissions CSV located under ``admission_path``."""
        # The location is a URL, so the parts are joined with '/' explicitly instead of
        # a filesystem helper (the original used `os`, which was never imported here).
        csv_path = admission_path.rstrip('/') + '/' + "Admission_Predict_Ver1.1.csv"
        return pd.read_csv(csv_path)

    # load data from function
    data = load_admission_data()

    # Splitting into train and test
    # Use the first 400 rows for training and validation
    train = data.iloc[:400, :]
    # Use the remaining rows for testing and evaluation
    test = data.iloc[400:, :]

    # Splitting into features and target
    X = train.drop([TARGET], axis=1)
    y = train[TARGET]

    # Selecting only numeric features to test for normality
    num_df = train.drop(['University Rating', 'Research', TARGET], axis=1)

    # check for normality using QQ-Plot
    for cols in num_df.columns:
        print(f'Q-Q Plot for {cols}')
        sm.qqplot(num_df[cols], line='s')
        pylab.show()

    # Kolmogorov-Smirnov test for normality
    for feature in num_df.columns:
        output = kstest(num_df[feature], 'norm', N=100)
        print(f'Kolmogorov Test for {feature} = {output}')

    # Write the splits to a compressed NumPy archive (.npz: several named arrays in
    # one gzip-compressed file). DataFrames are converted to arrays on save, so the
    # column names of `train`/`test` are kept in the extra `columns` entry.
    np.savez_compressed(f'{data_path}/preprocessed-data.npz',
                        train=train,
                        test=test,
                        X=X,
                        y=y,
                        columns=np.array(list(train.columns), dtype=str))
    print("Preprocessing Done")

## Transformation Function

### Transform the data for training

In [None]:
@dsl.python_component(
    name='transform_op',
    description='transformation pipeline function for University admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step. 
)

def transform(data_path, pipeline_file):
    """Build the (unfitted) modelling pipeline and pickle it under ``data_path``.

    The function defines the target transform (`func`), its inverse (`invboxcox`),
    a type-conversion transformer (`Typ_conv`) and a scikit-learn ``Pipeline`` ending
    in a ``BayesianRidge`` regressor. Nothing is fitted here; the objects are only
    constructed and written to ``{data_path}/{pipeline_file}``.

    Args:
        data_path: Directory holding ``preprocessed-data.npz``; also the output directory.
        pipeline_file: File name (inside ``data_path``) the pickles are written to.
    """
    
    # Install all the dependencies inside the function
    # NOTE(review): numpy and pandas are imported before the pip installs below, so
    # the pandas pin cannot affect the module already loaded in this process.
    import numpy as np
    import pandas as pd
    import pickle
    import sys, subprocess;
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])
    # NOTE(review): statsmodels, pylab, kstest, inv_boxcox, FunctionTransformer,
    # TransformedTargetRegressor, make_column_transformer and mean_squared_error are
    # imported but not used anywhere in this function.
    # NOTE(review): mlxtend and statsmodels are not installed here — presumably they
    # must already exist in the base image; confirm.
    import statsmodels.api as sm
    import pylab
    from scipy.stats import kstest, boxcox
    from scipy.special import inv_boxcox
    from sklearn.preprocessing import PowerTransformer, OneHotEncoder, FunctionTransformer
    from sklearn.pipeline import Pipeline, make_pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.compose import ColumnTransformer, TransformedTargetRegressor, make_column_transformer
    from sklearn.base import TransformerMixin, BaseEstimator
    from mlxtend.feature_selection import ColumnSelector
    from sklearn.metrics import mean_squared_error
    from sklearn.linear_model import BayesianRidge
    
    # A Box-Cox transformation maps a non-normal variable towards a normal shape.
    # Normality is an important assumption for many statistical techniques;
    # if the data isn't normal, applying Box-Cox widens the set of usable methods.
    
    # Load the preprocessed data.
    # NOTE(review): none of these four arrays is used below in this function.
    preprocessed_data = np.load(f'{data_path}/preprocessed-data.npz')
    train = preprocessed_data['train']
    test = preprocessed_data['test']
    X = preprocessed_data['X']
    y = preprocessed_data['y']

    # Function for transforming target variable
    def func(target):
        """Apply Box-Cox with a fixed lambda and return the result as a column vector."""
        print('Target Transform Called')
        target_ = target.copy()
       # print(target.shape)
        # Flattened to 1-D before calling boxcox, reshaped back to (n, 1) afterwards.
        target_ = target_.flatten()
       # print(target_.shape)
        target_ = boxcox(target_, lmbda=1.6132074271235903)
        tar = target_.reshape(-1,1)
        return tar

    # Function for reversing the transform performed on the target variable
    # Read more on how to implement invboxcox (https://stackoverflow.com/questions/26391454/reverse-box-cox-transformation)
    def invboxcox(target):
        """Invert `func`: computes (ld * target + 1) ** (1 / ld) via exp/log."""
        print('Inverse Target Transform Called')
        # Must stay equal to the lambda used in `func`.
        ld = 1.6132074271235903
        return(np.exp(np.log(ld*target+1)/ld))
    
    
    # Custom transformer for changing data type of some features in the pipeline
    class Typ_conv(BaseEstimator, TransformerMixin):
        """Cast the two categorical columns to object dtype and return the raw values."""

        def fit(self, X, y=None, **fit_params):
            # Stateless: nothing is learned from the data.
            return self
    
        def transform(self, X):
            # NOTE(review): X is indexed by column label and `.values` is read, so a
            # pandas DataFrame is expected, whereas arrays loaded from the .npz archive
            # are plain NumPy arrays — confirm what the caller passes in.
            # NOTE(review): the assignment below modifies the caller's X in place.
            cols = ['University Rating', 'Research']
            for col in cols:
                X[col] = X[col].astype('object')
            # NOTE(review): V is never used.
            V = pd.DataFrame(X)
#             V.head(3)
            return X.values
        
    # Creating transformer for numeric features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('power_transform', PowerTransformer(method='box-cox'))])

    # Creating transformer for categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # This selector lets the pipeline keep only the listed column positions
    # (position 0 is left out).
    selector = make_pipeline(ColumnSelector(cols=([1,2,3,4,5,6,7])))
    
    # Indices of Numeric Features
    # Exclude the feature 'Serial No'
    # The indices below refer to the columns that remain after `selector`, because
    # the 'select' step runs before 'preprocessor' in `pipe`.
    numeric_features = [0,1,3,4,5]
    # Indices of Categorical Features
    categorical_features = [2,6]
    # Creating custom transformer for numerical and categorical features
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])
    
   # Combining all transformers into a single pipeline
    pipe = Pipeline(steps=[('tconv', Typ_conv()), 
                           ('select', selector), 
                           ('preprocessor', preprocessor), 
                           ('reg', BayesianRidge())])
    
    # NOTE(review): the three blocks below all open the SAME path in 'wb' mode, so each
    # write truncates the previous one; only the last object (`pipe`) can remain on disk.
    # NOTE(review): pickle stores functions and classes by reference, so the locally
    # defined `func`, `invboxcox` and `Typ_conv` are presumably not picklable — confirm.

    # Save the func function to the designated file
    with open(f'{data_path}/{pipeline_file}', 'wb') as func_file:
        pickle.dump(func, func_file)
    
    
    # Save the invboxcox function to the designated file
    with open(f'{data_path}/{pipeline_file}', 'wb') as invbox_file:
        pickle.dump(invboxcox, invbox_file)
    
    
    # Save the pipeline to the designated file
    with open(f'{data_path}/{pipeline_file}', 'wb') as file:
        pickle.dump(pipe, file)
        
    print("Data transformed")

## Training Function 

### Train the BayesianRidge regressor on the training data

In [None]:
@dsl.python_component(
    name='train_op',
    description='training pipeline function for Graduate admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def train(data_path, model_file, pipeline_file='graduate_admission_pipeline.pkl'):
    """Fit the pickled pipeline on the training split and pickle the fitted model.

    The unfitted pipeline is read from ``{data_path}/{pipeline_file}``, wrapped in a
    ``TransformedTargetRegressor`` that Box-Cox-transforms the target, fitted on the
    ``X``/``y`` arrays from ``{data_path}/preprocessed-data.npz`` and written to
    ``{data_path}/{model_file}``.

    Args:
        data_path: Directory holding the inputs; also the output directory.
        model_file: File name (inside ``data_path``) for the fitted model.
        pipeline_file: File name (inside ``data_path``) of the pickled pipeline.
            The original body read this name without it being defined anywhere;
            it is now an optional parameter so existing two-argument calls keep
            working. It must match the name the transform step wrote to.
    """
    # The function must be self-contained, so everything is imported locally.
    import subprocess
    import sys

    # Install the pinned packages before importing anything that depends on them.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])

    import pickle
    import numpy as np
    from scipy.stats import boxcox
    from sklearn.compose import TransformedTargetRegressor

    # Fixed Box-Cox lambda shared by the forward and inverse target transforms.
    BOXCOX_LAMBDA = 1.6132074271235903

    def func(target):
        """Apply Box-Cox with the fixed lambda and return a column vector."""
        print('Target Transform Called')
        flat_target = target.copy().flatten()
        transformed = boxcox(flat_target, lmbda=BOXCOX_LAMBDA)
        return transformed.reshape(-1, 1)

    def invboxcox(target):
        """Invert `func`: (lambda * target + 1) ** (1 / lambda), via exp/log."""
        print('Inverse Target Transform Called')
        return np.exp(np.log(BOXCOX_LAMBDA * target + 1) / BOXCOX_LAMBDA)

    # Load the training features and target.
    preprocessed_data = np.load(f'{data_path}/preprocessed-data.npz')
    X = preprocessed_data['X']
    y = preprocessed_data['y']

    # Load the saved pipeline. The transforms are defined above instead of being
    # unpickled: the original read `func` and `invboxcox` from this very same file,
    # which can only ever contain one object.
    # NOTE(review): pickle.load executes code from the file — only use trusted files.
    with open(f'{data_path}/{pipeline_file}', 'rb') as file:
        pipe = pickle.load(file)

    # We use Transformed Target Regressor because the target variable is transformed
    # before fitting and mapped back on prediction.
    model = TransformedTargetRegressor(regressor=pipe, func=func, inverse_func=invboxcox)

    # Fit model to X and y
    # NOTE(review): X is a plain array here, while the pipeline's first step appears to
    # index columns by name — confirm the pipeline accepts array input.
    model.fit(X, y)

    # Save the model to the designated file
    # NOTE(review): the model references the local `func`/`invboxcox`; pickle stores
    # functions by reference, so this dump presumably needs importable functions — confirm.
    with open(f'{data_path}/{model_file}', 'wb') as file:
        pickle.dump(model, file)

    print("Model Trained")

## Prediction function

### Make prediction on the held-out dataset

In [None]:
@dsl.python_component(
    name='predict_op',
    description='prediction pipeline function for Graduate admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def predict(data_path, predict_file, model_file='graduate_admission_predictor.pkl'):
    """Predict on the held-out split with the trained model and pickle the predictions.

    Args:
        data_path: Directory holding the model and ``preprocessed-data.npz``; also
            the output directory.
        predict_file: File name (inside ``data_path``) the predictions are written to.
        model_file: File name (inside ``data_path``) of the pickled model. The
            original body read this name without it being defined anywhere; it is
            now an optional parameter so existing two-argument calls keep working.
            The default matches the model file name used when the pipeline is run.

    Raises:
        ValueError: If the target column cannot be identified unambiguously.
    """
    import pickle     # python object for (de)serialization
    import numpy as np

    # Load the saved BayesianRidge Regressor model
    # NOTE(review): pickle.load executes code from the file — only use trusted files.
    with open(f'{data_path}/{model_file}', 'rb') as file:
        model = pickle.load(file)

    # Load the preprocessed data (plain NumPy arrays, no column labels).
    preprocessed_data = np.load(f'{data_path}/preprocessed-data.npz')
    train = preprocessed_data['train']
    test = preprocessed_data['test']
    y = preprocessed_data['y']

    # Normally, new/test data does not contain the target variable, so it is removed
    # before predicting. The arrays carry no column names, so the target column is
    # located as the column of `train` whose values equal `y`, and the column at the
    # same position is removed from `test`.
    target_columns = [
        idx for idx in range(train.shape[1]) if np.array_equal(train[:, idx], y)
    ]
    if len(target_columns) != 1:
        raise ValueError(
            'Expected exactly one target column in the training data, '
            f'found {len(target_columns)}'
        )
    x_test = np.delete(test, target_columns[0], axis=1)

    # make prediction on test data
    prediction = model.predict(x_test)

    # Save the prediction to the designated file
    with open(f'{data_path}/{predict_file}', 'wb') as file:
        pickle.dump(prediction, file)

    print("Prediction has be saved successfully!")

## Prediction Classifier

### Make prediction on Students Data

In [None]:
@dsl.python_component(
    name='predict_classifier_op',
    description='prediction pipeline function for Graduate admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def predict_classifier(data_path, predict_class_file, predict_file='graduate_admission_predictions.pkl'):
    """Turn the saved predictions into a high/low verdict and write a result file.

    The predictions are read from ``{data_path}/{predict_file}``, labelled with a
    0.70 threshold, and a text summary of predictions and actual target values is
    written to ``{data_path}/prediction_result.txt``.

    Args:
        data_path: Directory holding the inputs; also the output directory.
        predict_class_file: Currently not used by this function.
            NOTE(review): presumably intended as an output file name — confirm.
        predict_file: File name (inside ``data_path``) of the pickled predictions.
            The original body read this name without it being defined anywhere;
            it is now an optional parameter so existing two-argument calls keep
            working. It must match the name the prediction step wrote to.

    Returns:
        A DataFrame with a ``predictions`` column and a ``result`` column.

    Raises:
        ValueError: If the target column cannot be identified unambiguously.
    """
    import pickle     # python object for (de)serialization
    import pandas as pd
    import numpy as np

    # Load the saved predictions
    # NOTE(review): pickle.load executes code from the file — only use trusted files.
    with open(f'{data_path}/{predict_file}', 'rb') as file:
        prediction = pickle.load(file)

    final_pred = pd.DataFrame(prediction, columns=["predictions"], index=None)

    # We create a classifier for the predicted values
    # create a threshold >= 0.70
    final_pred['result'] = np.where(final_pred['predictions'] >= 0.70,
                                    'You have a high chance of getting admitted',
                                    'Your chances of getting admitted is quite low')

    # Recover the actual target values of the test split. The archive holds plain
    # arrays without column names, so the target column is located as the column of
    # `train` whose values equal `y`.
    preprocessed_data = np.load(f'{data_path}/preprocessed-data.npz')
    train = preprocessed_data['train']
    test = preprocessed_data['test']
    y = preprocessed_data['y']
    target_columns = [
        idx for idx in range(train.shape[1]) if np.array_equal(train[:, idx], y)
    ]
    if len(target_columns) != 1:
        raise ValueError(
            'Expected exactly one target column in the training data, '
            f'found {len(target_columns)}'
        )
    actual = test[:, target_columns[0]]

    # The result file is written BEFORE returning: in the original it came after the
    # `return` statement and was never created, although a later pipeline step reads it.
    with open(f'{data_path}/prediction_result.txt', 'w') as result:
        result.write(" Prediction: {}, Actual: {} ".format(prediction, actual))

    print('Prediction Classifier saved successfully!')
    return final_pred

# Build a pipeline component from the function

#### Convert the functions to pipeline operations.

In [None]:
# Wrap each step function in a lightweight component, one per pipeline stage
# (preprocessing, transformation, training, prediction, prediction classification).
# All components run in the same base image.
(
    preprocess_op,
    transform_op,
    train_op,
    predict_op,
    predict_classifier_op,
) = [
    comp.func_to_container_op(step_func, base_image=BASE_IMAGE)
    for step_func in (preprocess, transform, train, predict, predict_classifier)
]

# Build Kubeflow Pipeline

In [None]:
# Create a client to enable communication with the Pipelines API server.
# It is called with no arguments here, so no host or credentials are passed explicitly.
client = kfp.Client()

In [None]:
# Pipeline definition, written with the kfp domain-specific language.
@dsl.pipeline(
    name='University Admission',
    description='End-to-end training to predict the likelihood of admission of a new candidate.'
)
def graduate_admission_container_pipeline(
    data_path: str,
    pipeline_file: str,
    model_file: str,
    predict_file: str,
    predict_class_file: str
):
    # Pipeline parameters:
    #   data_path          - mount point of the shared volume in every step
    #   pipeline_file      - forwarded to the transformation step
    #   model_file         - forwarded to the training step
    #   predict_file       - forwarded to the prediction step
    #   predict_class_file - forwarded to the prediction-classification step

    def on_shared_volume(step, volume):
        # Mount `volume` into `step` at `data_path` and hand the step back.
        return step.add_pvolumes({data_path: volume})

    # Volume used to share data between the components.
    volume_op = dsl.VolumeOp(
        name="volume_creation",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Each step mounts the volume as left behind by the step before it, which chains
    # the steps one after another.
    preprocessing_step = on_shared_volume(
        preprocess_op(data_path), volume_op.volume
    )
    transformation_step = on_shared_volume(
        transform_op(data_path, pipeline_file), preprocessing_step.pvolume
    )
    training_step = on_shared_volume(
        train_op(data_path, model_file), transformation_step.pvolume
    )
    prediction_step = on_shared_volume(
        predict_op(data_path, predict_file), training_step.pvolume
    )
    classification_step = on_shared_volume(
        predict_classifier_op(data_path, predict_class_file), prediction_step.pvolume
    )

    # Final step: print the beginning of the result file produced by the classifier.
    result_step = dsl.ContainerOp(
        name="Admission prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: classification_step.pvolume},
        arguments=['head', f'{data_path}/prediction_result.txt'],
    )

## Compile and run the pipeline

- Kubeflow Pipelines lets you group pipeline runs by Experiments. You can create a new experiment, or call `kfp.Client().list_experiments()` to see existing ones. If you don't specify the experiment name, the Default experiment will be used.

In [None]:
# Passed to the pipeline as `data_path`: where the shared volume is mounted in each step.
DATA_PATH = "/mnt"
# Passed to the pipeline as `model_file`: file name of the pickled model under DATA_PATH.
MODEL_PATH = "graduate_admission_predictor.pkl"

In [None]:
# Alias for the pipeline function; the run cell compiles and submits `pipeline_func`.
pipeline_func = graduate_admission_container_pipeline

In [None]:
experiment_name = EXPERIMENT_NAME
run_name = pipeline_func.__name__ + ' run'

# The pipeline function declares five parameters and none of them has a default, so
# each one needs a value here (the original passed only data_path and model_file).
# The file names are relative to DATA_PATH on the shared volume.
arguments = {
    "data_path": DATA_PATH,
    "pipeline_file": "graduate_admission_pipeline.pkl",
    "model_file": MODEL_PATH,
    "predict_file": "graduate_admission_predictions.pkl",
    "predict_class_file": "graduate_admission_prediction_classes.pkl",
}

# Compile pipeline to generate compressed YAML definition of the pipeline.
kfp.compiler.Compiler().compile(pipeline_func, '{}.zip'.format(experiment_name))

# Submit pipeline directly from pipeline function
run_result = client.create_run_from_pipeline_func(pipeline_func,
                                                  experiment_name=experiment_name,
                                                  run_name=run_name,
                                                  arguments=arguments)