# Graduate Admission

### Business Problem:
This project was built to help students shortlist universities that match their profiles. The predicted output gives them a fair idea of their chances of admission to a particular university.


### Objective: 
Use the supplied predictor variables (GRE score, TOEFL score, University Rating, etc.) to predict the likelihood of admission of a new candidate.



### Data :

The dataset contains several parameters which are considered important during the application for Masters Programs.
The parameters included are :

- GRE Scores ( out of 340 )
- TOEFL Scores ( out of 120 )
- University Rating ( out of 5 )
- Statement of Purpose and Letter of Recommendation Strength ( out of 5 )
- Undergraduate GPA ( out of 10 )
- Research Experience ( either 0 or 1 )
- Chance of Admit ( ranging from 0 to 1 )


### Import Libraries

In [None]:
# Upgrade pip itself (user-level install) before installing the pinned packages below.
!python -m pip install --user --upgrade pip
# Install the pinned library versions used by this notebook (user-level install).
# NOTE(review): tensorflow, keras and scipy are not imported anywhere in the visible
# notebook cells - confirm they are still required before keeping these pins.
!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22 tensorflow==2.0 keras==1.2.2 --user

`Restart the kernel before you proceed`

In [None]:
import numpy as np
import pandas as  pd
import os
import matplotlib.pyplot as plt

# Evaluation
from sklearn.metrics import mean_squared_error

### Install Kubeflow pipelines SDK

In [None]:
# You may need to restart your notebook kernel after updating the kfp sdk
# NOTE(review): the three commands below all upgrade the same `kfp` package
# (pip3 without --user, pip3 with --user, then pip -U); one of them is
# presumably sufficient - confirm against the target environment before trimming.
!pip3 install kfp --upgrade
!pip3 install kfp --upgrade --user
!pip install -U kfp

`Check if the install was successful:`

In [None]:
# Print the location of the `dsl-compile` executable if it is on PATH
# (presumably installed together with the kfp SDK - empty output means it was not found).
!which dsl-compile

In [None]:
# The pip installs above require a kernel restart; ask the running kernel to restart itself.
import IPython

kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

## Setup

In [None]:
# Experiment name as displayed in the Kubeflow Pipelines UI.
EXPERIMENT_NAME = "University Admission notebook pipeline"

# Container image every pipeline component is built on.
BASE_IMAGE = 'tensorflow/tensorflow:latest-gpu-py3'

### Build the Components

In [None]:
# Import Kubeflow SDK
import kfp # the Pipelines SDK. 
from kfp import compiler
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.components as comp
import os
import subprocess
import json

from kfp.dsl.types import Integer, GCSPath, String
import kfp.notebook

In [None]:
# Local directory for outputs when the step functions are run directly in the notebook.
out_dir = '/home/jovyan/01-University-Admissions/data/out/'

## Create a pipeline Function

## Preprocessing Function

In [None]:
@dsl.python_component(
    name='preprocess_op',
    description='preprocessing function for Graduate admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def preprocess(data_path):
    """Download, clean, transform and split the graduate admission dataset.

    The train/test split is written to ``{data_path}/preprocessed-data.npz``
    with the keys ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Args:
        data_path: Directory (the shared volume mount in the pipeline) in
            which the compressed ``.npz`` archive is written.
    """
    # The function body is shipped to the container on its own, so every
    # dependency must be installed and imported here, not at notebook level.
    import subprocess
    import sys

    # Install BEFORE importing: importing pandas first would fail if the
    # base image does not provide it.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split  # splitting the data
    from sklearn.preprocessing import PowerTransformer

    # Get data
    DATA_PATH = "https://raw.githubusercontent.com/HamoyeHQ/01-University-Admissions/master/data/"

    def load_admission_data(admission_path=DATA_PATH):
        """Read the admission CSV located under ``admission_path``."""
        # The location is a URL, so it is joined with '/' explicitly instead
        # of os.path.join (``os`` was never imported inside this component).
        csv_path = admission_path.rstrip('/') + '/Admission_Predict_Ver1.1.csv'
        return pd.read_csv(csv_path)

    # load data from function
    dataset = load_admission_data()

    # rename columns (some source headers carry a trailing space)
    dataset.rename(columns={'GRE Score': 'Gre_Score', 'TOEFL Score': 'TOEFL_Score',
                            'University Rating': 'University_Rating',
                            'LOR ': 'LOR', 'Chance of Admit ': 'Chance_of_Admit'},
                   inplace=True)

    # drop unnecessary column
    dataset = dataset.drop(['Serial No.'], axis=1)

    # split the data into X and y
    X = dataset.drop(['Chance_of_Admit'], axis=1)  # predictors
    y = dataset['Chance_of_Admit']  # target (label)

    # Box-Cox is only defined for strictly positive values. The binary
    # `Research` column contains zeros, and both categorical columns are
    # one-hot encoded below, so only the remaining columns are transformed.
    categorical_cols = ['University_Rating', 'Research']
    continuous_cols = [col for col in X.columns if col not in categorical_cols]

    # NOTE(review): the transformer is fitted on the full dataset before the
    # train/test split, as in the original code; fit on the training rows
    # only if leakage into the test set is a concern.
    pt = PowerTransformer(method='box-cox')
    X_continuous = pd.DataFrame(pt.fit_transform(X[continuous_cols]),
                                columns=continuous_cols, index=X.index)

    # transformed columns first, untouched categorical columns last
    X = pd.concat([X_continuous, X[categorical_cols]], axis=1)

    # creating dummy variables for University Rating and Research
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    # Split the data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # output file to path
    np.savez_compressed(f'{data_path}/preprocessed-data.npz',
                        X_train=X_train,
                        X_test=X_test,
                        y_train=y_train,
                        y_test=y_test)
    print("Preprocessing Done")

## Training Function 

### Training the model with the BayesianRidge Regressor

In [None]:
@dsl.python_component(
    name='train_op',
    description='training function for Graduate admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def train(data_path, model_file):
    """Fit a BayesianRidge regressor on the preprocessed training split.

    Reads ``X_train`` and ``y_train`` from
    ``{data_path}/preprocessed-data.npz`` and pickles the fitted model to
    ``{data_path}/{model_file}``.

    Args:
        data_path: Directory holding the preprocessed archive; the model is
            written to the same directory.
        model_file: File name of the pickled model inside ``data_path``.
    """
    # Install all the dependencies inside the function, before importing them.
    # pandas is not used in this step, so it is neither installed nor imported
    # (importing it ahead of any install would fail on an image without it).
    import subprocess
    import sys

    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])

    import pickle

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    # load the preprocessed data; the context manager closes the archive
    with np.load(f'{data_path}/preprocessed-data.npz') as preprocessed_data:
        X_train = preprocessed_data['X_train']
        y_train = preprocessed_data['y_train']

    # Instantiating the model
    main_model = BayesianRidge()

    # Fit the model to the training data
    main_model.fit(X_train, y_train)

    # Save the model to the designated location
    with open(f'{data_path}/{model_file}', 'wb') as file:
        pickle.dump(main_model, file)

    print("Model Trained")

In [None]:
# reg_model = train(out_dir, "model")

## Prediction function

In [None]:
@dsl.python_component(
    name='predict_op',
    description='prediction function for Graduate admission',
    base_image=BASE_IMAGE  # you can define the base image here, or when you build in the next step.
)
def predict(data_path, model_file):
    """Evaluate the trained model on the test split and save the predictions.

    Loads the pickled model from ``{data_path}/{model_file}`` and the test
    split from ``{data_path}/preprocessed-data.npz``, prints the RMSE and
    writes predictions and actual values to ``{data_path}/model_result.txt``.

    Args:
        data_path: Directory holding the model, the preprocessed archive and
            the result file.
        model_file: File name of the pickled model inside ``data_path``.
    """
    # This step runs in its own container, so scikit-learn has to be installed
    # here as well; the version matches the one the training step pickled with.
    import subprocess
    import sys

    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])

    import pickle     # python object for (de)serialization

    import numpy as np
    # Evaluation metrics
    from sklearn.metrics import mean_squared_error

    # Load the saved BayesianRidge Regressor model
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # model files written by the training step of this pipeline.
    with open(f'{data_path}/{model_file}', 'rb') as file:
        main_model = pickle.load(file)

    # load the preprocessed data; the context manager closes the archive
    with np.load(f'{data_path}/preprocessed-data.npz') as preprocessed_data:
        X_test = preprocessed_data['X_test']
        y_test = preprocessed_data['y_test']

    # Evaluate the model and print the results
    model_pred = main_model.predict(X_test)

    # print the RMSE
    print('Model \nRMSE score = {}' .format(np.sqrt(mean_squared_error(y_test, model_pred))))

    with open(f'{data_path}/model_result.txt', 'w') as result:
        result.write(" Prediction: {},\nActual: {} ".format(model_pred, y_test))

    print('Prediction has be saved successfully!')

In [None]:
# #Evaluate the model and print the results
#     print('Test Accuracy: %.3f' % reg_model.score(X_test, y_test))
    
#     # Save the model 
#     model_filename = "model_file.pkl"
#     with open(f'{data_path}/{model_filename}', 'wb') as file:
#         pickle.dump(reg_model, file)
    
    
#     #Save the test_data as a pickle file to be used by the predict component.
#     with open(f'{data_path}/test_data', 'wb') as f:
#         pickle.dump((X_test,  y_test), f)

In [None]:
# predict(out_dir, "model")

In [None]:
# Wrap the three step functions as lightweight pipeline components,
# all running on the same base image.
preprocess_op, train_op, predict_op = (
    comp.func_to_container_op(step_func, base_image=BASE_IMAGE)
    for step_func in (preprocess, train, predict)
)

# Build Kubeflow Pipeline

In [None]:
# Create a client to enable communication with the Pipelines API server.
# No arguments are passed, so the SDK's default connection settings are used.
client = kfp.Client()

In [None]:
# Pipeline definition written with the Kubeflow Pipelines DSL.
@dsl.pipeline(
    name='University Admission',
    description='End-to-end training to predict the likelihood of admission of a new candidate.'
)
# Pipeline parameters: the mount path of the shared volume and the model file name.
def graduate_admission_container_pipeline(
    data_path: str,
    model_file: str
):
    # Volume through which the steps exchange data; it is mounted at
    # `data_path` in every step below.
    shared_volume_op = dsl.VolumeOp(
        name="volume_creation",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Preprocessing step, mounting the freshly created volume.
    preprocess_step = preprocess_op(data_path)
    preprocess_step.add_pvolumes({data_path: shared_volume_op.volume})

    # Training step, mounting the volume handed on by the preprocessing step.
    train_step = train_op(data_path, model_file)
    train_step.add_pvolumes({data_path: preprocess_step.pvolume})

    # Prediction step, mounting the volume handed on by the training step.
    predict_step = predict_op(data_path, model_file)
    predict_step.add_pvolumes({data_path: train_step.pvolume})

    # Final step: show the head of the result file written by the prediction step.
    dsl.ContainerOp(
        name="Admission prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: predict_step.pvolume},
        arguments=['head', f'{data_path}/model_result.txt'],
    )

## Run the Pipeline

Kubeflow Pipelines lets you group pipeline runs by Experiments. 

In [None]:
# Mount point of the shared volume inside every pipeline step.
DATA_PATH = "/mnt"
# File name under which the trained model is pickled.
MODEL_PATH = "graduate_admission_predictor.pkl"

In [None]:
# Alias for the pipeline function; the compile and run calls below refer to it by this name.
pipeline_func = graduate_admission_container_pipeline

In [None]:
experiment_name = EXPERIMENT_NAME
run_name = f"{pipeline_func.__name__} run"

# Values for the two pipeline parameters.
arguments = dict(data_path=DATA_PATH, model_file=MODEL_PATH)

# Compile the pipeline into a compressed package named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
)

In [None]:
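# Container image reference (presumably the notebook server image this notebook was run on - confirm):
# gcr.io/kubeflow-images-public/tensorflow-2.1.0-notebook-cpu:1.0.0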