## Import Necessary Libraries

In [1]:
import GPy
import numpy as np
import pandas as pd
import time

## Read the Positive and Negative Class Files (Used for Both Training and Testing)

In [2]:
# Base location of the project data; '{}' is the slot for a path relative to it
# (point this at your own directory before running)
input_file_path = r'C:/Users/hp/Desktop/Kristal.AI-AllFiles/Signup-KYC/FINAL/SGP_For_MediumArticle/{}'

# Load the two tab-separated class files, positive set first
class_set_files = ('Model_Train_IO/Positive_Set.txt', 'Model_Train_IO/Negative_Set.txt')
Positive_Set, Negative_Set = [
    pd.read_csv(input_file_path.format(relative_name), sep='\t')
    for relative_name in class_set_files
]

In [3]:
# Inspect the positive set: print its (rows, columns) shape, then preview the leading rows
positive_shape = Positive_Set.shape
print(positive_shape)
Positive_Set.head(n=5)

(200, 25)


Unnamed: 0,client_id,data_Source_MYINFO,data_Source_O2O,platform_code_WEB,platform_code_NON_WEB,platform_code_RM_WEB,platform_code_RM_NON_WEB,User Currency_HKD,User Currency_INR,User Currency_SGD,...,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,AdvisoryRun-(1-10 Times),AdvisoryRun-(>=11 Times),kyc_status
0,14718000,0,1,1,0,0,0,0,0,0,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,1
1,10591000,1,0,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,2.413974,-0.092603,-0.124072,-0.125675,0.0,0.0,1
2,14473400,1,0,1,0,0,0,0,0,0,...,-0.029832,-0.08899,2.525353,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,1
3,11984400,0,1,0,1,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,1
4,18691000,1,0,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,0.723907,-0.124072,-0.125675,0.0,0.0,1


In [4]:
# Inspect the negative set: print its (rows, columns) shape, then preview the leading rows
negative_shape = Negative_Set.shape
print(negative_shape)
Negative_Set.head(n=5)

(200, 25)


Unnamed: 0,client_id,data_Source_MYINFO,data_Source_O2O,platform_code_WEB,platform_code_NON_WEB,platform_code_RM_WEB,platform_code_RM_NON_WEB,User Currency_HKD,User Currency_INR,User Currency_SGD,...,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,AdvisoryRun-(1-10 Times),AdvisoryRun-(>=11 Times),kyc_status
0,13949000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
1,18087000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
2,15449000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
3,13861000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
4,17517000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1


## Model Building

In [5]:
# Number of folds (K) for K-fold cross-validation. The model-building cell randomly
# assigns every positive and negative row to one of these folds.
num_folds = 2

In [6]:
start_time = time.time()  # Wall-clock start, reported at the end of the cell

# Seed NumPy's global RNG so the random fold assignment is reproducible
np.random.seed(0)

# Pooled out-of-fold results; they stay None when no fold completes
True_label = None
Predicted_probability = None
fold_true_labels = []  # one Series of true labels per completed fold
fold_pred_probs = []   # one Series of predicted probabilities per completed fold

try:
    # Randomly assign every positive / negative row to one of num_folds folds.
    # The two draws stay in this order so the seeded split is unchanged.
    positive_samples = np.random.choice(range(num_folds), size=Positive_Set.shape[0])
    negative_samples = np.random.choice(range(num_folds), size=Negative_Set.shape[0])

    for inner_idx in range(num_folds):  # One pass per held-out fold
        # Row positions held out for testing in this fold
        X1_idx_for_test = np.where(positive_samples == inner_idx)[0]
        X2_idx_for_test = np.where(negative_samples == inner_idx)[0]

        # Row positions used for training (every other fold)
        X1_idx_for_trng = np.where(positive_samples != inner_idx)[0]
        X2_idx_for_trng = np.where(negative_samples != inner_idx)[0]

        # Training set. pd.concat is used because DataFrame.append was removed in pandas 2.0.
        X_train_all = pd.concat(
            [Positive_Set.iloc[X1_idx_for_trng, :], Negative_Set.iloc[X2_idx_for_trng, :]],
            ignore_index=True,
        )
        X_train = X_train_all.iloc[:, 1:-1]  # Drop the first (client_id) and last (kyc_status) columns
        # Labels as an (n, 1) array. Convert to NumPy first: pandas no longer
        # supports multi-dimensional indexing such as Series[:, None].
        Y_train = X_train_all.iloc[:, -1].to_numpy()[:, None]

        # Test set, built the same way from the held-out rows
        X_test_all = pd.concat(
            [Positive_Set.iloc[X1_idx_for_test, :], Negative_Set.iloc[X2_idx_for_test, :]],
            ignore_index=True,
        )
        X_test = X_test_all.iloc[:, 1:-1]  # Drop the first (client_id) and last (kyc_status) columns
        Y_test = X_test_all.iloc[:, -1].to_numpy()[:, None]

        # Number of input features seen by the kernel
        Dimensions = X_train.shape[1]

        # RBF kernel with ARD enabled
        k_rbf = GPy.kern.RBF(input_dim=Dimensions, variance=1, lengthscale=None, ARD=True,
                             active_dims=None, name='rbf', useGPU=False, inv_l=False)

        # GP classifier; features are passed as a NumPy array, matching what predict() receives below
        model_rbf = GPy.models.GPClassification(X_train.to_numpy(), Y_train, kernel=k_rbf)
        # (X, Y, kernel=None, Y_metadata=None, mean_function=None, inference_method=None, likelihood=None, normalizer=False)

        # Optimize the model hyper-parameters
        model_rbf.optimize(optimizer=None, start=None, messages=False, max_iters=1000,
                           ipython_notebook=False, clear_after_finish=True)
        # (optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, clear_after_finish=False)

        # Prediction: element [0] of the returned tuple is used as the predicted probability
        X_test = X_test.to_numpy()  # Convert DataFrame object to numpy array
        prediction_proba = model_rbf.predict(X_test)[0]
        # (Xtest, full_cov=False, Y_metadata=None, kern=None, likelihood=None, include_likelihood=True)

        # Keep this fold's true labels and predicted probabilities (rounded to 2 decimals)
        fold_true_labels.append(pd.Series(Y_test.flatten()))
        fold_pred_probs.append(pd.Series(prediction_proba.flatten()).round(2))

except np.linalg.LinAlgError as jitter_error:
    # Only numerical linear-algebra failures are treated as jitter errors
    # (presumably how GPy reports a failed jittered Cholesky -- confirm against GPy);
    # any other exception propagates instead of being mislabelled.
    print('Jitter Error')
    print(jitter_error)
    # If jitter error occurs try to reset np.random.seed() with some other value

# Pool the per-fold results once, instead of growing a Series inside the loop
if fold_true_labels:
    True_label = pd.concat(fold_true_labels, axis=0, ignore_index=True)
    Predicted_probability = pd.concat(fold_pred_probs, axis=0, ignore_index=True)

end_time = time.time()
print("Execution Time is ", end_time-start_time, "seconds")

Execution Time is  1.729358434677124 seconds


### Combine the True Labels and Predicted Probabilities from All Cross-Validation Folds into One DataFrame

In [7]:
# Stop with a clear message if the cross-validation cell produced no results,
# instead of silently building an empty frame.
if True_label is None or Predicted_probability is None:
    raise RuntimeError('No cross-validation results available; re-run the model-building cell first.')

# One row per test sample, pooled over all folds
Classification_result = pd.DataFrame({
    'True_Label': True_label,
    'Predicted_Probability': Predicted_probability,
})

# Predicted probability >= 0.5 means label 1, otherwise -1.
# The probabilities were already rounded to 2 decimals when they were collected,
# so the cut-off is applied to the rounded values.
# A boolean mask keeps the .loc assignment label-aligned rather than relying on
# positional indices happening to match the index labels.
Classification_result['Predicted_Label'] = -1  # Initialize column with -1
predicted_one_mask = Classification_result['Predicted_Probability'] >= 0.5
Classification_result.loc[predicted_one_mask, 'Predicted_Label'] = 1

In [8]:
# Display the pooled cross-validation results (true label, predicted probability, predicted label)
Classification_result

Unnamed: 0,True_Label,Predicted_Probability,Predicted_Label
0,1,0.35,-1
1,1,0.16,-1
2,1,0.60,1
3,1,0.90,1
4,1,0.96,1
...,...,...,...
395,-1,0.19,-1
396,-1,0.19,-1
397,-1,0.19,-1
398,-1,0.19,-1


## Model Evaluation

In [9]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

In [10]:
# Score the pooled predictions against the true labels with each metric in turn
y_true = Classification_result['True_Label']
y_pred = Classification_result['Predicted_Label']
metric_table = (
    ('Accuracy -- > ', accuracy_score),
    ('Precision -- > ', precision_score),
    ('Recall -- > ', recall_score),
    ('F1-Score -- > ', f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(y_true, y_pred))

Accuracy -- >  0.8225
Precision -- >  0.8685714285714285
Recall -- >  0.76
F1-Score -- >  0.8106666666666666


In [11]:
# Confusion matrix of true vs. predicted labels, with classes ordered [1, -1]
class_order = [1, -1]
confusion_matrix(
    Classification_result['True_Label'],
    Classification_result['Predicted_Label'],
    labels=class_order,
)

array([[152,  48],
       [ 23, 177]], dtype=int64)