## Import Necessary Libraries

In [1]:
import GPy
import numpy as np
import pandas as pd
import time
from datetime import date
from sklearn.model_selection import train_test_split

## Read Positive And Negative Set Files

In [2]:
# Input Directory Path (Specify Your Directory Path Here)
# The '{}' placeholder is filled in with a path relative to the main directory.
input_file_path = r'Main_Directory_path/{}'

# Resolve the full location of each tab-separated input file
positive_set_path = input_file_path.format('Secondary_Directory/Positive_Set.txt')
negative_set_path = input_file_path.format('Secondary_Directory/Negative_Set.txt')

# Load both sets into DataFrames
Positive_Set = pd.read_csv(positive_set_path, sep='\t')
Negative_Set = pd.read_csv(negative_set_path, sep='\t')

In [3]:
# Inspect the Positive Set: print its (rows, columns) shape, then preview the first rows
positive_rows, positive_cols = Positive_Set.shape
print((positive_rows, positive_cols))
Positive_Set.head()

(200, 25)


Unnamed: 0,client_id,data_Source_MYINFO,data_Source_O2O,platform_code_WEB,platform_code_NON_WEB,platform_code_RM_WEB,platform_code_RM_NON_WEB,User Currency_HKD,User Currency_INR,User Currency_SGD,...,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,AdvisoryRun-(1-10 Times),AdvisoryRun-(>=11 Times),kyc_status
0,14718000,0,1,1,0,0,0,0,0,0,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,1
1,10591000,1,0,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,2.413974,-0.092603,-0.124072,-0.125675,0.0,0.0,1
2,14473400,1,0,1,0,0,0,0,0,0,...,-0.029832,-0.08899,2.525353,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,1
3,11984400,0,1,0,1,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,1
4,18691000,1,0,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,0.723907,-0.124072,-0.125675,0.0,0.0,1


In [4]:
# Inspect the Negative Set: print its (rows, columns) shape, then preview the first rows
negative_rows, negative_cols = Negative_Set.shape
print((negative_rows, negative_cols))
Negative_Set.head()

(200, 25)


Unnamed: 0,client_id,data_Source_MYINFO,data_Source_O2O,platform_code_WEB,platform_code_NON_WEB,platform_code_RM_WEB,platform_code_RM_NON_WEB,User Currency_HKD,User Currency_INR,User Currency_SGD,...,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,AdvisoryRun-(1-10 Times),AdvisoryRun-(>=11 Times),kyc_status
0,13949000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
1,18087000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
2,15449000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
3,13861000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1
4,17517000,0,1,1,0,0,0,0,0,1,...,-0.029832,-0.08899,-0.107997,-0.100656,-0.092603,-0.124072,-0.125675,0.0,0.0,-1


## Data Preprocessing

In [5]:
# Combined Data Set: stack the positive rows on top of the negative rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
Combined_dataset = pd.concat([Positive_Set, Negative_Set], ignore_index=True)
print(Combined_dataset.shape)

# Create X and Y Matrix
# X: every column except the last one; Y: the last column only.
X = Combined_dataset.iloc[:, 0:-1]
Y = Combined_dataset.iloc[:, -1]

(400, 25)


### Train Test Split :  Training Set : 90% , Test Set : 10% 

Here, we split our data set into two parts: one for training and one for testing. After training, we use the test set for the final predictions and rank the test-set users based on the computed scores.

In [6]:
# Hold out 10% of the rows as the test set; the split is stratified on Y and
# uses a fixed random_state so it is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
    stratify=Y,
)

## Model Building and Training

In [7]:
start_time = time.time()  # Wall-clock start, used for the elapsed-time report at the end of this cell

# Set Seed: fix NumPy's global random generator before building and optimizing the model
np.random.seed(0)

# Name for saving the model (no extension here; the loading cell opens 'GPR_Model.json')
model_name = input_file_path.format('Secondary_Directory/GPR_Model')

try:

    X_Train = X_train.iloc[:, 1:]  # Ignore client_ID (drops the first column)
    # Convert to a NumPy array before adding the second axis: pandas 2.x no longer
    # supports multi-dimensional indexing such as Series[:, None].
    Y_Train = Y_train.to_numpy()[:, None]  # 2-dim (n_samples, 1) target matrix

    # Initialize Dimensions: number of feature columns fed to the kernel
    Dimensions = X_Train.shape[1]

    # Kernel Building: RBF kernel with ARD enabled
    k_rbf = GPy.kern.RBF(input_dim=Dimensions, variance=1, lengthscale=None, ARD=True
                           , active_dims=None, name='rbf', useGPU=False, inv_l=False)

    # Model Building
    model_rbf = GPy.models.GPRegression(X_Train, Y_Train, k_rbf)

    # Optimization of Model (at most 1000 iterations, no progress messages)
    model_rbf.optimize(optimizer=None, start=None, messages=False, max_iters=1000
                              , ipython_notebook=True, clear_after_finish=True)

    # Save the model for later use
    # NOTE(review): the private _save_model is kept as in the original, presumably so that
    # compress=False is honoured -- confirm before switching to the public save_model.
    model_rbf._save_model(model_name, compress=False, save_data=True)

except np.linalg.LinAlgError as jitter_error:
    # Only linear-algebra failures are reported as a jitter problem. Any other
    # exception (bad path, wrong shapes, ...) now propagates instead of being
    # silently mislabelled as 'Jitter Error'.
    # NOTE(review): assumes GPy signals jitter failures with LinAlgError -- confirm
    # against the installed GPy version.
    print('Jitter Error')
    print(jitter_error)
    # If jitter error occurs try to reset np.random.seed() with some other value

end_time = time.time()
print("Execution Time is ", end_time-start_time, "seconds")

Execution Time is  3.460747241973877 seconds




## Prediction Module: Prediction on Test Set

In [8]:
# Set X_Test and Y_Test
# Test set
X_Test = X_test.iloc[:, 1:]  # Ignore client_ID (drops the first column)
# Convert to a NumPy array first: pandas 2.x no longer supports Series[:, None].
Y_Test = Y_test.to_numpy()[:, None]  # 2-dim (n_samples, 1) target matrix


In [9]:
### Create Global Variables to Save Data

# client_id values of the test rows, in test-set order
client_ids_list = list(X_test['client_id'])

# Placeholders for the prediction outputs; each starts as None and is
# filled in by the prediction and ranking cells.
prediction_mean = prediction_var = prediction_std = None
Prediction_sum = None
Rank = None

In [10]:
# Empty frame that will receive the prediction results; only the column
# layout is fixed here, the rows are filled in by the prediction cell.
result_columns = ['client_id', 'Predicted_Score', 'Rank', 'Date of generation']
result = pd.DataFrame(columns=result_columns)

In [11]:
### Load the trained model
# Full path of the JSON file written by the training cell
model_name = input_file_path.format('Secondary_Directory/GPR_Model.json')

# Load X and Y Matrices
import json
with open(model_name) as json_file:
    # Parse the whole model file; the handle is closed as soon as parsing is done
    Model_data = json.load(json_file)

# Values stored under the "X" and "Y" keys of the model file, as NumPy arrays
Train_X = np.array(Model_data["X"])
Train_Y = np.array(Model_data["Y"])

In [12]:
### Prediction Function

# Load Model from the JSON file referenced by model_name
model_rbf = GPy.core.model.Model.load_model(model_name)

# Get Predictions
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# does not fail the way a second X_Test.to_numpy() call would.
X_Test = np.asarray(X_Test)
pred_mean, pred_var = model_rbf.predict(X_Test, full_cov=False, Y_metadata=None, kern=None,
                                        likelihood=None, include_likelihood=False)

# Save Prediction results
prediction_mean = pd.Series(pred_mean.flatten())
# Negative variances caused by numerical error are set to zero (0)
prediction_var = pd.Series(pred_var.flatten()).clip(lower=0)
# The standard deviation is taken from the clamped variance; using the raw
# pred_var here would turn every negative variance into a NaN score.
prediction_std = np.sqrt(prediction_var)
## Sum of Mean and Std in one Series, this is our Score for Ranking
Prediction_sum = prediction_mean + prediction_std

# Save result in dataframe
# The frame is rebuilt from scratch with positional values, so client ids and
# scores stay paired even if this cell is re-run after `result` was sorted.
result = pd.DataFrame({
    'client_id': client_ids_list,
    'Predicted_Score': Prediction_sum.to_numpy(),
    'Rank': np.nan,  # filled in by the ranking cell
    'Date of generation': pd.to_datetime(date.today()),
})

### Generate Ranking of Users By Sorting Them in Descending order of the Score

In [19]:
# Order the users from highest to lowest score, then number the ordered rows 1..n as their rank
result = (
    result
    .sort_values(by='Predicted_Score', ascending=False)
    .assign(Rank=lambda ranked: range(1, len(ranked) + 1))
)
result.head()

Unnamed: 0,client_id,Predicted_Score,Rank,Date of generation
0,16976400,1.587506,1,2020-05-26
1,17161000,1.409553,2,2020-05-26
2,12100400,1.190046,3,2020-05-26
3,16145400,1.186779,4,2020-05-26
4,13979400,1.056079,5,2020-05-26


In [20]:
# Save Result: write the ranking as a tab-separated file, without the index column
ranking_output_path = input_file_path.format('Secondary_Directory/Ranking_result.txt')
result.to_csv(ranking_output_path, sep='\t', index=False)