# BinaryLike UnobservedSample Model Prediction Preparation

This notebook prepares the BinaryLike UnobservedSample model table for use in the Prediction notebook.

This table is used to supply extra information about each recommendation.

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings

In [2]:
#Report the versions of the libraries this notebook runs with, one per line
print('numpy Version: ' + np.__version__,
      'pandas Version: ' + pd.__version__,
      'tensorflow Version: ' + tf.__version__,
      sep='\n')

numpy Version: 1.16.5
pandas Version: 0.25.1
tensorflow Version: 2.0.0


In [3]:
#GPU will be used for Prediction and Evaluation
#Ask TensorFlow for the GPU device name; a falsy value is treated as "no GPU found"
myGPU = tf.test.gpu_device_name()
#Show the device name, or the literal "Error" marker when no name was returned
print(myGPU if myGPU else "Error")

/device:GPU:0


In [4]:
#Load the Qualified BinaryLike UnobservedSample training split from its pkl file
#NOTE(review): unpickling can run arbitrary code - only load pkl files produced by this project
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,6741,2320,1
1,129,777,0
2,5175,8304,1
3,1133,1334,1
4,8109,2270,1
...,...,...,...
760748,2386,168,1
760749,696,1784,1
760750,9237,105,1
760751,7232,204,1


In [5]:
#1-D int32 array of user ids: the first model input for the training split
trainingUser = trainingDf['UserId'].values.astype(np.int32)
trainingUser

array([6741,  129, 5175, ..., 9237, 7232, 3007])

In [6]:
#1-D int32 array of movie ids: the second model input for the training split
trainingMovie = trainingDf['MovieId'].values.astype(np.int32)
trainingMovie

array([2320,  777, 8304, ...,  105,  204, 1333])

In [7]:
#Model input for the training split: a two-element list [user ids, movie ids]
trainingX = [trainingUser, trainingMovie]
trainingX

[array([6741,  129, 5175, ..., 9237, 7232, 3007]),
 array([2320,  777, 8304, ...,  105,  204, 1333])]

In [8]:
#1-D int8 array of Like labels: the expected model output for the training split
trainingY = trainingDf['Like'].values.astype(np.int8)
trainingY

array([1, 0, 1, ..., 1, 1, 1], dtype=int8)

In [9]:
#Load the Qualified BinaryLike UnobservedSample validation split from its pkl file
#NOTE(review): unpickling can run arbitrary code - only load pkl files produced by this project
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Like
0,8039,20761,0
1,1504,121,1
2,6126,5947,1
3,3371,106,1
4,7280,578,1
...,...,...,...
125069,313,885,1
125070,4108,215,1
125071,5532,863,1
125072,4202,4403,1


In [10]:
#1-D int32 array of user ids: the first model input for the validation split
validationUser = validationDf['UserId'].values.astype(np.int32)
validationUser

array([8039, 1504, 6126, ..., 5532, 4202, 8131])

In [11]:
#1-D int32 array of movie ids: the second model input for the validation split
validationMovie = validationDf['MovieId'].values.astype(np.int32)
validationMovie

array([20761,   121,  5947, ...,   863,  4403,   190])

In [12]:
#Model input for the validation split: a two-element list [user ids, movie ids]
validationX = [validationUser, validationMovie]
validationX

[array([8039, 1504, 6126, ..., 5532, 4202, 8131]),
 array([20761,   121,  5947, ...,   863,  4403,   190])]

In [13]:
#1-D int8 array of Like labels: the expected model output for the validation split
validationY = validationDf['Like'].values.astype(np.int8)
validationY

array([0, 1, 1, ..., 1, 1, 1], dtype=int8)

In [14]:
#Load the Qualified BinaryLike UnobservedSample test split from its pkl file
#NOTE(review): unpickling can run arbitrary code - only load pkl files produced by this project
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Like
0,3969,156,1
1,6121,18850,1
2,7735,214,1
3,5553,1387,1
4,80,2775,0
...,...,...,...
125069,3249,15383,0
125070,5521,123,1
125071,6506,10906,1
125072,9290,2456,1


In [15]:
#1-D int32 array of user ids: the first model input for the test split
testUser = testDf['UserId'].values.astype(np.int32)
testUser

array([3969, 6121, 7735, ..., 6506, 9290, 1131])

In [16]:
#1-D int32 array of movie ids: the second model input for the test split
testMovie = testDf['MovieId'].values.astype(np.int32)
testMovie

array([  156, 18850,   214, ..., 10906,  2456,  7020])

In [17]:
#Model input for the test split: a two-element list [user ids, movie ids]
testX = [testUser, testMovie]
testX

[array([3969, 6121, 7735, ..., 6506, 9290, 1131]),
 array([  156, 18850,   214, ..., 10906,  2456,  7020])]

In [18]:
#1-D int8 array of Like labels: the expected model output for the test split
testY = testDf['Like'].values.astype(np.int8)
testY

array([1, 1, 1, ..., 1, 1, 1], dtype=int8)

In [19]:
#Suppress Python warnings; the intent is to hide the
#"Converting sparse IndexedSlices to a dense Tensor of unknown shape" warning
#NOTE(review): called without a category or message, this filter ignores every warning
#raised after this point, not only that one
warnings.filterwarnings('ignore')

#Load the best model for the BinaryLike UnobservedSample dataset from its h5 file
#See Training4 notebook for more information
model = tf.keras.models.load_model("../Model/UnobservedSampleModel/Model6.h5")

In [20]:
#Evaluate the best model on the training split (batch size 64, no progress output)
#The result is a list of numbers; later cells read index 0 as the loss
with tf.device('/GPU:0'):
    trainingResult = model.evaluate(x = trainingX, y = trainingY, batch_size = 64, verbose = 0)

In [21]:
#Display the evaluation result list for the training split
trainingResult

[0.3130823365945499, 593015.0, 74714.0, 69886.0, 23138.0, 0.8713748]

In [22]:
#Evaluate the best model on the validation split (batch size 64, no progress output)
#The result is a list of numbers; later cells read index 0 as the loss
with tf.device('/GPU:0'):
    validationResult = model.evaluate(x = validationX, y = validationY, batch_size = 64, verbose = 0)

In [23]:
#Display the evaluation result list for the validation split
validationResult

[0.3850714097375737, 96009.0, 14225.0, 8913.0, 5927.0, 0.8388794]

In [24]:
#Evaluate the best model on the test split (batch size 64, no progress output)
#The result is a list of numbers; later cells read index 0 as the loss
with tf.device('/GPU:0'):
    testResult = model.evaluate(x = testX, y = testY, batch_size = 64, verbose = 0)

In [25]:
#Display the evaluation result list for the test split
testResult

[0.38754332537419417, 95670.0, 14416.0, 9024.0, 5964.0, 0.83705646]

In [26]:
#Summary table with one row per split: class counts plus the evaluation results
#Positions read from each result list: 0 -> Loss, 1 -> TP, 2 -> FP, 3 -> TN, 4 -> FN, 5 -> Accuracy
#NOTE(review): the metric order is fixed where the model was compiled, which is not visible here - confirm
modelDf = pd.DataFrame(data = {
    'ModelData': pd.Series(['Training', 'Validation', 'Test'], dtype='str'),
    #Class counts come from the Like column of each split (0 = negative, 1 = positive)
    'Negative': pd.Series([int((splitDf['Like'] == 0).sum()) for splitDf in (trainingDf, validationDf, testDf)], dtype='int'),
    'Positive': pd.Series([int((splitDf['Like'] == 1).sum()) for splitDf in (trainingDf, validationDf, testDf)], dtype='int'),
    'Loss': pd.Series([splitResult[0] for splitResult in (trainingResult, validationResult, testResult)], dtype='float'),
    #The count metrics arrive as floats and are stored as integers
    'TP': pd.Series([splitResult[1] for splitResult in (trainingResult, validationResult, testResult)], dtype='int'),
    'FP': pd.Series([splitResult[2] for splitResult in (trainingResult, validationResult, testResult)], dtype='int'),
    'TN': pd.Series([splitResult[3] for splitResult in (trainingResult, validationResult, testResult)], dtype='int'),
    'FN': pd.Series([splitResult[4] for splitResult in (trainingResult, validationResult, testResult)], dtype='int'),
    'Accuracy': pd.Series([splitResult[5] for splitResult in (trainingResult, validationResult, testResult)], dtype='float'),
})
modelDf

Unnamed: 0,ModelData,Negative,Positive,Loss,TP,FP,TN,FN,Accuracy
0,Training,144600,616153,0.313082,593015,74714,69886,23138,0.871375
1,Validation,23138,101936,0.385071,96009,14225,8913,5927,0.838879
2,Test,23440,101634,0.387543,95670,14416,9024,5964,0.837056


In [27]:
#Save the summary table as a pkl file, then delete the variable to free memory
modelDf.to_pickle("../PredictData/ModelTable/UnobservedSample.pkl")
del modelDf

In [28]:
#Predict Likes for the training split with the best model
#The displayed result is a float32 array holding one prediction per row
with tf.device('/GPU:0'):
    predictTraining = model.predict(x = trainingX)
predictTraining

array([[0.9025768 ],
       [0.16093308],
       [0.82574975],
       ...,
       [0.99562824],
       [0.7921613 ],
       [0.9823971 ]], dtype=float32)

In [29]:
#Attach the predictions to trainingDf as the LikePredict column
#The (rows, 1) prediction array is flattened to one value per row before assignment
trainingDf['LikePredict'] = predictTraining.reshape(-1)
trainingDf

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,6741,2320,1,0.902577
1,129,777,0,0.160933
2,5175,8304,1,0.825750
3,1133,1334,1,0.987776
4,8109,2270,1,0.536689
...,...,...,...,...
760748,2386,168,1,0.995255
760749,696,1784,1,0.974493
760750,9237,105,1,0.995628
760751,7232,204,1,0.792161


In [30]:
#Predict Likes for the validation split with the best model
#The displayed result is a float32 array holding one prediction per row
with tf.device('/GPU:0'):
    predictValidation = model.predict(x = validationX)
predictValidation

array([[0.37674278],
       [0.93784237],
       [0.87418205],
       ...,
       [0.9943577 ],
       [0.9463296 ],
       [0.9676502 ]], dtype=float32)

In [31]:
#Attach the predictions to validationDf as the LikePredict column
#The (rows, 1) prediction array is flattened to one value per row before assignment
validationDf['LikePredict'] = predictValidation.reshape(-1)
validationDf

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,8039,20761,0,0.376743
1,1504,121,1,0.937842
2,6126,5947,1,0.874182
3,3371,106,1,0.992425
4,7280,578,1,0.976012
...,...,...,...,...
125069,313,885,1,0.996223
125070,4108,215,1,0.962785
125071,5532,863,1,0.994358
125072,4202,4403,1,0.946330


In [32]:
#Predict Likes for the test split with the best model
#The displayed result is a float32 array holding one prediction per row
with tf.device('/GPU:0'):
    predictTest = model.predict(x = testX)
predictTest

array([[0.84018683],
       [0.24586254],
       [0.9568163 ],
       ...,
       [0.9433457 ],
       [0.9671682 ],
       [0.96693724]], dtype=float32)

In [33]:
#Attach the predictions to testDf as the LikePredict column
#The (rows, 1) prediction array is flattened to one value per row before assignment
testDf['LikePredict'] = predictTest.reshape(-1)
testDf

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,3969,156,1,0.840187
1,6121,18850,1,0.245863
2,7735,214,1,0.956816
3,5553,1387,1,0.879723
4,80,2775,0,0.817993
...,...,...,...,...
125069,3249,15383,0,0.671511
125070,5521,123,1,0.997955
125071,6506,10906,1,0.943346
125072,9290,2456,1,0.967168


In [34]:
#Stack the three splits (training, validation, test) into one frame with a fresh 0..N-1 index
allDataFrame = pd.concat([trainingDf, validationDf, testDf], ignore_index=True)
allDataFrame

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,6741,2320,1,0.902577
1,129,777,0,0.160933
2,5175,8304,1,0.825750
3,1133,1334,1,0.987776
4,8109,2270,1,0.536689
...,...,...,...,...
1010896,3249,15383,0,0.671511
1010897,5521,123,1,0.997955
1010898,6506,10906,1,0.943346
1010899,9290,2456,1,0.967168


In [35]:
#Count the distinct UserId values in allDataFrame and build a frame with one row per user
#This frame is used as the lookup table for the prediction script
#NOTE(review): the User column is 0..count-1, which assumes user ids are exactly that
#contiguous range - confirm against the data preparation step
uniqueUserCount = len(allDataFrame['UserId'].unique())
userRange = range(uniqueUserCount)
lookupTable = pd.DataFrame(data = {'User' : userRange})
lookupTable

Unnamed: 0,User
0,0
1,1
2,2
3,3
4,4
...,...
10068,10068
10069,10069
10070,10070
10071,10071


In [36]:
#Counts how many predictions match their true label
#Rounding a prediction returns the closest label (0 or 1)
def CorrectPrediction(like, prediction):
    """Return how many rounded predictions equal their label.

    Parameters
    ----------
    like : array-like
        True labels, one per sample.
    prediction : array-like
        Predicted scores aligned with ``like``. Only the first ``len(like)``
        entries are scored.

    Returns
    -------
    int
        Number of positions where ``round(prediction)`` equals ``like``.

    Raises
    ------
    IndexError
        If ``prediction`` has fewer entries than ``like``.
    """
    labels = np.asarray(like).reshape(-1)
    scores = np.asarray(prediction).reshape(-1)
    if scores.shape[0] < labels.shape[0]:
        #Same failure an element-by-element walk hits once the predictions run out
        raise IndexError("prediction has fewer entries than like")
    #np.round rounds halves to the nearest even value, so an exact 0.5 becomes label 0
    rounded = np.round(scores[:labels.shape[0]]).astype(np.int64)
    #One vectorized comparison instead of a Python-level loop over every sample
    return int(np.count_nonzero(labels == rounded))

In [37]:
#Calculations for each user
#Rows of `calculations`: 0/1 training count/correct, 2/3 validation count/correct,
#4/5 test count/correct, 6/7 totals over all three splits
def _PerUserCounts(splitDf, userIds):
    """Return (row count, correct-prediction count) per user id for one split.

    Each split is aggregated once with groupby instead of being filtered again
    for every user. Users missing from the split get 0; ids that are not in
    ``userIds`` are left out of the result.
    """
    #Same rule as CorrectPrediction: a prediction is correct when its rounded value equals Like
    rounded = np.round(splitDf['LikePredict'].values).astype(np.int64)
    isCorrect = pd.Series((splitDf['Like'].values == rounded).astype(np.int64))
    perUser = isCorrect.groupby(splitDf['UserId'].values)
    counts = perUser.size().reindex(userIds, fill_value = 0).values
    correct = perUser.sum().reindex(userIds, fill_value = 0).values
    return counts, correct

#NOTE(review): position i in every row stands for UserId == i, which assumes user ids are
#exactly 0..len(userRange)-1 - confirm against the data preparation step
userIds = list(userRange)
calculations = np.zeros(shape = (8, lookupTable.shape[0]))

#Training dataframe calculations
calculations[0], calculations[1] = _PerUserCounts(trainingDf, userIds)

#Validation dataframe calculations
calculations[2], calculations[3] = _PerUserCounts(validationDf, userIds)

#Test dataframe calculations
calculations[4], calculations[5] = _PerUserCounts(testDf, userIds)

#All dataframe calculations
#Since All dataframe contains training, validation and test dataframes
#All dataframe results can be calculated by simply summing the training, validation and test results
calculations[6] = calculations[0] + calculations[2] + calculations[4]
calculations[7] = calculations[1] + calculations[3] + calculations[5]


#print
calculations

array([[ 13.,  11.,  10., ...,  26.,  32., 751.],
       [ 11.,  11.,   9., ...,  24.,  31., 643.],
       [  1.,   2.,   2., ...,   4.,   8., 118.],
       ...,
       [  2.,   2.,   1., ...,   6.,   3.,  91.],
       [ 16.,  15.,  13., ...,  36.,  44., 974.],
       [ 14.,  15.,  10., ...,  34.,  41., 832.]])

In [38]:
#Copy each row of `calculations` into its lookup table column as int32
#Columns are listed in row order: representation count, then correct predictions,
#for the training, validation, test and combined data
#NOTE(review): 'TraingingRep' looks like a misspelling of 'TrainingRep'; the name is kept
#as-is because consumers of the saved table may read the column by this exact name
columnNames = ['TraingingRep', 'TrainingCorrect',
               'ValidationRep', 'ValidationCorrect',
               'TestRep', 'TestCorrect',
               'AllRep', 'AllCorrect']
for rowIndex, columnName in enumerate(columnNames):
    lookupTable[columnName] = calculations[rowIndex].astype(np.int32)

lookupTable

Unnamed: 0,User,TraingingRep,TrainingCorrect,ValidationRep,ValidationCorrect,TestRep,TestCorrect,AllRep,AllCorrect
0,0,13,11,1,1,2,2,16,14
1,1,11,11,2,2,2,2,15,15
2,2,10,9,2,0,1,1,13,10
3,3,561,474,81,63,94,80,736,617
4,4,57,56,9,9,7,7,73,72
...,...,...,...,...,...,...,...,...,...
10068,10068,13,13,3,3,4,4,20,20
10069,10069,18,18,5,5,3,3,26,26
10070,10070,26,24,4,4,6,6,36,34
10071,10071,32,31,8,7,4,3,44,41


In [39]:
#Save the lookup table as a pkl file, then release the large objects in one statement
lookupTable.to_pickle("../PredictData/LookupTable/UnobservedSample.pkl")
del calculations, trainingDf, validationDf, testDf, allDataFrame, lookupTable