# BinaryLike ObservedOnly Model Prediction Preparation

This notebook prepares the BinaryLike ObservedOnly model table for use in the Prediction notebook.

The table will be used to supply extra information about each recommendation.

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings

In [2]:
#Report the versions of the libraries this notebook was run with
print('numpy Version:', np.__version__)
print('pandas Version:', pd.__version__)
print('tensorflow Version:', tf.__version__)

numpy Version: 1.16.5
pandas Version: 0.25.1
tensorflow Version: 2.0.0


In [3]:
#GPU will be used for Prediction and Evaluation
#Show the reported GPU device name, or "Error" when none is reported
myGPU = tf.test.gpu_device_name()
print(myGPU if myGPU else "Error")

/device:GPU:0


In [4]:
#Reading Qualified BinaryLike ObservedOnly training data from pkl file
#NOTE: loading a pickle can execute arbitrary code; only read pkl files from a trusted source
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,30,882,1
1,2836,119,1
2,6380,1230,0
3,9569,1646,1
4,5301,797,1
...,...,...,...
753549,9027,1666,0
753550,1694,88,1
753551,8853,42,1
753552,1756,2598,1


In [5]:
#Flat int32 vector holding the UserId of every training row
trainingUser = trainingDf[['UserId']].values.astype(np.int32).reshape(-1)
trainingUser

array([  30, 2836, 6380, ..., 8853, 1756, 9135])

In [6]:
#Flat int32 vector holding the MovieId of every training row
trainingMovie = trainingDf[['MovieId']].values.astype(np.int32).reshape(-1)
trainingMovie

array([ 882,  119, 1230, ...,   42, 2598, 1834])

In [7]:
#creating an array for training input
#a list of the two input vectors (user ids first, movie ids second) that is passed as x to the model
trainingX = [trainingUser, trainingMovie]
trainingX

[array([  30, 2836, 6380, ..., 8853, 1756, 9135]),
 array([ 882,  119, 1230, ...,   42, 2598, 1834])]

In [8]:
#Flat int8 vector holding the Like label of every training row
trainingY = trainingDf[['Like']].values.astype(np.int8).reshape(-1)
trainingY

array([1, 1, 0, ..., 1, 1, 1], dtype=int8)

In [9]:
#Reading Qualified BinaryLike ObservedOnly validation data from pkl file
#NOTE: loading a pickle can execute arbitrary code; only read pkl files from a trusted source
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Like
0,2570,197,1
1,9495,2601,1
2,4747,658,1
3,8572,202,0
4,5334,1178,1
...,...,...,...
123664,9721,1914,1
123665,4293,1472,1
123666,7198,277,1
123667,2699,6436,1


In [10]:
#Flat int32 vector holding the UserId of every validation row
validationUser = validationDf[['UserId']].values.astype(np.int32).reshape(-1)
validationUser

array([2570, 9495, 4747, ..., 7198, 2699,  751])

In [11]:
#Flat int32 vector holding the MovieId of every validation row
validationMovie = validationDf[['MovieId']].values.astype(np.int32).reshape(-1)
validationMovie

array([ 197, 2601,  658, ...,  277, 6436,  285])

In [12]:
#creating an array for validation input
#a list of the two input vectors (user ids first, movie ids second) that is passed as x to the model
validationX = [validationUser, validationMovie]
validationX

[array([2570, 9495, 4747, ..., 7198, 2699,  751]),
 array([ 197, 2601,  658, ...,  277, 6436,  285])]

In [13]:
#Flat int8 vector holding the Like label of every validation row
validationY = validationDf[['Like']].values.astype(np.int8).reshape(-1)
validationY

array([1, 1, 1, ..., 1, 1, 1], dtype=int8)

In [14]:
#Reading Qualified BinaryLike ObservedOnly test data from pkl file
#NOTE: loading a pickle can execute arbitrary code; only read pkl files from a trusted source
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Like
0,8193,163,0
1,2328,2593,1
2,457,1796,0
3,6870,1818,1
4,600,149,1
...,...,...,...
123665,1062,105,1
123666,3176,2296,1
123667,7009,376,1
123668,4864,4898,1


In [15]:
#Flat int32 vector holding the UserId of every test row
testUser = testDf[['UserId']].values.astype(np.int32).reshape(-1)
testUser

array([8193, 2328,  457, ..., 7009, 4864,  814])

In [16]:
#Flat int32 vector holding the MovieId of every test row
testMovie = testDf[['MovieId']].values.astype(np.int32).reshape(-1)
testMovie

array([ 163, 2593, 1796, ...,  376, 4898,  731])

In [17]:
#creating an array for test input
#a list of the two input vectors (user ids first, movie ids second) that is passed as x to the model
testX = [testUser, testMovie]
testX

[array([8193, 2328,  457, ..., 7009, 4864,  814]),
 array([ 163, 2593, 1796, ...,  376, 4898,  731])]

In [18]:
#Flat int8 vector holding the Like label of every test row
testY = testDf[['Like']].values.astype(np.int8).reshape(-1)
testY

array([0, 1, 0, ..., 1, 1, 1], dtype=int8)

In [19]:
#ignore warnings due to Converting sparse IndexedSlices to a dense Tensor of unknown shape warning
#NOTE: this filter is global - it silences every Python warning for the rest of the session,
#not only the one named above
warnings.filterwarnings('ignore')

#Best Model For BinaryLike ObservedOnly dataset loading from h5 file
#See Training3 notebook for more information
model = tf.keras.models.load_model("../Model/ObservedOnlyModel/Model9.h5")

In [20]:
#Loss and metric values of the best model on the training data
with tf.device('/GPU:0'):
    trainingResult = model.evaluate(trainingX, trainingY, batch_size=64, verbose=0)

In [21]:
#Print trainingResult
#the six values are read further below, in this order, as Loss, TP, FP, TN, FN and Accuracy
trainingResult

[0.29805378935653504, 596475.0, 73934.0, 62864.0, 20281.0, 0.87497246]

In [22]:
#Loss and metric values of the best model on the validation data
with tf.device('/GPU:0'):
    validationResult = model.evaluate(validationX, validationY, batch_size=64, verbose=0)

In [23]:
#Print validationResult
#the six values are read further below, in this order, as Loss, TP, FP, TN, FN and Accuracy
validationResult

[0.37530371850988337, 96258.0, 14130.0, 8035.0, 5246.0, 0.8433237]

In [24]:
#Loss and metric values of the best model on the test data
with tf.device('/GPU:0'):
    testResult = model.evaluate(testX, testY, batch_size=64, verbose=0)

In [25]:
#Print testResult
#the six values are read further below, in this order, as Loss, TP, FP, TN, FN and Accuracy
testResult

[0.3741056228624817, 96340.0, 14155.0, 8052.0, 5123.0, 0.8441174]

In [26]:
#Summary table with one row per data split (Training, Validation, Test):
#label counts taken from the data frames plus the values returned by model.evaluate.
#The evaluate results are read by position: 0 Loss, 1 TP, 2 FP, 3 TN, 4 FN, 5 Accuracy.
modelDf = pd.DataFrame(data = {
    'ModelData': pd.Series(['Training', 'Validation', 'Test'], dtype='str'),
    'Negative': pd.Series([int((df['Like'] == 0).sum()) for df in (trainingDf, validationDf, testDf)], dtype='int'),
    'Positive': pd.Series([int((df['Like'] == 1).sum()) for df in (trainingDf, validationDf, testDf)], dtype='int'),
    'Loss': pd.Series([res[0] for res in (trainingResult, validationResult, testResult)], dtype='float'),
    'TP': pd.Series([res[1] for res in (trainingResult, validationResult, testResult)], dtype='int'),
    'FP': pd.Series([res[2] for res in (trainingResult, validationResult, testResult)], dtype='int'),
    'TN': pd.Series([res[3] for res in (trainingResult, validationResult, testResult)], dtype='int'),
    'FN': pd.Series([res[4] for res in (trainingResult, validationResult, testResult)], dtype='int'),
    'Accuracy': pd.Series([res[5] for res in (trainingResult, validationResult, testResult)], dtype='float'),
})
modelDf

Unnamed: 0,ModelData,Negative,Positive,Loss,TP,FP,TN,FN,Accuracy
0,Training,136798,616756,0.298054,596475,73934,62864,20281,0.874972
1,Validation,22165,101504,0.375304,96258,14130,8035,5246,0.843324
2,Test,22207,101463,0.374106,96340,14155,8052,5123,0.844117


In [27]:
#Saving modelDf as a pkl file, then dropping the name since the table is not used again in this notebook
modelDf.to_pickle("../PredictData/ModelTable/ObservedOnly.pkl")
del modelDf

In [28]:
#Best Model predicting training Likes
#predict returns one float32 score per training row, as a single-column 2-D array
with tf.device('/GPU:0'):
    predictTraining = model.predict(x = trainingX)
predictTraining

array([[0.99362385],
       [0.99790585],
       [0.6921729 ],
       ...,
       [0.9981781 ],
       [0.9128264 ],
       [0.9987264 ]], dtype=float32)

In [29]:
#Appending the predicted Likes to trainingDf
#adds the model score of each row as a new LikePredict column (trainingDf is modified in place)
trainingDf['LikePredict'] = predictTraining
trainingDf

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,30,882,1,0.993624
1,2836,119,1,0.997906
2,6380,1230,0,0.692173
3,9569,1646,1,0.931085
4,5301,797,1,0.948660
...,...,...,...,...
753549,9027,1666,0,0.786528
753550,1694,88,1,0.860511
753551,8853,42,1,0.998178
753552,1756,2598,1,0.912826


In [30]:
#Best Model predicting validation Likes
#predict returns one float32 score per validation row, as a single-column 2-D array
with tf.device('/GPU:0'):
    predictValidation = model.predict(x = validationX)
predictValidation

array([[0.98871344],
       [0.9938812 ],
       [0.9521258 ],
       ...,
       [0.96491206],
       [0.8239964 ],
       [0.9452547 ]], dtype=float32)

In [31]:
#Appending the predicted Likes to validationDf
#adds the model score of each row as a new LikePredict column (validationDf is modified in place)
validationDf['LikePredict'] = predictValidation
validationDf

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,2570,197,1,0.988713
1,9495,2601,1,0.993881
2,4747,658,1,0.952126
3,8572,202,0,0.118017
4,5334,1178,1,0.700384
...,...,...,...,...
123664,9721,1914,1,0.836436
123665,4293,1472,1,0.843511
123666,7198,277,1,0.964912
123667,2699,6436,1,0.823996


In [32]:
#Best Model predicting test Likes
#predict returns one float32 score per test row, as a single-column 2-D array
with tf.device('/GPU:0'):
    predictTest = model.predict(x = testX)
predictTest

array([[0.8570026 ],
       [0.9884863 ],
       [0.49047133],
       ...,
       [0.94512653],
       [0.6153875 ],
       [0.6706204 ]], dtype=float32)

In [33]:
#Appending the predicted Likes to testDf
#adds the model score of each row as a new LikePredict column (testDf is modified in place)
testDf['LikePredict'] = predictTest
testDf

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,8193,163,0,0.857003
1,2328,2593,1,0.988486
2,457,1796,0,0.490471
3,6870,1818,1,0.289758
4,600,149,1,0.529459
...,...,...,...,...
123665,1062,105,1,0.986430
123666,3176,2296,1,0.998414
123667,7009,376,1,0.945127
123668,4864,4898,1,0.615387


In [34]:
#Merging the data frames
#stacks the training, validation and test rows (in that order) and renumbers the index from 0
allDataFrame = pd.concat([trainingDf, validationDf, testDf]).reset_index(drop=True)
allDataFrame

Unnamed: 0,UserId,MovieId,Like,LikePredict
0,30,882,1,0.993624
1,2836,119,1,0.997906
2,6380,1230,0,0.692173
3,9569,1646,1,0.931085
4,5301,797,1,0.948660
...,...,...,...,...
1000888,1062,105,1,0.986430
1000889,3176,2296,1,0.998414
1000890,7009,376,1,0.945127
1000891,4864,4898,1,0.615387


In [35]:
#Counting the unique UserId values in allDataFrame and creating a new data frame with one row per user
#This data frame will be used as a lookup table by the prediction script
#NOTE(review): the User column is 0..(number of unique users - 1), not the actual UserId values;
#this matches the data only if the UserIds are contiguous and start at 0 - TODO confirm
userRange = range(allDataFrame['UserId'].unique().shape[0])
lookupTable = pd.DataFrame(data = {'User' : userRange})
lookupTable

Unnamed: 0,User
0,0
1,1
2,2
3,3
4,4
...,...
10068,10068
10069,10069
10070,10070
10071,10071


In [36]:
#Counts how many predictions match the observed label
#rounding prediction returns closest label (0 or 1)
def CorrectPrediction(like, prediction):
    """Count the predictions that round to the observed label.

    Parameters
    ----------
    like : array-like of int
        Observed labels (0 or 1), one per sample.
    prediction : array-like of float
        Predicted scores, one per sample. May be flat or a single-column
        2-D array; it is flattened before comparison.

    Returns
    -------
    int
        Number of samples whose rounded prediction equals the label
        (0 for empty input).

    Raises
    ------
    ValueError
        If like and prediction do not hold the same number of elements.
    """
    labels = np.asarray(like).reshape(-1)
    scores = np.asarray(prediction).reshape(-1)
    if labels.shape[0] != scores.shape[0]:
        raise ValueError("like and prediction must have the same number of elements")
    #np.round rounds halves to the nearest even value, so a score of exactly 0.5 maps to label 0
    return int(np.count_nonzero(labels == np.round(scores)))

In [37]:
#Calculations for each user
#Each data frame is aggregated in one vectorised pass (np.bincount keyed by UserId)
#instead of filtering all three frames once per user, which rescanned every row for every user.
def _perUserCounts(frame, userCount):
    """Return (rows per user, correctly predicted rows per user) for user ids 0..userCount-1.

    A row counts as correct when its LikePredict value rounds to its Like label.
    Rows whose UserId lies outside 0..userCount-1 are ignored.
    Both returned arrays have length userCount and are indexed by user id.
    """
    userIds = frame['UserId'].values.astype(np.int64)
    isCorrect = frame['Like'].values == np.round(frame['LikePredict'].values)
    inRange = (userIds >= 0) & (userIds < userCount)
    representation = np.bincount(userIds[inRange], minlength = userCount)
    #weights turn the count into a sum of the 0/1 correctness flags per user
    correct = np.bincount(userIds[inRange], weights = isCorrect[inRange].astype(np.float64), minlength = userCount)
    return representation, correct

userCount = lookupTable.shape[0]
#Row layout: 0/1 training, 2/3 validation, 4/5 test, 6/7 all data (count, correct predictions)
calculations = np.empty(shape = (8, userCount))

#Training dataframe calculations
calculations[0], calculations[1] = _perUserCounts(trainingDf, userCount)

#Validation dataframe calculations
calculations[2], calculations[3] = _perUserCounts(validationDf, userCount)

#Test dataframe calculations
calculations[4], calculations[5] = _perUserCounts(testDf, userCount)

#All dataframe calculations
#Since All dataframe contains training, validation and test dataframes
#All dataframe results can be calculated by simply summing the training, validation and test results
calculations[6] = calculations[0] + calculations[2] + calculations[4]
calculations[7] = calculations[1] + calculations[3] + calculations[5]


#print
calculations

array([[ 12.,  11.,   9., ...,  23.,  32., 729.],
       [ 11.,  11.,   7., ...,  22.,  32., 619.],
       [  2.,   2.,   1., ...,   4.,   6., 125.],
       ...,
       [  2.,   2.,   1., ...,   7.,   3., 112.],
       [ 16.,  15.,  11., ...,  34.,  42., 972.],
       [ 14.,  15.,   9., ...,  33.,  41., 838.]])

In [38]:
#Adding the calculations to the lookup table as int32 columns.
#Rows of calculations, in order: representation count followed by correct-prediction count
#for the training, validation, test and all (combined) data.
#NOTE: 'TraingingRep' is misspelled, but the name is kept exactly as it is so that
#anything reading the saved lookup table by column name keeps working.
lookupColumns = ['TraingingRep', 'TrainingCorrect',
                 'ValidationRep', 'ValidationCorrect',
                 'TestRep', 'TestCorrect',
                 'AllRep', 'AllCorrect']
for rowNumber, columnName in enumerate(lookupColumns):
    lookupTable[columnName] = calculations[rowNumber].astype(np.int32)

lookupTable

Unnamed: 0,User,TraingingRep,TrainingCorrect,ValidationRep,ValidationCorrect,TestRep,TestCorrect,AllRep,AllCorrect
0,0,12,11,2,1,2,2,16,14
1,1,11,11,2,2,2,2,15,15
2,2,9,7,1,1,1,1,11,9
3,3,544,456,101,81,91,67,736,604
4,4,61,60,5,5,6,6,72,71
...,...,...,...,...,...,...,...,...,...
10068,10068,13,13,3,3,2,2,18,18
10069,10069,18,18,6,6,0,0,24,24
10070,10070,23,22,4,4,7,7,34,33
10071,10071,32,32,6,6,4,3,42,41


In [39]:
#Saving the lookup table as a pkl file, then dropping the large objects that are no longer needed
lookupTable.to_pickle("../PredictData/LookupTable/ObservedOnly.pkl")
del calculations, trainingDf, validationDf, testDf, allDataFrame, lookupTable