# RatingBased RatedOnly Model Prediction Preparation

The RatingBased RatedOnly model table will be prepared for use in the Prediction notebook.

This table will be used to supply extra information about the recommendations.

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
from sklearn.metrics import mean_squared_error as CalculateMSE

In [2]:
#Report the version of every library this notebook depends on
for libraryName, library in (('numpy', np), ('pandas', pd), ('tensorflow', tf)):
    print('{} Version: {}'.format(libraryName, library.__version__))

numpy Version: 1.16.5
pandas Version: 0.25.1
tensorflow Version: 2.0.0


In [3]:
#Prediction and evaluation run on the GPU: show its device name,
#or "Error" when TensorFlow reports no GPU device (empty name)
myGPU = tf.test.gpu_device_name()
print(myGPU if myGPU else "Error")

/device:GPU:0


In [4]:
#Load the Qualified RatingBased RatedOnly training split from its pkl file.
#The displayed frame has UserId, MovieId and Rating columns.
#NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/RatingBased/RatedOnly/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Rating
0,3448,1096,0.666667
1,7042,465,0.777778
2,4214,3691,0.555556
3,9967,767,0.888889
4,324,1410,0.333333
...,...,...,...
753677,571,7554,0.333333
753678,4795,17048,0.333333
753679,3685,1941,0.555556
753680,1366,658,0.555556


In [5]:
#Training input: user ids as a flat (one-dimensional) int32 array
trainingUser = trainingDf[['UserId']].values.astype(np.int32).ravel()
trainingUser

array([3448, 7042, 4214, ..., 3685, 1366, 8048])

In [6]:
#Training input: movie ids as a flat (one-dimensional) int32 array
trainingMovie = trainingDf[['MovieId']].values.astype(np.int32).ravel()
trainingMovie

array([1096,  465, 3691, ..., 1941,  658,  865])

In [7]:
#Training input for the model: a two-element list holding
#the user id array and the movie id array, in that order
trainingX = [trainingUser, trainingMovie]
trainingX

[array([3448, 7042, 4214, ..., 3685, 1366, 8048]),
 array([1096,  465, 3691, ..., 1941,  658,  865])]

In [8]:
#Training target: ratings as a flat (one-dimensional) float32 array
trainingY = trainingDf[['Rating']].values.astype(np.float32).ravel()
trainingY

array([0.6666667, 0.7777778, 0.5555556, ..., 0.5555556, 0.5555556,
       0.7777778], dtype=float32)

In [9]:
#Load the Qualified RatingBased RatedOnly validation split from its pkl file.
#The displayed frame has UserId, MovieId and Rating columns.
#NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/RatingBased/RatedOnly/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Rating
0,7981,215,0.777778
1,8165,3195,0.777778
2,5490,174,0.777778
3,5080,802,0.888889
4,7693,344,0.555556
...,...,...,...
123600,3821,265,0.888889
123601,2074,807,1.000000
123602,3930,1942,0.777778
123603,8844,1395,0.777778


In [10]:
#Validation input: user ids as a flat (one-dimensional) int32 array
validationUser = validationDf[['UserId']].values.astype(np.int32).ravel()
validationUser

array([7981, 8165, 5490, ..., 3930, 8844, 9284])

In [11]:
#Validation input: movie ids as a flat (one-dimensional) int32 array
validationMovie = validationDf[['MovieId']].values.astype(np.int32).ravel()
validationMovie

array([ 215, 3195,  174, ..., 1942, 1395, 5107])

In [12]:
#Validation input for the model: a two-element list holding
#the user id array and the movie id array, in that order
validationX = [validationUser, validationMovie]
validationX

[array([7981, 8165, 5490, ..., 3930, 8844, 9284]),
 array([ 215, 3195,  174, ..., 1942, 1395, 5107])]

In [13]:
#Validation target: ratings as a flat (one-dimensional) float32 array
validationY = validationDf[['Rating']].values.astype(np.float32).ravel()
validationY

array([0.7777778, 0.7777778, 0.7777778, ..., 0.7777778, 0.7777778,
       0.7777778], dtype=float32)

In [14]:
#Load the Qualified RatingBased RatedOnly test split from its pkl file.
#The displayed frame has UserId, MovieId and Rating columns.
#NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/RatingBased/RatedOnly/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Rating
0,2092,1440,0.777778
1,464,4450,0.777778
2,787,617,0.777778
3,3980,376,1.000000
4,3111,165,0.777778
...,...,...,...
123601,338,4367,0.000000
123602,4526,1536,0.777778
123603,8778,950,0.888889
123604,9104,674,0.888889


In [15]:
#Test input: user ids as a flat (one-dimensional) int32 array
testUser = testDf[['UserId']].values.astype(np.int32).ravel()
testUser

array([2092,  464,  787, ..., 8778, 9104, 3898])

In [16]:
#Test input: movie ids as a flat (one-dimensional) int32 array
testMovie = testDf[['MovieId']].values.astype(np.int32).ravel()
testMovie

array([1440, 4450,  617, ...,  950,  674, 1336])

In [17]:
#Test input for the model: a two-element list holding
#the user id array and the movie id array, in that order
testX = [testUser, testMovie]
testX

[array([2092,  464,  787, ..., 8778, 9104, 3898]),
 array([1440, 4450,  617, ...,  950,  674, 1336])]

In [18]:
#Test target: ratings as a flat (one-dimensional) float32 array
testY = testDf[['Rating']].values.astype(np.float32).ravel()
testY

array([0.7777778, 0.7777778, 0.7777778, ..., 0.8888889, 0.8888889,
       1.       ], dtype=float32)

In [19]:
#Silence warnings; the one this targets is the
#"Converting sparse IndexedSlices to a dense Tensor of unknown shape" warning.
#NOTE(review): this call has no category or message filter, so it hides
#every warning raised after this point, not only that one.
warnings.filterwarnings('ignore')

#Load the best model for the RatingBased RatedOnly dataset from its h5 file
#See Training7 notebook for more information
model = tf.keras.models.load_model("../Model/RatedOnlyModel/Model15.h5")

In [20]:
#Score the best model on the training split, in batches of 64 and without progress output
with tf.device('/GPU:0'):
    trainingResult = model.evaluate(trainingX, trainingY, batch_size = 64, verbose = 0)

In [21]:
#Display the training metrics returned by model.evaluate.
#A later cell reads positions 0-5 as Loss, Mse, Rmse, Msle, Mae and Mape.
trainingResult

[0.028724913444547983,
 0.028633649,
 0.1692148,
 0.012069014,
 0.12644191,
 5789873.5]

In [22]:
#Score the best model on the validation split, in batches of 64 and without progress output
with tf.device('/GPU:0'):
    validationResult = model.evaluate(validationX, validationY, batch_size = 64, verbose = 0)

In [23]:
#Display the validation metrics returned by model.evaluate.
#A later cell reads positions 0-5 as Loss, Mse, Rmse, Msle, Mae and Mape.
validationResult

[0.03695747918841484,
 0.036866788,
 0.19200726,
 0.015572578,
 0.14481673,
 7129178.5]

In [24]:
#Score the best model on the test split, in batches of 64 and without progress output
with tf.device('/GPU:0'):
    testResult = model.evaluate(testX, testY, batch_size = 64, verbose = 0)

In [25]:
#Display the test metrics returned by model.evaluate.
#A later cell reads positions 0-5 as Loss, Mse, Rmse, Msle, Mae and Mape.
testResult

[0.03714138613978981,
 0.037050717,
 0.19248563,
 0.015663018,
 0.14532414,
 7102044.0]

In [26]:
#Summary table of the evaluation metrics: one row per data split.
#Positions 0-5 of every evaluate() result are read in the order listed in metricNames.
metricNames = ['Loss', 'Mse', 'Rmse', 'Msle', 'Mae', 'Mape']
evaluationResults = [trainingResult, validationResult, testResult]
metricColumns = {metricName: pd.Series([result[position] for result in evaluationResults], dtype='float')
                 for position, metricName in enumerate(metricNames)}
modelDf = pd.DataFrame(data = {'ModelData': pd.Series(['Training', 'Validation', 'Test'], dtype='str'),
                               **metricColumns})
modelDf

Unnamed: 0,ModelData,Loss,Mse,Rmse,Msle,Mae,Mape
0,Training,0.028725,0.028634,0.169215,0.012069,0.126442,5789873.5
1,Validation,0.036957,0.036867,0.192007,0.015573,0.144817,7129178.5
2,Test,0.037141,0.037051,0.192486,0.015663,0.145324,7102044.0


In [27]:
#Persist the metrics table as a pkl file, then drop the
#in-memory frame because no later cell uses it
modelDf.to_pickle("../PredictData/ModelTable/RatedOnly.pkl")
del modelDf

In [28]:
#Predict a rating for every (user, movie) pair of the training split
with tf.device('/GPU:0'):
    predictTraining = model.predict(trainingX)
predictTraining

array([[0.6843878 ],
       [0.7851418 ],
       [0.657144  ],
       ...,
       [0.6249933 ],
       [0.6668646 ],
       [0.78145236]], dtype=float32)

In [29]:
#Store the predictions next to the true ratings of the training split
#The model returns one value per row, so the output is flattened to a plain column
trainingDf['RatingPredict'] = predictTraining.reshape(-1)
trainingDf

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,3448,1096,0.666667,0.684388
1,7042,465,0.777778,0.785142
2,4214,3691,0.555556,0.657144
3,9967,767,0.888889,0.829224
4,324,1410,0.333333,0.661041
...,...,...,...,...
753677,571,7554,0.333333,0.656229
753678,4795,17048,0.333333,0.347020
753679,3685,1941,0.555556,0.624993
753680,1366,658,0.555556,0.666865


In [30]:
#Predict a rating for every (user, movie) pair of the validation split
with tf.device('/GPU:0'):
    predictValidation = model.predict(validationX)
predictValidation

array([[0.78510857],
       [0.7156863 ],
       [0.7428454 ],
       ...,
       [0.79854643],
       [0.65844965],
       [0.85336703]], dtype=float32)

In [31]:
#Store the predictions next to the true ratings of the validation split
#The model returns one value per row, so the output is flattened to a plain column
validationDf['RatingPredict'] = predictValidation.reshape(-1)
validationDf

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,7981,215,0.777778,0.785109
1,8165,3195,0.777778,0.715686
2,5490,174,0.777778,0.742845
3,5080,802,0.888889,0.762635
4,7693,344,0.555556,0.757872
...,...,...,...,...
123600,3821,265,0.888889,0.652247
123601,2074,807,1.000000,0.856629
123602,3930,1942,0.777778,0.798546
123603,8844,1395,0.777778,0.658450


In [32]:
#Predict a rating for every (user, movie) pair of the test split
with tf.device('/GPU:0'):
    predictTest = model.predict(testX)
predictTest

array([[0.6949886 ],
       [0.5770044 ],
       [0.85657793],
       ...,
       [0.8964225 ],
       [0.71449643],
       [0.7256885 ]], dtype=float32)

In [33]:
#Store the predictions next to the true ratings of the test split
#The model returns one value per row, so the output is flattened to a plain column
testDf['RatingPredict'] = predictTest.reshape(-1)
testDf

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,2092,1440,0.777778,0.694989
1,464,4450,0.777778,0.577004
2,787,617,0.777778,0.856578
3,3980,376,1.000000,0.793805
4,3111,165,0.777778,0.842390
...,...,...,...,...
123601,338,4367,0.000000,0.486816
123602,4526,1536,0.777778,0.417737
123603,8778,950,0.888889,0.896423
123604,9104,674,0.888889,0.714496


In [34]:
#Stack the training, validation and test frames (in that order)
#into one frame with a fresh 0..n-1 index
allDataFrame = pd.concat([trainingDf, validationDf, testDf], ignore_index=True)
allDataFrame

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,3448,1096,0.666667,0.684388
1,7042,465,0.777778,0.785142
2,4214,3691,0.555556,0.657144
3,9967,767,0.888889,0.829224
4,324,1410,0.333333,0.661041
...,...,...,...,...
1000888,338,4367,0.000000,0.486816
1000889,4526,1536,0.777778,0.417737
1000890,8778,950,0.888889,0.896423
1000891,9104,674,0.888889,0.714496


In [35]:
#Skeleton of the lookup table used by the prediction script: one row per user
#The user ids are taken to be 0 .. (number of distinct UserId values - 1)
#NOTE(review): this presumes UserId values are contiguous and zero based - confirm
userRange = range(len(allDataFrame['UserId'].unique()))
lookupTable = pd.DataFrame({'User' : userRange})
lookupTable

Unnamed: 0,User
0,0
1,1
2,2
3,3
4,4
...,...
10068,10068
10069,10069
10070,10070
10071,10071


In [36]:
#Per-user statistics for every split, laid out as one row per statistic:
#  rows 0/1 -> training   sample count / MSE
#  rows 2/3 -> validation sample count / MSE
#  rows 4/5 -> test       sample count / MSE
#  rows 6/7 -> all data   sample count / MSE
#Column i belongs to user id i, for the ids covered by lookupTable.
userCount = lookupTable.shape[0]
calculations = np.empty(shape = (8, userCount))

#Defaults for users that never appear in a split: zero samples, undefined MSE
calculations[0::2] = 0
calculations[1::2] = np.nan

#Each frame is grouped by user once. Filtering a frame with a boolean mask
#for every user id rescans all of its rows userCount times.
#Rows inside a group keep their original order, so CalculateMSE receives
#exactly the values the per-user filter would have selected.
for splitIndex, splitFrame in enumerate((trainingDf, validationDf, testDf, allDataFrame)):
    countRow = 2 * splitIndex
    mseRow = countRow + 1
    for userId, userFrame in splitFrame.groupby('UserId', sort = False):
        #Only ids that have a column in calculations are recorded
        if not (0 <= userId < userCount):
            continue
        calculations[countRow][userId] = userFrame.shape[0]
        calculations[mseRow][userId] = CalculateMSE(userFrame['Rating'].values, userFrame['RatingPredict'].values)

#print
calculations

array([[9.00000000e+00, 1.40000000e+01, 8.00000000e+00, ...,
        2.30000000e+01, 3.70000000e+01, 7.49000000e+02],
       [1.36842949e-02, 1.12702879e-02, 2.17711454e-03, ...,
        1.62243139e-02, 2.38392741e-02, 1.24655679e-02],
       [4.00000000e+00, 0.00000000e+00, 3.00000000e+00, ...,
        6.00000000e+00, 3.00000000e+00, 1.02000000e+02],
       ...,
       [2.78854133e-03, 1.63730881e-03,            nan, ...,
        1.43644549e-02, 2.65754191e-02, 1.51546907e-02],
       [1.60000000e+01, 1.50000000e+01, 1.10000000e+01, ...,
        3.40000000e+01, 4.20000000e+01, 9.72000000e+02],
       [9.53303078e-03, 1.06280893e-02, 1.81870743e-02, ...,
        1.37533503e-02, 2.42342339e-02, 1.26873923e-02]])

In [37]:
#Copy the per-user statistics into the lookup table: row k of calculations
#becomes the k-th column listed below
#  *Rep columns: number of samples the user has in that split, stored as int32
#  *MSE columns: mean squared error of the predictions for that user
#'TraingingRep' keeps its existing spelling because the column name is part of the saved table
statisticColumns = ['TraingingRep', 'TrainingMSE',
                    'ValidationRep', 'ValidationMSE',
                    'TestRep', 'TestMSE',
                    'AllRep', 'AllMSE']
for rowIndex, columnName in enumerate(statisticColumns):
    if columnName.endswith('Rep'):
        lookupTable[columnName] = calculations[rowIndex].astype(np.int32)
    else:
        lookupTable[columnName] = calculations[rowIndex]

lookupTable

Unnamed: 0,User,TraingingRep,TrainingMSE,ValidationRep,ValidationMSE,TestRep,TestMSE,AllRep,AllMSE
0,0,9,0.013684,4,0.005251,3,0.002789,16,0.009533
1,1,14,0.011270,0,,1,0.001637,15,0.010628
2,2,8,0.002177,3,0.060880,0,,11,0.018187
3,3,559,0.037968,91,0.048360,86,0.048441,736,0.040477
4,4,59,0.012638,9,0.004960,4,0.005814,72,0.011299
...,...,...,...,...,...,...,...,...,...
10068,10068,17,0.017288,0,,1,0.016439,18,0.017241
10069,10069,20,0.019069,2,0.010898,2,0.038896,24,0.020041
10070,10070,23,0.016224,6,0.003772,5,0.014364,34,0.013753
10071,10071,37,0.023839,3,0.027545,2,0.026575,42,0.024234


In [38]:
#Persist the lookup table as a pkl file, then release every large object this notebook still holds
lookupTable.to_pickle("../PredictData/LookupTable/RatedOnly.pkl")
del calculations, trainingDf, validationDf, testDf, allDataFrame, lookupTable