# RatingBased UnratedSample Model Prediction Preparation

This notebook prepares the RatingBased UnratedSample model table for use in the Prediction notebook.

This table will be used to supply extra information about the recommendations.

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
from sklearn.metrics import mean_squared_error as CalculateMSE

In [2]:
#Report the versions of the core libraries this notebook was run with
print('numpy Version: ', np.__version__, sep = '')
print('pandas Version: ', pd.__version__, sep = '')
print('tensorflow Version: ', tf.__version__, sep = '')

numpy Version: 1.16.5
pandas Version: 0.25.1
tensorflow Version: 2.0.0


In [3]:
#The GPU is used for prediction and evaluation: show its device name, or "Error" when TensorFlow reports none
myGPU = tf.test.gpu_device_name()
print(myGPU if myGPU else "Error")

/device:GPU:0


In [4]:
#Load the Qualified RatingBased UnratedSample training split (UserId, MovieId, Rating) from its pkl file
#NOTE(review): unpickling can execute arbitrary code, so only load pkl files produced by this project
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/RatingBased/UnratedSample/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Rating
0,5567,5871,0.6
1,8242,9624,0.1
2,4691,285,0.6
3,6937,2827,0.6
4,6948,4235,1.0
...,...,...,...
760698,939,838,1.0
760699,3757,806,0.4
760700,7723,5315,0.8
760701,4152,1670,0.8


In [5]:
#1-D int32 vector of user ids: the first of the two training inputs
trainingUser = trainingDf['UserId'].values.astype(np.int32)
trainingUser

array([5567, 8242, 4691, ..., 7723, 4152, 3158])

In [6]:
#1-D int32 vector of movie ids: the second of the two training inputs
trainingMovie = trainingDf['MovieId'].values.astype(np.int32)
trainingMovie

array([5871, 9624,  285, ..., 5315, 1670,  247])

In [7]:
#Training input for the model: a two-element list [user ids, movie ids] of equally long 1-D arrays
trainingX = [trainingUser, trainingMovie]
trainingX

[array([5567, 8242, 4691, ..., 7723, 4152, 3158]),
 array([5871, 9624,  285, ..., 5315, 1670,  247])]

In [8]:
#1-D float32 vector of ratings: the training target
trainingY = trainingDf['Rating'].values.astype(np.float32)
trainingY

array([0.6, 0.1, 0.6, ..., 0.8, 0.8, 0.8], dtype=float32)

In [9]:
#Load the Qualified RatingBased UnratedSample validation split (UserId, MovieId, Rating) from its pkl file
#NOTE(review): unpickling can execute arbitrary code, so only load pkl files produced by this project
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/RatingBased/UnratedSample/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Rating
0,7411,7484,0.8
1,2024,6413,0.5
2,9999,8864,0.8
3,1522,626,0.7
4,4208,566,0.9
...,...,...,...
125094,3440,224,0.6
125095,4940,28,0.7
125096,7697,945,0.8
125097,4064,139,0.6


In [10]:
#1-D int32 vector of user ids: the first of the two validation inputs
validationUser = validationDf['UserId'].values.astype(np.int32)
validationUser

array([7411, 2024, 9999, ..., 7697, 4064, 3981])

In [11]:
#1-D int32 vector of movie ids: the second of the two validation inputs
validationMovie = validationDf['MovieId'].values.astype(np.int32)
validationMovie

array([ 7484,  6413,  8864, ...,   945,   139, 17578])

In [12]:
#Validation input for the model: a two-element list [user ids, movie ids] of equally long 1-D arrays
validationX = [validationUser, validationMovie]
validationX

[array([7411, 2024, 9999, ..., 7697, 4064, 3981]),
 array([ 7484,  6413,  8864, ...,   945,   139, 17578])]

In [13]:
#1-D float32 vector of ratings: the validation target
validationY = validationDf['Rating'].values.astype(np.float32)
validationY

array([0.8, 0.5, 0.8, ..., 0.8, 0.6, 0. ], dtype=float32)

In [14]:
#Load the Qualified RatingBased UnratedSample test split (UserId, MovieId, Rating) from its pkl file
#NOTE(review): unpickling can execute arbitrary code, so only load pkl files produced by this project
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/RatingBased/UnratedSample/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Rating
0,2557,168,1.0
1,7470,242,0.6
2,6915,1400,0.8
3,1066,54,0.4
4,2037,159,0.4
...,...,...,...
125094,5060,2460,0.7
125095,7302,701,0.7
125096,7724,215,0.1
125097,718,1117,0.6


In [15]:
#1-D int32 vector of user ids: the first of the two test inputs
testUser = testDf['UserId'].values.astype(np.int32)
testUser

array([2557, 7470, 6915, ..., 7724,  718, 7349])

In [16]:
#1-D int32 vector of movie ids: the second of the two test inputs
testMovie = testDf['MovieId'].values.astype(np.int32)
testMovie

array([ 168,  242, 1400, ...,  215, 1117,  416])

In [17]:
#Test input for the model: a two-element list [user ids, movie ids] of equally long 1-D arrays
testX = [testUser, testMovie]
testX

[array([2557, 7470, 6915, ..., 7724,  718, 7349]),
 array([ 168,  242, 1400, ...,  215, 1117,  416])]

In [18]:
#1-D float32 vector of ratings: the test target
testY = testDf['Rating'].values.astype(np.float32)
testY

array([1. , 0.6, 0.8, ..., 0.1, 0.6, 1. ], dtype=float32)

In [19]:
#Suppress only the known "Converting sparse IndexedSlices to a dense Tensor of unknown shape" warning.
#A blanket filterwarnings('ignore') would also hide every unrelated warning for the rest of the notebook.
#The message argument is a regular expression matched against the start of the warning text.
warnings.filterwarnings('ignore', message = 'Converting sparse IndexedSlices')

#Load the best model for the RatingBased UnratedSample dataset from its h5 file
#See Training8 notebook for more information
model = tf.keras.models.load_model("../Model/UnratedSampleModel/Model9.h5")

In [20]:
#Evaluate the loaded model on the training split (GPU, batches of 64, no progress output)
with tf.device('/GPU:0'):
    trainingResult = model.evaluate(trainingX, trainingY, batch_size = 64, verbose = 0)

In [21]:
#Display the training evaluation result; entries 0-5 are stored further down as Loss, Mse, Rmse, Msle, Mae, Mape
trainingResult

[0.025242622517412397,
 0.025198441,
 0.15874016,
 0.010300158,
 0.11821742,
 3273819.8]

In [22]:
#Evaluate the loaded model on the validation split (GPU, batches of 64, no progress output)
with tf.device('/GPU:0'):
    validationResult = model.evaluate(validationX, validationY, batch_size = 64, verbose = 0)

In [23]:
#Display the validation evaluation result; entries 0-5 are stored further down as Loss, Mse, Rmse, Msle, Mae, Mape
validationResult

[0.03395902040571975,
 0.03391503,
 0.18416034,
 0.014048368,
 0.13675256,
 4935205.5]

In [24]:
#Evaluate the loaded model on the test split (GPU, batches of 64, no progress output)
with tf.device('/GPU:0'):
    testResult = model.evaluate(testX, testY, batch_size = 64, verbose = 0)

In [25]:
#Display the test evaluation result; entries 0-5 are stored further down as Loss, Mse, Rmse, Msle, Mae, Mape
testResult

[0.033637687197022134,
 0.033593655,
 0.18328571,
 0.013907214,
 0.13622218,
 4838788.5]

In [26]:
#Tabulate the evaluation metrics: one row per data split, one float column per metric.
#Column order follows the order of the values returned by model.evaluate (index 0 = Loss ... index 5 = Mape).
metricColumns = ['Loss', 'Mse', 'Rmse', 'Msle', 'Mae', 'Mape']
splitResults = [trainingResult, validationResult, testResult]

modelColumns = {'ModelData': pd.Series(['Training', 'Validation', 'Test'], dtype='str')}
modelColumns.update({metricName: pd.Series([splitResult[position] for splitResult in splitResults], dtype='float')
                     for position, metricName in enumerate(metricColumns)})

modelDf = pd.DataFrame(data = modelColumns)
modelDf

Unnamed: 0,ModelData,Loss,Mse,Rmse,Msle,Mae,Mape
0,Training,0.025243,0.025198,0.15874,0.0103,0.118217,3273819.75
1,Validation,0.033959,0.033915,0.18416,0.014048,0.136753,4935205.5
2,Test,0.033638,0.033594,0.183286,0.013907,0.136222,4838788.5


In [27]:
#Save modelDf as a pkl file for the prediction notebook, then delete it to free memory
modelDf.to_pickle("../PredictData/ModelTable/UnratedSample.pkl")
del modelDf

In [28]:
#Predict a rating for every (user, movie) pair of the training split on the GPU
with tf.device('/GPU:0'):
    predictTraining = model.predict(trainingX)
predictTraining

array([[0.48662102],
       [0.27478498],
       [0.62577933],
       ...,
       [0.7395843 ],
       [0.8074235 ],
       [0.71187705]], dtype=float32)

In [29]:
#Attach the predictions to trainingDf; the model output has one column per row, so take it as a flat vector
trainingDf['RatingPredict'] = predictTraining[:, 0]
trainingDf

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,5567,5871,0.6,0.486621
1,8242,9624,0.1,0.274785
2,4691,285,0.6,0.625779
3,6937,2827,0.6,0.774741
4,6948,4235,1.0,0.411200
...,...,...,...,...
760698,939,838,1.0,0.745899
760699,3757,806,0.4,0.702293
760700,7723,5315,0.8,0.739584
760701,4152,1670,0.8,0.807423


In [30]:
#Predict a rating for every (user, movie) pair of the validation split on the GPU
with tf.device('/GPU:0'):
    predictValidation = model.predict(validationX)
predictValidation

array([[0.508759  ],
       [0.28457737],
       [0.7178873 ],
       ...,
       [0.60232294],
       [0.6699574 ],
       [0.5437133 ]], dtype=float32)

In [31]:
#Attach the predictions to validationDf; the model output has one column per row, so take it as a flat vector
validationDf['RatingPredict'] = predictValidation[:, 0]
validationDf

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,7411,7484,0.8,0.508759
1,2024,6413,0.5,0.284577
2,9999,8864,0.8,0.717887
3,1522,626,0.7,0.735836
4,4208,566,0.9,0.721125
...,...,...,...,...
125094,3440,224,0.6,0.713267
125095,4940,28,0.7,0.728433
125096,7697,945,0.8,0.602323
125097,4064,139,0.6,0.669957


In [32]:
#Predict a rating for every (user, movie) pair of the test split on the GPU
with tf.device('/GPU:0'):
    predictTest = model.predict(testX)
predictTest

array([[0.96827376],
       [0.5825507 ],
       [0.8801249 ],
       ...,
       [0.5020613 ],
       [0.73383987],
       [0.9608302 ]], dtype=float32)

In [33]:
#Attach the predictions to testDf; the model output has one column per row, so take it as a flat vector
testDf['RatingPredict'] = predictTest[:, 0]
testDf

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,2557,168,1.0,0.968274
1,7470,242,0.6,0.582551
2,6915,1400,0.8,0.880125
3,1066,54,0.4,0.824237
4,2037,159,0.4,0.549943
...,...,...,...,...
125094,5060,2460,0.7,0.694268
125095,7302,701,0.7,0.790328
125096,7724,215,0.1,0.502061
125097,718,1117,0.6,0.733840


In [34]:
#Stack the three splits (training, validation, test - in that order) into one frame with a fresh 0..n-1 index
allDataFrame = pd.concat([trainingDf, validationDf, testDf], ignore_index = True)
allDataFrame

Unnamed: 0,UserId,MovieId,Rating,RatingPredict
0,5567,5871,0.6,0.486621
1,8242,9624,0.1,0.274785
2,4691,285,0.6,0.625779
3,6937,2827,0.6,0.774741
4,6948,4235,1.0,0.411200
...,...,...,...,...
1010896,5060,2460,0.7,0.694268
1010897,7302,701,0.7,0.790328
1010898,7724,215,0.1,0.502061
1010899,718,1117,0.6,0.733840


In [35]:
#Build the lookup table used by the prediction script: one row per distinct UserId across all splits
#NOTE(review): the User column is simply 0..n-1, which presumably assumes UserIds are contiguous from 0 - confirm
userRange = range(len(allDataFrame['UserId'].unique()))
lookupTable = pd.DataFrame({'User' : userRange})
lookupTable

Unnamed: 0,User
0,0
1,1
2,2
3,3
4,4
...,...
10068,10068
10069,10069
10070,10070
10071,10071


In [36]:
#Calculations for each user
#calculations has shape (8, number of users); column i belongs to UserId i and the rows are, in order:
#training count, training MSE, validation count, validation MSE, test count, test MSE, all-data count, all-data MSE.
#Each split is aggregated with one groupby pass instead of filtering every frame once per user.
def _countAndMsePerUser(frame, userCount):
    """Return per-user row counts and rating MSE for one data split.

    Parameters
    ----------
    frame : DataFrame with 'UserId', 'Rating' and 'RatingPredict' columns.
    userCount : number of users; results are reported for UserIds 0..userCount-1.

    Returns
    -------
    (counts, mse) : two float64 arrays of length userCount. A user without rows in
    frame gets a count of 0 and an MSE of NaN; UserIds outside 0..userCount-1 are ignored.
    """
    squaredError = (frame['Rating'].astype(np.float64) - frame['RatingPredict'].astype(np.float64)) ** 2
    #Group positionally by the raw id values so the frame's own index cannot affect the alignment
    perUser = squaredError.groupby(frame['UserId'].values)
    userIds = np.arange(userCount)
    counts = perUser.size().reindex(userIds, fill_value = 0).values.astype(np.float64)
    mse = perUser.mean().reindex(userIds).values.astype(np.float64)
    return counts, mse

userCount = lookupTable.shape[0]
calculations = np.vstack([statRow
                          for splitFrame in (trainingDf, validationDf, testDf, allDataFrame)
                          for statRow in _countAndMsePerUser(splitFrame, userCount)])

#print
calculations

array([[1.40000000e+01, 9.00000000e+00, 7.00000000e+00, ...,
        2.90000000e+01, 3.60000000e+01, 7.21000000e+02],
       [3.94168120e-02, 7.00260498e-03, 5.23153031e-03, ...,
        2.37067217e-02, 2.15041978e-02, 1.09779607e-02],
       [3.00000000e+00, 3.00000000e+00, 3.00000000e+00, ...,
        3.00000000e+00, 4.00000000e+00, 1.38000000e+02],
       ...,
       [1.27656424e-02, 6.88059285e-02, 3.37427079e-02, ...,
        1.94778854e-02, 1.56523457e-02, 1.05406352e-02],
       [1.90000000e+01, 1.60000000e+01, 1.10000000e+01, ...,
        3.80000000e+01, 4.30000000e+01, 9.73000000e+02],
       [3.65914463e-02, 2.32352559e-02, 2.78204336e-02, ...,
        4.90352643e-02, 2.23597922e-02, 1.05265103e-02]])

In [37]:
#Copy the per-user calculations into the lookup table.
#For each split, the even row of calculations is the number of rows the user has in that split (stored as int32)
#and the following odd row is the user's mean squared error on that split.
#The 'TraingingRep' spelling is kept as-is because the column name is part of the saved table.
lookupColumnPairs = [('TraingingRep', 'TrainingMSE'),
                     ('ValidationRep', 'ValidationMSE'),
                     ('TestRep', 'TestMSE'),
                     ('AllRep', 'AllMSE')]

for pairIndex, (repColumn, mseColumn) in enumerate(lookupColumnPairs):
    lookupTable[repColumn] = calculations[2 * pairIndex].astype(np.int32)
    lookupTable[mseColumn] = calculations[2 * pairIndex + 1]

lookupTable

Unnamed: 0,User,TraingingRep,TrainingMSE,ValidationRep,ValidationMSE,TestRep,TestMSE,AllRep,AllMSE
0,0,14,0.039417,3,0.039290,2,0.012766,19,0.036591
1,1,9,0.007003,3,0.011172,4,0.068806,16,0.023235
2,2,7,0.005232,3,0.078554,1,0.033743,11,0.027820
3,3,534,0.032893,97,0.038765,107,0.032661,738,0.033631
4,4,61,0.007558,5,0.002593,7,0.009588,73,0.007413
...,...,...,...,...,...,...,...,...,...
10068,10068,12,0.017094,4,0.003066,3,0.081436,19,0.024300
10069,10069,18,0.018858,3,0.012827,4,0.015387,25,0.017579
10070,10070,29,0.023707,3,0.352993,6,0.019478,38,0.049035
10071,10071,36,0.021504,4,0.035091,3,0.015652,43,0.022360


In [38]:
#Save the per-user lookup table as a pkl file, then delete the large intermediates to free memory
lookupTable.to_pickle("../PredictData/LookupTable/UnratedSample.pkl")
del calculations
del trainingDf
del validationDf
del testDf
del allDataFrame
del lookupTable