# Fifth Data Preparation Process

In this process, data will be read from the pkl file

Ratings will be normalized using min max normalization

Dataframe that has only interacted items will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

A new dataframe that also includes a sample of not-interacted items will be created. The ratings of the not-interacted items will be set to 0

Ratings of new dataframe will be normalized using min max normalization

New dataframe will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Report the versions of the libraries used, one line per library
for libName, lib in (('numpy', np), ('pandas', pd)):
    print('{} Version: {}'.format(libName, lib.__version__))

numpy Version: 1.16.5
pandas Version: 0.25.1


In [3]:
#Reading raw rating data from pkl file
#NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source
df = pd.read_pickle("../Data/pkl/1M/RawData/Rating.pkl")
#Display the loaded dataframe (columns shown in the output: UserId, MovieId, Rating)
df

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [4]:
#Check whether any (UserId, MovieId) pair occurs more than once in the dataframe
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [5]:
#Getting minimum rating
#Series.min() is vectorized and skips NaN values, whereas the builtin min()
#iterates over every element in Python and gives order-dependent results with NaN
minRating = df['Rating'].min()
minRating

0.5

In [6]:
#Getting maximum rating
#Series.max() is vectorized and skips NaN values, whereas the builtin max()
#iterates over every element in Python and gives order-dependent results with NaN
maxRating = df['Rating'].max()
maxRating

5.0

In [7]:
#Work on an independent (deep) copy so the raw ratings in df stay untouched
tempDf = df.copy(deep=True)
tempDf

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [8]:
#Min-max normalization: (rating - min) / (max - min) scales the ratings into [0, 1]
tempDf['Rating'] = tempDf['Rating'].sub(minRating).div(maxRating - minRating)
tempDf

Unnamed: 0,UserId,MovieId,Rating
0,0,0,0.666667
1,0,1,0.666667
2,0,2,0.222222
3,0,3,0.888889
4,0,4,0.888889
...,...,...,...
1000888,10072,12665,0.555556
1000889,10072,6417,0.555556
1000890,10072,9689,0.555556
1000891,10072,22032,0.555556


In [9]:
#Split the data: 75% becomes training data, 25% is held out (split further below)
#NOTE(review): no random_state is passed, so the split differs on every run
trainingDf, testDf = train_test_split(tempDf, test_size=0.25)
#Replace the shuffled original row labels with a fresh 0..n-1 index
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Rating
0,718,3798,0.444444
1,5676,390,0.777778
2,2605,793,0.666667
3,8572,2762,0.000000
4,8354,1519,0.555556
...,...,...,...
750664,1926,3513,1.000000
750665,1926,215,1.000000
750666,8035,4518,0.111111
750667,7862,178,1.000000


In [10]:
#Checking the held-out data frame; its index has not been reset yet,
#so the original row labels are still displayed
testDf

Unnamed: 0,UserId,MovieId,Rating
563008,5695,707,0.777778
952677,9569,1213,0.888889
398798,4070,356,0.666667
359768,3664,29,0.555556
744548,7584,528,0.777778
...,...,...,...
554983,5625,647,0.444444
510852,5181,853,0.888889
38672,375,727,0.555556
419945,4291,473,0.333333


In [11]:
#Split the held-out data in half: one half for validation, the other for test
#NOTE(review): no random_state is passed, so the split differs on every run
validationDf, testDf = train_test_split(testDf, test_size=0.5)
#Replace the shuffled original row labels with a fresh 0..n-1 index
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Rating
0,3831,3236,0.555556
1,9346,416,1.000000
2,3958,7048,0.222222
3,8807,586,0.777778
4,2024,4725,1.000000
...,...,...,...
125107,4186,851,0.555556
125108,7752,1601,0.777778
125109,9995,1564,0.888889
125110,2740,1248,1.000000


In [12]:
#Give the test data a fresh 0..n-1 index and display it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Rating
0,7704,3933,0.222222
1,8640,759,0.777778
2,9964,306,0.555556
3,9489,1352,0.666667
4,6124,540,0.444444
...,...,...,...
125107,5356,4799,0.111111
125108,9730,13943,0.666667
125109,8653,1335,0.555556
125110,1896,1796,0.777778


In [13]:
#Save the rated-only training, validation and test splits as pkl files for future use
for splitDf, pklPath in (
    (trainingDf, "../Data/pkl/1M/RatingBased/RatedOnly/Training.pkl"),
    (validationDf, "../Data/pkl/1M/RatingBased/RatedOnly/Validation.pkl"),
    (testDf, "../Data/pkl/1M/RatingBased/RatedOnly/Test.pkl"),
):
    splitDf.to_pickle(pklPath)

In [14]:
#Number of distinct users in the raw data
#dropna=False keeps the count identical to len(unique())
userSize = df['UserId'].nunique(dropna=False)
userSize

10073

In [15]:
#Number of distinct movies in the raw data
#dropna=False keeps the count identical to len(unique())
movieSize = df['MovieId'].nunique(dropna=False)
movieSize

22033

In [16]:
#Calculating the number of unobserved (user, movie) pairs to sample.
#The goal is to predict ratings of unobserved pairs, and assigning 0 to
#unobserved ratings can decrease prediction performance, so only a small
#sample (1% of the number of observed ratings) is used to see how it
#affects the performance.
sampleSize = df.shape[0] // 100
sampleSize

10008

In [17]:
#Appending a random sample of not-interacted (user, movie) pairs to the dataframe with rating 0.
#The known pairs are kept in a set, so each membership test is a constant-time lookup
#instead of a comparison against every row, and the sampled rows are concatenated
#once at the end instead of copying the whole dataframe on every accepted draw.
#NOTE(review): ids are drawn from [0, userSize) and [0, movieSize), which assumes
#UserId and MovieId are contiguous 0-based indices - confirm against the raw data.
seenPairs = set(zip(df['UserId'].values.tolist(), df['MovieId'].values.tolist()))
newPairs = []
counter = 0
while counter < sampleSize:
    rndUser = np.random.randint(userSize)
    rndMovie = np.random.randint(movieSize)
    if (rndUser, rndMovie) not in seenPairs:
        #Remember the pair so the same unobserved pair is never sampled twice
        seenPairs.add((rndUser, rndMovie))
        newPairs.append((rndUser, rndMovie))
        counter += 1
#Skip the concat when nothing was sampled; an empty frame would alter the column dtypes
if newPairs:
    unratedDf = pd.DataFrame(newPairs, columns=['UserId', 'MovieId'])
    unratedDf['Rating'] = 0
    df = pd.concat([df, unratedDf], ignore_index=True)
df

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1010896,4548,20373,0.0
1010897,2519,1797,0.0
1010898,3277,6877,0.0
1010899,2440,21727,0.0


In [18]:
#Check whether any (UserId, MovieId) pair occurs more than once after adding the sampled pairs
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [19]:
#Getting minimum rating (now includes the 0 ratings of the sampled unobserved pairs)
#Series.min() is vectorized and skips NaN values, whereas the builtin min()
#iterates over every element in Python and gives order-dependent results with NaN
minRating = df['Rating'].min()
minRating

0.0

In [20]:
#Getting maximum rating
#Series.max() is vectorized and skips NaN values, whereas the builtin max()
#iterates over every element in Python and gives order-dependent results with NaN
maxRating = df['Rating'].max()
maxRating

5.0

In [21]:
#Min-max normalization: (rating - min) / (max - min) scales the ratings into [0, 1]
df['Rating'] = df['Rating'].sub(minRating).div(maxRating - minRating)
df

Unnamed: 0,UserId,MovieId,Rating
0,0,0,0.7
1,0,1,0.7
2,0,2,0.3
3,0,3,0.9
4,0,4,0.9
...,...,...,...
1010896,4548,20373,0.0
1010897,2519,1797,0.0
1010898,3277,6877,0.0
1010899,2440,21727,0.0


In [22]:
#Split the data: 75% becomes training data, 25% is held out (split further below)
#NOTE(review): no random_state is passed, so the split differs on every run
trainingDf, testDf = train_test_split(df, test_size=0.25)
#Replace the shuffled original row labels with a fresh 0..n-1 index
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Rating
0,5596,3959,1.0
1,2866,1564,1.0
2,1465,42,0.6
3,2982,862,0.8
4,4795,1443,0.6
...,...,...,...
758170,3387,467,0.4
758171,1371,1092,0.5
758172,5867,781,0.7
758173,8261,1433,0.6


In [23]:
#Checking the held-out data frame; its index has not been reset yet,
#so the original row labels are still displayed
testDf

Unnamed: 0,UserId,MovieId,Rating
40223,383,1563,0.6
937671,9435,230,0.7
512676,5205,5869,1.0
352486,3588,103,0.6
686759,6886,4009,0.3
...,...,...,...
763912,7743,384,0.6
609514,6196,1732,0.7
332198,3317,2029,0.8
257959,2549,165,1.0


In [24]:
#Split the held-out data in half: one half for validation, the other for test
#NOTE(review): no random_state is passed, so the split differs on every run
validationDf, testDf = train_test_split(testDf, test_size=0.5)
#Replace the shuffled original row labels with a fresh 0..n-1 index
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Rating
0,4378,764,0.7
1,2655,272,0.8
2,4185,2727,1.0
3,5218,2460,0.7
4,6690,225,0.6
...,...,...,...
126358,3161,15608,0.4
126359,2797,338,0.4
126360,5520,1308,0.8
126361,6886,179,0.8


In [25]:
#Give the test data a fresh 0..n-1 index and display it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Rating
0,5339,726,0.7
1,6121,389,0.9
2,6344,42,0.6
3,234,3126,1.0
4,1119,253,0.5
...,...,...,...
126358,9435,617,1.0
126359,8615,1065,0.8
126360,5746,18474,0.3
126361,6935,156,0.5


In [26]:
#Save the training, validation and test splits that include the unrated sample as pkl files for future use
for splitDf, pklPath in (
    (trainingDf, "../Data/pkl/1M/RatingBased/UnratedSample/Training.pkl"),
    (validationDf, "../Data/pkl/1M/RatingBased/UnratedSample/Validation.pkl"),
    (testDf, "../Data/pkl/1M/RatingBased/UnratedSample/Test.pkl"),
):
    splitDf.to_pickle(pklPath)