# Third Data Preparation Process

In this process, data will be read from the pkl file

Rated movies will be labeled as liked or not liked (ratings below 3.0 are labeled as not liked)

Dataframe that has only interacted items will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

A new dataframe that also includes a sample of not interacted (unobserved) user-movie pairs will be created. Not interacted items will be labeled as not liked

New dataframe will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Reporting the versions of the numerical libraries in use
print('numpy Version: {}'.format(np.__version__))
print('pandas Version: {}'.format(pd.__version__))

numpy Version: 1.16.5
pandas Version: 0.25.1


In [3]:
#Loading the raw rating table from its pkl file and displaying it
ratingPklPath = "../Data/pkl/1M/RawData/Rating.pkl"
df = pd.read_pickle(ratingPklPath)
df

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [4]:
#Renaming the Rating column to Like
df = df.rename(columns={'Rating': 'Like'})
df

Unnamed: 0,UserId,MovieId,Like
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [5]:
#Converting the Like column into a binary label
#Ratings below 3.0 are labeled as not liked (0); every other value is labeled as liked (1)
#The comparison is vectorized instead of calling a Python lambda once per row.
#Negating "< 3.0" keeps the original rule that anything not below 3.0 (NaN included) maps to 1
df['Like'] = (~(df['Like'] < 3.0)).astype(np.int8)
df

Unnamed: 0,UserId,MovieId,Like
0,0,0,1
1,0,1,1
2,0,2,0
3,0,3,1
4,0,4,1
...,...,...,...
1000888,10072,12665,1
1000889,10072,6417,1
1000890,10072,9689,1
1000891,10072,22032,1


In [6]:
#Checking data type of Like column
df['Like'].dtype

dtype('int8')

In [7]:
#Displaying the dataframe to inspect the binary Like labels
df

Unnamed: 0,UserId,MovieId,Like
0,0,0,1
1,0,1,1
2,0,2,0
3,0,3,1
4,0,4,1
...,...,...,...
1000888,10072,12665,1
1000889,10072,6417,1
1000890,10072,9689,1
1000891,10072,22032,1


In [8]:
#Verifying that no user-movie pair occurs more than once in the dataframe
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [9]:
#Splitting the data: 75% becomes training data, 25% is held out for validation and test
#random_state is fixed so that re-running the notebook reproduces the same split
trainingDf, testDf = train_test_split(df, test_size=0.25, random_state=42)
#Replacing the shuffled original row labels with a fresh 0-based index
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,5874,4300,1
1,3233,1606,1
2,6434,8427,1
3,3,716,1
4,250,1166,1
...,...,...,...
750664,9521,198,1
750665,1867,668,1
750666,143,1145,1
750667,6423,2270,1


In [10]:
#Displaying the held-out part of the split (its index has not been reset yet)
testDf

Unnamed: 0,UserId,MovieId,Like
114197,1139,668,1
531701,5395,786,1
826977,8317,6204,1
679529,6835,1681,0
468317,4795,785,1
...,...,...,...
893885,8945,510,1
814073,8207,2356,1
386208,3969,484,1
38786,376,1589,1


In [11]:
#Splitting the held-out data in half: one half becomes validation data, the other stays as test data
#random_state is fixed so that re-running the notebook reproduces the same split
validationDf, testDf = train_test_split(testDf, test_size=0.5, random_state=42)
#Replacing the shuffled original row labels with a fresh 0-based index
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Like
0,426,2667,1
1,2492,376,1
2,4214,2352,1
3,50,1821,1
4,1751,258,1
...,...,...,...
125107,6943,922,1
125108,8234,416,1
125109,9426,19,1
125110,5977,1441,1


In [12]:
#Giving the test data frame a fresh 0-based index and displaying it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Like
0,4253,34,1
1,5162,660,1
2,5500,401,1
3,1463,770,1
4,9813,882,0
...,...,...,...
125107,938,106,0
125108,1922,3903,0
125109,5746,2848,0
125110,7322,106,0


In [13]:
#Saving the three observed-only splits as pkl files for future use
for splitDf, pklPath in (
    (trainingDf, "../Data/pkl/1M/BinaryLike/ObservedOnly/Training.pkl"),
    (validationDf, "../Data/pkl/1M/BinaryLike/ObservedOnly/Validation.pkl"),
    (testDf, "../Data/pkl/1M/BinaryLike/ObservedOnly/Test.pkl"),
):
    splitDf.to_pickle(pklPath)

In [14]:
#Counting the distinct user ids (missing values, if any, count as one id)
userSize = df['UserId'].nunique(dropna=False)
userSize

10073

In [15]:
#Counting the distinct movie ids (missing values, if any, count as one id)
movieSize = df['MovieId'].nunique(dropna=False)
movieSize

22033

In [16]:
#Calculating sample size
#Choosing a small sample size (1% of the rows) because the dataset already has not liked data as 0 values
sampleSize = df.shape[0] // 100
sampleSize

10008

In [17]:
#Appending a sample of unobserved user-movie pairs to the dataframe as not liked (Like = 0)
#Ids are drawn from [0, userSize) and [0, movieSize)
#NOTE(review): this assumes the ids are contiguous and 0-based - confirm against the raw data
#The loop only ends once sampleSize unobserved pairs have been found
#A set of already used pairs makes each membership test O(1) instead of scanning the whole dataframe,
#and the new rows are concatenated once at the end instead of rebuilding the dataframe per sample
seenPairs = set(zip(df['UserId'].tolist(), df['MovieId'].tolist()))
sampledRows = []
counter = 0
while counter < sampleSize:
    rndUser = np.random.randint(userSize)
    rndMovie = np.random.randint(movieSize)
    if (rndUser, rndMovie) not in seenPairs:
        #Remembering the pair so it cannot be sampled a second time
        seenPairs.add((rndUser, rndMovie))
        sampledRows.append({'UserId' : rndUser, 'MovieId' : rndMovie, 'Like' : 0})
        counter += 1
#Skipping the concat when nothing was sampled keeps the dataframe untouched, as in the original loop
if sampledRows:
    df = pd.concat([df, pd.DataFrame(sampledRows)], ignore_index=True)
df

Unnamed: 0,UserId,MovieId,Like
0,0,0,1
1,0,1,1
2,0,2,0
3,0,3,1
4,0,4,1
...,...,...,...
1010896,8475,15581,0
1010897,9031,7559,0
1010898,2655,6061,0
1010899,7030,10230,0


In [18]:
#Verifying that no user-movie pair occurs more than once after the unobserved sample was appended
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [19]:
#Splitting the data: 75% becomes training data, 25% is held out for validation and test
#random_state is fixed so that re-running the split on the same dataframe gives the same result
trainingDf, testDf = train_test_split(df, test_size=0.25, random_state=42)
#Replacing the shuffled original row labels with a fresh 0-based index
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,8085,441,1
1,7283,274,1
2,9134,2339,1
3,8584,689,1
4,3967,62,1
...,...,...,...
758170,5350,142,1
758171,8242,6928,0
758172,4026,3310,0
758173,2071,2642,1


In [20]:
#Displaying the held-out part of the split (its index has not been reset yet)
testDf

Unnamed: 0,UserId,MovieId,Like
332747,3324,3857,1
36354,350,121,1
486232,4925,5339,0
981788,9877,5643,1
191193,1932,12062,1
...,...,...,...
205713,2039,8919,1
668167,6726,64,1
737118,7463,2265,1
661779,6657,2384,1


In [21]:
#Splitting the held-out data in half: one half becomes validation data, the other stays as test data
#random_state is fixed so that re-running the split on the same dataframe gives the same result
validationDf, testDf = train_test_split(testDf, test_size=0.5, random_state=42)
#Replacing the shuffled original row labels with a fresh 0-based index
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Like
0,8223,750,1
1,4208,167,0
2,8140,575,1
3,9649,1182,1
4,4434,1534,1
...,...,...,...
126358,1328,4126,1
126359,6387,132,1
126360,5987,1489,1
126361,6205,2649,1


In [22]:
#Giving the test data frame a fresh 0-based index and displaying it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Like
0,6835,750,1
1,9439,3137,0
2,5695,1336,1
3,9308,548,1
4,8489,1308,1
...,...,...,...
126358,3069,882,0
126359,8357,978,1
126360,6335,786,1
126361,7604,5,0


In [23]:
#Saving the three splits that include the unobserved sample as pkl files for future use
for splitDf, pklPath in (
    (trainingDf, "../Data/pkl/1M/BinaryLike/UnobservedSample/Training.pkl"),
    (validationDf, "../Data/pkl/1M/BinaryLike/UnobservedSample/Validation.pkl"),
    (testDf, "../Data/pkl/1M/BinaryLike/UnobservedSample/Test.pkl"),
):
    splitDf.to_pickle(pklPath)