# Fourth Data Preparation Process

In this process, data will be read from the pkl file

Rated movies will be labeled as [0, 1, 2] - ['Hated', 'Not Liked', 'Liked']

[0, 2) Rating -> Hated, [2, 4) Rating -> Not Liked; [4, 5] Rating -> Liked

Dataframe that has only interacted items will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

A new dataframe will be created by adding a random sample of not interacted (user, movie) pairs to the rated items. Not interacted items will be labeled as Hated

New dataframe will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Printing the version of each library used in this notebook
for libName, lib in (('numpy', np), ('pandas', pd)):
    print(libName + ' Version: ' + lib.__version__)

numpy Version: 1.16.5
pandas Version: 0.25.1


In [3]:
#Loading the raw rating data from its pkl file
#NOTE(review): unpickling can execute arbitrary code, so this file should only come from a trusted source
ratingPklPath = "../Data/pkl/1M/RawData/Rating.pkl"
df = pd.read_pickle(ratingPklPath)
df

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [4]:
#Renaming the Rating column to Category
df = df.rename(columns={'Rating': 'Category'})
df

Unnamed: 0,UserId,MovieId,Category
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [5]:
#Converting the ratings in the Category column to categorical labels
#[0, 2) -> Hated (0), [2, 4) -> Not Liked (1); [4, 5] -> Liked (2)
#Floor division is applied to the whole column at once instead of calling a Python lambda per row
#NOTE: running this cell a second time would floor-divide the labels again
df['Category'] = (df['Category'] // 2).astype(np.int8)
#A label outside {0, 1, 2} would mean a rating outside [0, 5] was present in the raw data
assert df['Category'].isin([0, 1, 2]).all(), 'Category labels must be 0, 1 or 2'
df

Unnamed: 0,UserId,MovieId,Category
0,0,0,1
1,0,1,1
2,0,2,0
3,0,3,2
4,0,4,2
...,...,...,...
1000888,10072,12665,1
1000889,10072,6417,1
1000890,10072,9689,1
1000891,10072,22032,1


In [6]:
#Checking data type of Category column
df['Category'].dtype

dtype('int8')

In [7]:
#Checking dataframe after the ratings were converted to category labels
df

Unnamed: 0,UserId,MovieId,Category
0,0,0,1
1,0,1,1
2,0,2,0
3,0,3,2
4,0,4,2
...,...,...,...
1000888,10072,12665,1
1000889,10072,6417,1
1000890,10072,9689,1
1000891,10072,22032,1


In [8]:
#Check if any duplicate user-movie pairs exist in dataframe
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [9]:
#Data is splitting as training data (75%) and test data (25%)
#A fixed random_state makes the shuffle, and therefore the files saved later, the same on every run
trainingDf, testDf = train_test_split(df, test_size=0.25, random_state=42)
#reset_index returns a new frame, so the split result is not modified in place
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,4527,690,1
1,1879,718,2
2,4460,773,2
3,4273,1732,2
4,213,1362,1
...,...,...,...
750664,8454,568,0
750665,927,221,1
750666,9406,1620,2
750667,1470,569,2


In [10]:
#Checking test data frame (its index has not been reset yet, so the row labels of the source dataframe are shown)
testDf

Unnamed: 0,UserId,MovieId,Category
709284,7145,1777,1
101120,1044,3782,2
48041,470,2008,1
463705,4761,1043,2
933861,9362,216,2
...,...,...,...
505751,5158,159,1
496878,5051,2396,2
571722,5750,2187,2
681883,6851,701,1


In [11]:
#Test data is splitting in half as validation data and test data
#A fixed random_state makes the shuffle, and therefore the files saved later, the same on every run
#NOTE: testDf is overwritten here, so running this cell a second time halves the test data again
validationDf, testDf = train_test_split(testDf, test_size=0.5, random_state=42)
#reset_index returns a new frame, so the split result is not modified in place
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Category
0,4546,668,2
1,8099,934,1
2,5750,287,2
3,378,601,2
4,9461,857,0
...,...,...,...
125107,5572,1323,1
125108,4491,2441,2
125109,490,5365,2
125110,772,121,2


In [12]:
#Giving the test data frame a fresh 0-based index and checking it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Category
0,7845,5355,1
1,535,1334,2
2,1896,126,2
3,7824,1327,2
4,5175,3729,1
...,...,...,...
125107,1190,1583,1
125108,6733,57,2
125109,3335,1681,1
125110,1533,1740,2


In [13]:
#Saving the organized data as pkl files for future use, one file per split
for splitDf, pklPath in (
    (trainingDf, "../Data/pkl/1M/CategoricalLike/CategorizedOnly/Training.pkl"),
    (validationDf, "../Data/pkl/1M/CategoricalLike/CategorizedOnly/Validation.pkl"),
    (testDf, "../Data/pkl/1M/CategoricalLike/CategorizedOnly/Test.pkl"),
):
    splitDf.to_pickle(pklPath)

In [14]:
#Counting the distinct UserId values in the dataframe
userSize = df['UserId'].nunique(dropna=False)
userSize

10073

In [15]:
#Counting the distinct MovieId values in the dataframe
movieSize = df['MovieId'].nunique(dropna=False)
movieSize

22033

In [16]:
#Calculating sample size as one hundredth of the number of rows
#Choosing a small sample size because the dataset already has Hated data as 0 label
sampleSize = df.shape[0] // 100
sampleSize

10008

In [17]:
#Appending a sample of not categorized (user, movie) pairs to the dataframe, labeled as Hated (0)
#Known pairs are kept in a set so each membership check is a hash lookup instead of a scan over every row
seenPairs = set(zip(df['UserId'].tolist(), df['MovieId'].tolist()))
#Seeded generator so the same pairs are drawn on every run (RandomState is available in numpy 1.16)
rng = np.random.RandomState(42)
newRows = []
counter = 0
while counter < sampleSize:
    #assumes user ids cover 0..userSize-1 and movie ids cover 0..movieSize-1 - TODO confirm for other datasets
    rndUser = int(rng.randint(userSize))
    rndMovie = int(rng.randint(movieSize))
    #Skip pairs that are already in the dataframe or were already drawn in this loop
    if (rndUser, rndMovie) in seenPairs:
        continue
    seenPairs.add((rndUser, rndMovie))
    newRows.append({'UserId' : rndUser, 'MovieId' : rndMovie, 'Category' : 0})
    counter += 1
#All new rows are appended with one concat; concatenating inside the loop copies the whole frame per row
if newRows:
    df = pd.concat([df, pd.DataFrame(newRows, columns=['UserId', 'MovieId', 'Category'])], ignore_index=True)
df

Unnamed: 0,UserId,MovieId,Category
0,0,0,1
1,0,1,1
2,0,2,0
3,0,3,2
4,0,4,2
...,...,...,...
1010896,7402,4277,0
1010897,10029,20304,0
1010898,2875,1636,0
1010899,3522,3455,0


In [18]:
#Check if any duplicate user-movie pairs exist in dataframe
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [19]:
#Data is splitting as training data (75%) and test data (25%)
#A fixed random_state makes the shuffle, and therefore the files saved later, the same on every run
trainingDf, testDf = train_test_split(df, test_size=0.25, random_state=42)
#reset_index returns a new frame, so the split result is not modified in place
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,7953,1366,2
1,234,416,2
2,4431,133,2
3,2579,164,1
4,8945,1998,2
...,...,...,...
758170,6298,3160,2
758171,55,1124,2
758172,9790,4918,2
758173,4593,666,1


In [20]:
#Checking test data frame (its index has not been reset yet, so the row labels of the source dataframe are shown)
testDf

Unnamed: 0,UserId,MovieId,Category
690923,6922,1000,0
162935,1617,76,1
272559,2655,236,2
807041,8140,2480,2
627702,6316,1164,1
...,...,...,...
854093,8566,1426,2
171249,1710,263,1
429131,4376,230,2
690203,6919,224,1


In [21]:
#Test data is splitting in half as validation data and test data
#A fixed random_state makes the shuffle, and therefore the files saved later, the same on every run
#NOTE: testDf is overwritten here, so running this cell a second time halves the test data again
validationDf, testDf = train_test_split(testDf, test_size=0.5, random_state=42)
#reset_index returns a new frame, so the split result is not modified in place
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Category
0,485,5405,2
1,1971,1043,1
2,5322,7248,1
3,4147,1851,1
4,3755,1457,1
...,...,...,...
126358,7636,2734,1
126359,6254,1942,1
126360,7375,400,2
126361,7377,1230,1


In [22]:
#Giving the test data frame a fresh 0-based index and checking it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Category
0,4072,1316,1
1,2798,5033,1
2,2584,767,2
3,9091,2994,1
4,10056,15014,2
...,...,...,...
126358,767,833,1
126359,4311,322,1
126360,7262,19696,2
126361,146,324,1


In [23]:
#Saving the organized data as pkl files for future use, one file per split
for splitDf, pklPath in (
    (trainingDf, "../Data/pkl/1M/CategoricalLike/NotCategorizedSample/Training.pkl"),
    (validationDf, "../Data/pkl/1M/CategoricalLike/NotCategorizedSample/Validation.pkl"),
    (testDf, "../Data/pkl/1M/CategoricalLike/NotCategorizedSample/Test.pkl"),
):
    splitDf.to_pickle(pklPath)