# Second Data Preparation Process

In this process, data will be read from the pkl file

Rated movies will be labeled as interacted items

Dataframe that has only interacted items will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

A new dataframe that also includes a random sample of non-interacted (user, movie) pairs will be created

New dataframe will be split into training, validation and test data

Training, validation and test data will be saved as pkl files for future use

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Print the version of each library so the run can be reproduced later
for label, module in (('numpy Version: ', np), ('pandas Version: ', pd)):
    print(label + module.__version__)

numpy Version: 1.16.5
pandas Version: 0.25.1


In [3]:
#Load the raw rating table from its pkl file and display it
#NOTE: pickle files can execute code when loaded, so only read trusted files
ratingPath = "../Data/pkl/1M/RawData/Rating.pkl"
df = pd.read_pickle(ratingPath)
df

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [4]:
#Rename the Rating column to Interaction and display the result
df = df.rename(columns={'Rating': 'Interaction'})
df

Unnamed: 0,UserId,MovieId,Interaction
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [5]:
#Convert the table to a binary interaction table: every row is a rated
#movie, so its Interaction value is overwritten with 1 (stored as int8)
df = df.assign(Interaction=np.int8(1))
df

Unnamed: 0,UserId,MovieId,Interaction
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
1000888,10072,12665,1
1000889,10072,6417,1
1000890,10072,9689,1
1000891,10072,22032,1


In [6]:
#Confirm the Interaction column is stored with the compact int8 dtype
df.dtypes['Interaction']

dtype('int8')

In [7]:
#Display the dataframe to check the UserId, MovieId and Interaction columns
df

Unnamed: 0,UserId,MovieId,Interaction
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
1000888,10072,12665,1
1000889,10072,6417,1
1000890,10072,9689,1
1000891,10072,22032,1


In [8]:
#Check whether any (UserId, MovieId) pair occurs more than once in the dataframe
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [9]:
#Split the data: 75% becomes training data, the remaining 25% is held out
#(the held-out part is split again into validation and test data later)
trainingDf, testDf = train_test_split(df, test_size=0.25)
#Renumber the shuffled training rows from 0
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Interaction
0,9608,768,1
1,7750,2229,1
2,6702,3129,1
3,9108,208,1
4,9684,7155,1
...,...,...,...
750664,1722,1830,1
750665,4394,6984,1
750666,981,3083,1
750667,4624,958,1


In [10]:
#Display the held-out test dataframe (its index still holds the row labels
#of the original dataframe because it has not been reset yet)
testDf

Unnamed: 0,UserId,MovieId,Interaction
432936,4423,1353,1
352390,3587,1475,1
222469,2173,2718,1
107126,1088,3875,1
379170,3898,4866,1
...,...,...,...
116773,1164,119,1
350326,3553,965,1
855629,8572,5153,1
426883,4362,199,1


In [11]:
#Split the held-out data in half: one half becomes validation data,
#the other half replaces testDf as the final test data
validationDf, testDf = train_test_split(testDf, test_size=0.5)
#Renumber the validation rows from 0
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Interaction
0,718,1903,1
1,2830,3528,1
2,9138,179,1
3,4026,504,1
4,8566,7612,1
...,...,...,...
125107,4291,251,1
125108,7924,122,1
125109,2835,1564,1
125110,2427,10124,1


In [12]:
#Renumber the rows of the final test data from 0 and display it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Interaction
0,3331,70,1
1,9119,380,1
2,1411,530,1
3,8846,1582,1
4,4289,309,1
...,...,...,...
125107,5743,3202,1
125108,5950,1535,1
125109,213,4928,1
125110,4972,108,1


In [13]:
#Save the three interacted-only splits as pkl files for future use
for frame, path in (
    (trainingDf, "../Data/pkl/1M/BinaryInteraction/InteractedOnly/Training.pkl"),
    (validationDf, "../Data/pkl/1M/BinaryInteraction/InteractedOnly/Validation.pkl"),
    (testDf, "../Data/pkl/1M/BinaryInteraction/InteractedOnly/Test.pkl"),
):
    frame.to_pickle(path)

In [14]:
#Count the distinct user ids in the dataframe
userSize = df['UserId'].nunique(dropna=False)
userSize

10073

In [15]:
#Count the distinct movie ids in the dataframe
movieSize = df['MovieId'].nunique(dropna=False)
movieSize

22033

In [16]:
#Sample size is one third of the dataframe's row count (integer division)
sampleSize = df.shape[0] // 3
sampleSize

333631

In [17]:
#Appending a random sample of not interacted (user, movie) pairs to the dataframe
#
#Every pair already in the dataframe is kept in a set, so each membership test
#is O(1). Accepted pairs are collected in a list and appended with a single
#concat, instead of rescanning and re-copying the whole dataframe for every
#sampled pair.
#
#NOTE: ids are drawn from [0, userSize) and [0, movieSize), which assumes the
#ids are contiguous and start at 0 - TODO confirm against the raw data.
#NOTE: the loop only ends if at least sampleSize unused pairs exist.
seenPairs = set(zip(df['UserId'].tolist(), df['MovieId'].tolist()))
negativeRows = []
counter = 0
while counter < sampleSize:
    rndUser = np.random.randint(userSize)
    rndMovie = np.random.randint(movieSize)
    if (rndUser, rndMovie) not in seenPairs:
        counter += 1
        #Remember the pair so the same negative sample is never accepted twice
        seenPairs.add((rndUser, rndMovie))
        negativeRows.append((rndUser, rndMovie, 0))
#Skip the concat when nothing was sampled so the column dtypes stay untouched
if negativeRows:
    negativeDf = pd.DataFrame(negativeRows, columns=['UserId', 'MovieId', 'Interaction'])
    df = pd.concat([df, negativeDf], ignore_index=True)
df

Unnamed: 0,UserId,MovieId,Interaction
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
1334519,8576,3189,0
1334520,7734,207,0
1334521,4637,9821,0
1334522,6834,3946,0


In [18]:
#Check whether any (UserId, MovieId) pair occurs more than once after the
#not interacted samples were appended
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [19]:
#Split the data: 75% becomes training data, the remaining 25% is held out
#(the held-out part is split again into validation and test data later)
trainingDf, testDf = train_test_split(df, test_size=0.25)
#Renumber the shuffled training rows from 0
trainingDf = trainingDf.reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Interaction
0,234,3490,1
1,8276,2698,1
2,6124,18989,1
3,9040,4706,1
4,3843,1576,1
...,...,...,...
1000888,9095,2752,1
1000889,2024,4478,1
1000890,2464,15523,0
1000891,1578,1857,0


In [20]:
#Display the held-out test dataframe (its index still holds the row labels
#of the original dataframe because it has not been reset yet)
testDf

Unnamed: 0,UserId,MovieId,Interaction
1080242,7919,19732,0
130745,1274,1340,1
550165,5589,1339,1
1293610,9020,16927,0
1250705,772,15071,0
...,...,...,...
8233,80,2740,1
113460,1132,215,1
431934,4405,1038,1
1210590,5342,15996,0


In [21]:
#Split the held-out data in half: one half becomes validation data,
#the other half replaces testDf as the final test data
validationDf, testDf = train_test_split(testDf, test_size=0.5)
#Renumber the validation rows from 0
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Interaction
0,4329,1456,1
1,1276,13547,0
2,4325,421,0
3,5368,5840,1
4,2501,80,1
...,...,...,...
166810,1023,17637,0
166811,7704,8313,1
166812,537,4776,0
166813,9358,385,1


In [22]:
#Renumber the rows of the final test data from 0 and display it
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Interaction
0,7677,2663,1
1,7202,741,1
2,2102,919,1
3,4105,1046,1
4,7207,165,1
...,...,...,...
166811,8917,62,1
166812,8582,227,1
166813,803,761,1
166814,4604,693,1


In [23]:
#Save the three splits that include not interacted samples as pkl files
for frame, path in (
    (trainingDf, "../Data/pkl/1M/BinaryInteraction/NotInteractedSample/Training.pkl"),
    (validationDf, "../Data/pkl/1M/BinaryInteraction/NotInteractedSample/Validation.pkl"),
    (testDf, "../Data/pkl/1M/BinaryInteraction/NotInteractedSample/Test.pkl"),
):
    frame.to_pickle(path)