# Quality of BinaryLike Datasets

In this process, the quality of both BinaryLike datasets is checked and, where necessary, improved.

Null values and duplicates will be checked

User and Movie size will be checked

The representativeness of the training dataset will be checked and improved.

The organized data will be saved as pkl files for future use.

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Report the version of every numerical library this notebook relies on
for label, module in (('numpy Version: ', np), ('pandas Version: ', pd)):
    print(label + module.__version__)

numpy Version: 1.16.5
pandas Version: 0.25.1


In [3]:
#Load the raw rating data from its pkl file
ratingDf = pd.read_pickle("../Data/pkl/1M/RawData/Rating.pkl")
#Display the frame as the cell output
ratingDf

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [4]:
#True if at least one cell of ratingDf is missing (NaN/None)
ratingDf.isna().to_numpy().any()

False

In [5]:
#True if any (UserId, MovieId) pair occurs more than once in the dataframe
ratingDf.duplicated(subset=['UserId', 'MovieId']).any()

False

In [6]:
#Count the distinct user ids present in the raw rating data
userSize = len(ratingDf['UserId'].unique())
userSize

10073

In [7]:
#Count the distinct movie ids present in the raw rating data
movieSize = len(ratingDf['MovieId'].unique())
movieSize

22033

In [8]:
#Load the raw movie data from its pkl file
movie = pd.read_pickle("../Data/pkl/1M/RawData/Movie.pkl")
#Display the frame as the cell output
movie

Unnamed: 0,MovieId,Title
0,0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
1,1,Kalifornia (1993)
2,2,Weekend at Bernie's (1989)
3,3,Better Off Dead... (1985)
4,4,Waiting for Guffman (1996)
...,...,...
22028,22028,London Paris New York (2012)
22029,22029,Wild Zero (2000)
22030,22030,Mr. Accident (2000)
22031,22031,Max Steel (2016)


In [9]:
#Row count of the movie table, shown so it can be compared with movieSize
len(movie)

22033

In [10]:
#Load the BinaryLike ObservedOnly training data from its pkl file
trainingDf = pd.read_pickle("../Data/pkl/1M/BinaryLike/ObservedOnly/Training.pkl")
#Display the frame as the cell output
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,5874,4300,1
1,3233,1606,1
2,6434,8427,1
3,3,716,1
4,250,1166,1
...,...,...,...
750664,9521,198,1
750665,1867,668,1
750666,143,1145,1
750667,6423,2270,1


In [11]:
#Load the BinaryLike ObservedOnly validation data from its pkl file
validationDf = pd.read_pickle("../Data/pkl/1M/BinaryLike/ObservedOnly/Validation.pkl")
#Display the frame as the cell output
validationDf

Unnamed: 0,UserId,MovieId,Like
0,426,2667,1
1,2492,376,1
2,4214,2352,1
3,50,1821,1
4,1751,258,1
...,...,...,...
125107,6943,922,1
125108,8234,416,1
125109,9426,19,1
125110,5977,1441,1


In [12]:
#Load the BinaryLike ObservedOnly test data from its pkl file
testDf = pd.read_pickle("../Data/pkl/1M/BinaryLike/ObservedOnly/Test.pkl")
#Display the frame as the cell output
testDf

Unnamed: 0,UserId,MovieId,Like
0,4253,34,1
1,5162,660,1
2,5500,401,1
3,1463,770,1
4,9813,882,0
...,...,...,...
125107,938,106,0
125108,1922,3903,0
125109,5746,2848,0
125110,7322,106,0


In [13]:
#Build totalTestDf by stacking the validation rows followed by the test rows
#ignore_index=True gives the result a fresh 0..n-1 index
totalTestDf = pd.concat([validationDf, testDf], ignore_index = True)
totalTestDf

Unnamed: 0,UserId,MovieId,Like
0,426,2667,1
1,2492,376,1
2,4214,2352,1
3,50,1821,1
4,1751,258,1
...,...,...,...
250219,938,106,0
250220,1922,3903,0
250221,5746,2848,0
250222,7322,106,0


In [14]:
#Representing a user in the training data matters more than representing them in the test data,
#so every user must end up with more training rows than totalTest rows.
#For each user id in range(userSize) that has fewer training rows than totalTest rows,
#the first rows of that user in totalTestDf are selected for transfer: enough of them that the
#training count becomes ((training + test) // 2) + 1.
#Ids outside 0..userSize-1 are not examined.
#The selected rows are copied into rowList (appended to the training data in a later cell)
#and removed from totalTestDf, which is then re-indexed and displayed as a check.

#Count rows per user once instead of masking both full frames on every iteration
trainingCounts = trainingDf['UserId'].value_counts()
testCounts = totalTestDf['UserId'].value_counts()

removeIndex = []
for i in range(userSize):
    elementTrainingSize = int(trainingCounts.get(i, 0))
    elementTestSize = int(testCounts.get(i, 0))

    if elementTrainingSize < elementTestSize:
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        #Index labels of this user's first transferItemSize rows in totalTestDf
        userMask = (totalTestDf['UserId'] == i).values
        transferItemIndex = totalTestDf.index[userMask][:transferItemSize].tolist()
        removeIndex.extend(transferItemIndex)

#removeIndex holds index labels, so select with .loc (label based) rather than .iloc (position based)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Like']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Like
0,426,2667,1
1,2492,376,1
2,4214,2352,1
3,50,1821,1
4,1751,258,1
...,...,...,...
249986,938,106,0
249987,1922,3903,0
249988,5746,2848,0
249989,7322,106,0


In [15]:
#Number of rows collected in rowList for transfer to the training data
len(rowList)

233

In [16]:
#Append the rows collected in rowList (taken from totalTestDf for under-represented users) to the training dataframe
trainingDf = pd.concat([trainingDf, pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Like'])], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,5874,4300,1
1,3233,1606,1
2,6434,8427,1
3,3,716,1
4,250,1166,1
...,...,...,...
750897,9740,410,1
750898,9748,862,1
750899,9748,1125,1
750900,9826,10678,1


In [17]:
#Number of distinct users in the training data; it equals userSize when every user has at least one row
len(trainingDf['UserId'].unique())

10073

In [18]:
#Representing a movie in the training data matters more than representing it in the test data,
#so every movie must end up with more training rows than totalTest rows.
#For each movie id in range(movieSize) that has fewer training rows than totalTest rows,
#the first rows of that movie in totalTestDf are selected for transfer: enough of them that the
#training count becomes ((training + test) // 2) + 1.
#Ids outside 0..movieSize-1 are not examined.
#The selected rows are copied into rowList (appended to the training data in a later cell)
#and removed from totalTestDf, which is then re-indexed and displayed as a check.

#Count rows per movie once instead of masking both full frames on every iteration
trainingCounts = trainingDf['MovieId'].value_counts()
testCounts = totalTestDf['MovieId'].value_counts()

removeIndex = []
for i in range(movieSize):
    elementTrainingSize = int(trainingCounts.get(i, 0))
    elementTestSize = int(testCounts.get(i, 0))

    if elementTrainingSize < elementTestSize:
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        #Index labels of this movie's first transferItemSize rows in totalTestDf
        movieMask = (totalTestDf['MovieId'] == i).values
        transferItemIndex = totalTestDf.index[movieMask][:transferItemSize].tolist()
        removeIndex.extend(transferItemIndex)

#removeIndex holds index labels, so select with .loc (label based) rather than .iloc (position based)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Like']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Like
0,426,2667,1
1,2492,376,1
2,4214,2352,1
3,50,1821,1
4,1751,258,1
...,...,...,...
247334,938,106,0
247335,1922,3903,0
247336,5746,2848,0
247337,7322,106,0


In [19]:
#Number of rows collected in rowList for transfer to the training data
len(rowList)

2652

In [20]:
#Append the rows collected in rowList (taken from totalTestDf for under-represented movies) to the training dataframe
trainingDf = pd.concat([trainingDf, pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Like'])], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,5874,4300,1
1,3233,1606,1
2,6434,8427,1
3,3,716,1
4,250,1166,1
...,...,...,...
753549,10026,22013,0
753550,10037,22016,1
753551,10037,22019,1
753552,10059,22030,0


In [21]:
#Shuffle trainingDf so the transferred rows are not clustered at the end of the frame
#A fixed random_state makes the shuffle, and therefore the saved pkl file, reproducible between runs
trainingDf = trainingDf.sample(frac=1, random_state=42).reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,30,882,1
1,2836,119,1
2,6380,1230,0
3,9569,1646,1
4,5301,797,1
...,...,...,...
753549,9027,1666,0
753550,1694,88,1
753551,8853,42,1
753552,1756,2598,1


In [22]:
#Number of distinct movies in the training data; it equals movieSize when every movie has at least one row
len(trainingDf['MovieId'].unique())

22033

In [23]:
#Split totalTestDf into two halves: validation data and test data
#A fixed random_state makes the split, and therefore the saved pkl files, reproducible between runs
validationDf, testDf = train_test_split(totalTestDf, test_size=0.5, random_state=42)
#Give the validation frame a fresh 0..n-1 index
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Like
0,2570,197,1
1,9495,2601,1
2,4747,658,1
3,8572,202,0
4,5334,1178,1
...,...,...,...
123664,9721,1914,1
123665,4293,1472,1
123666,7198,277,1
123667,2699,6436,1


In [24]:
#Give the test frame a fresh 0..n-1 index and display it as a check
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Like
0,8193,163,0
1,2328,2593,1
2,457,1796,0
3,6870,1818,1
4,600,149,1
...,...,...,...
123665,1062,105,1
123666,3176,2296,1
123667,7009,376,1
123668,4864,4898,1


In [25]:
#Save the reorganized ObservedOnly training, validation and test frames as pkl files for future use
trainingDf.to_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Training.pkl")
validationDf.to_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Validation.pkl")
testDf.to_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Test.pkl")

In [26]:
#Load the Qualified BinaryLike ObservedOnly training data back from its pkl file
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Training.pkl")
#Display the frame as the cell output
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,30,882,1
1,2836,119,1
2,6380,1230,0
3,9569,1646,1
4,5301,797,1
...,...,...,...
753549,9027,1666,0
753550,1694,88,1
753551,8853,42,1
753552,1756,2598,1


In [27]:
#Load the Qualified BinaryLike ObservedOnly validation data back from its pkl file
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Validation.pkl")
#Display the frame as the cell output
validationDf

Unnamed: 0,UserId,MovieId,Like
0,2570,197,1
1,9495,2601,1
2,4747,658,1
3,8572,202,0
4,5334,1178,1
...,...,...,...
123664,9721,1914,1
123665,4293,1472,1
123666,7198,277,1
123667,2699,6436,1


In [28]:
#Load the Qualified BinaryLike ObservedOnly test data back from its pkl file
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/ObservedOnly/Test.pkl")
#Display the frame as the cell output
testDf

Unnamed: 0,UserId,MovieId,Like
0,8193,163,0
1,2328,2593,1
2,457,1796,0
3,6870,1818,1
4,600,149,1
...,...,...,...
123665,1062,105,1
123666,3176,2296,1
123667,7009,376,1
123668,4864,4898,1


In [29]:
#Build df by stacking the training, validation and test rows, in that order
#ignore_index=True gives the result a fresh 0..n-1 index
df = pd.concat([trainingDf, validationDf, testDf], ignore_index = True)
df

Unnamed: 0,UserId,MovieId,Like
0,30,882,1
1,2836,119,1
2,6380,1230,0
3,9569,1646,1
4,5301,797,1
...,...,...,...
1000888,1062,105,1
1000889,3176,2296,1
1000890,7009,376,1
1000891,4864,4898,1


In [30]:
#True if at least one cell of df is missing (NaN/None)
df.isna().to_numpy().any()

False

In [31]:
#True if any (UserId, MovieId) pair occurs more than once in df
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [32]:
#Count the distinct user ids present in the Qualified data
len(df['UserId'].unique())

10073

In [33]:
#Count the distinct movie ids present in the Qualified data
len(df['MovieId'].unique())

22033

In [34]:
#Load the BinaryLike UnobservedSample training data from its pkl file
trainingDf = pd.read_pickle("../Data/pkl/1M/BinaryLike/UnobservedSample/Training.pkl")
#Display the frame as the cell output
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,8085,441,1
1,7283,274,1
2,9134,2339,1
3,8584,689,1
4,3967,62,1
...,...,...,...
758170,5350,142,1
758171,8242,6928,0
758172,4026,3310,0
758173,2071,2642,1


In [35]:
#Load the BinaryLike UnobservedSample validation data from its pkl file
validationDf = pd.read_pickle("../Data/pkl/1M/BinaryLike/UnobservedSample/Validation.pkl")
#Display the frame as the cell output
validationDf

Unnamed: 0,UserId,MovieId,Like
0,8223,750,1
1,4208,167,0
2,8140,575,1
3,9649,1182,1
4,4434,1534,1
...,...,...,...
126358,1328,4126,1
126359,6387,132,1
126360,5987,1489,1
126361,6205,2649,1


In [36]:
#Load the BinaryLike UnobservedSample test data from its pkl file
testDf = pd.read_pickle("../Data/pkl/1M/BinaryLike/UnobservedSample/Test.pkl")
#Display the frame as the cell output
testDf

Unnamed: 0,UserId,MovieId,Like
0,6835,750,1
1,9439,3137,0
2,5695,1336,1
3,9308,548,1
4,8489,1308,1
...,...,...,...
126358,3069,882,0
126359,8357,978,1
126360,6335,786,1
126361,7604,5,0


In [37]:
#Build totalTestDf by stacking the validation rows followed by the test rows
#ignore_index=True gives the result a fresh 0..n-1 index
totalTestDf = pd.concat([validationDf, testDf], ignore_index = True)
totalTestDf

Unnamed: 0,UserId,MovieId,Like
0,8223,750,1
1,4208,167,0
2,8140,575,1
3,9649,1182,1
4,4434,1534,1
...,...,...,...
252721,3069,882,0
252722,8357,978,1
252723,6335,786,1
252724,7604,5,0


In [38]:
#Representing a user in the training data matters more than representing them in the test data,
#so every user must end up with more training rows than totalTest rows.
#For each user id in range(userSize) that has fewer training rows than totalTest rows,
#the first rows of that user in totalTestDf are selected for transfer: enough of them that the
#training count becomes ((training + test) // 2) + 1.
#Ids outside 0..userSize-1 are not examined.
#The selected rows are copied into rowList (appended to the training data in a later cell)
#and removed from totalTestDf, which is then re-indexed and displayed as a check.

#Count rows per user once instead of masking both full frames on every iteration
trainingCounts = trainingDf['UserId'].value_counts()
testCounts = totalTestDf['UserId'].value_counts()

removeIndex = []
for i in range(userSize):
    elementTrainingSize = int(trainingCounts.get(i, 0))
    elementTestSize = int(testCounts.get(i, 0))

    if elementTrainingSize < elementTestSize:
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        #Index labels of this user's first transferItemSize rows in totalTestDf
        userMask = (totalTestDf['UserId'] == i).values
        transferItemIndex = totalTestDf.index[userMask][:transferItemSize].tolist()
        removeIndex.extend(transferItemIndex)

#removeIndex holds index labels, so select with .loc (label based) rather than .iloc (position based)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Like']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Like
0,8223,750,1
1,4208,167,0
2,8140,575,1
3,9649,1182,1
4,4434,1534,1
...,...,...,...
252529,3069,882,0
252530,8357,978,1
252531,6335,786,1
252532,7604,5,0


In [39]:
#Number of rows collected in rowList for transfer to the training data
len(rowList)

192

In [40]:
#Append the rows collected in rowList (taken from totalTestDf for under-represented users) to the training dataframe
trainingDf = pd.concat([trainingDf, pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Like'])], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,8085,441,1
1,7283,274,1
2,9134,2339,1
3,8584,689,1
4,3967,62,1
...,...,...,...
758362,9728,110,1
758363,9728,154,1
758364,10008,110,1
758365,10008,15,1


In [41]:
#Number of distinct users in the training data; it equals userSize when every user has at least one row
len(trainingDf['UserId'].unique())

10073

In [42]:
#Representing a movie in the training data matters more than representing it in the test data,
#so every movie must end up with more training rows than totalTest rows.
#For each movie id in range(movieSize) that has fewer training rows than totalTest rows,
#the first rows of that movie in totalTestDf are selected for transfer: enough of them that the
#training count becomes ((training + test) // 2) + 1.
#Ids outside 0..movieSize-1 are not examined.
#The selected rows are copied into rowList (appended to the training data in a later cell)
#and removed from totalTestDf, which is then re-indexed and displayed as a check.

#Count rows per movie once instead of masking both full frames on every iteration
trainingCounts = trainingDf['MovieId'].value_counts()
testCounts = totalTestDf['MovieId'].value_counts()

removeIndex = []
for i in range(movieSize):
    elementTrainingSize = int(trainingCounts.get(i, 0))
    elementTestSize = int(testCounts.get(i, 0))

    if elementTrainingSize < elementTestSize:
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        #Index labels of this movie's first transferItemSize rows in totalTestDf
        movieMask = (totalTestDf['MovieId'] == i).values
        transferItemIndex = totalTestDf.index[movieMask][:transferItemSize].tolist()
        removeIndex.extend(transferItemIndex)

#removeIndex holds index labels, so select with .loc (label based) rather than .iloc (position based)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Like']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Like
0,8223,750,1
1,4208,167,0
2,8140,575,1
3,9649,1182,1
4,4434,1534,1
...,...,...,...
250143,3069,882,0
250144,8357,978,1
250145,6335,786,1
250146,7604,5,0


In [43]:
#Number of rows collected in rowList for transfer to the training data
len(rowList)

2386

In [44]:
#Append the rows collected in rowList (taken from totalTestDf for under-represented movies) to the training dataframe
trainingDf = pd.concat([trainingDf, pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Like'])], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,8085,441,1
1,7283,274,1
2,9134,2339,1
3,8584,689,1
4,3967,62,1
...,...,...,...
760748,9999,22008,1
760749,10026,22013,0
760750,10037,22017,1
760751,10037,22018,1


In [45]:
#Shuffle trainingDf so the transferred rows are not clustered at the end of the frame
#A fixed random_state makes the shuffle, and therefore the saved pkl file, reproducible between runs
trainingDf = trainingDf.sample(frac=1, random_state=42).reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,6741,2320,1
1,129,777,0
2,5175,8304,1
3,1133,1334,1
4,8109,2270,1
...,...,...,...
760748,2386,168,1
760749,696,1784,1
760750,9237,105,1
760751,7232,204,1


In [46]:
#Number of distinct movies in the training data; it equals movieSize when every movie has at least one row
len(trainingDf['MovieId'].unique())

22033

In [47]:
#Split totalTestDf into two halves: validation data and test data
#A fixed random_state makes the split, and therefore the saved pkl files, reproducible between runs
validationDf, testDf = train_test_split(totalTestDf, test_size=0.5, random_state=42)
#Give the validation frame a fresh 0..n-1 index
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Like
0,8039,20761,0
1,1504,121,1
2,6126,5947,1
3,3371,106,1
4,7280,578,1
...,...,...,...
125069,313,885,1
125070,4108,215,1
125071,5532,863,1
125072,4202,4403,1


In [48]:
#Give the test frame a fresh 0..n-1 index and display it as a check
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Like
0,3969,156,1
1,6121,18850,1
2,7735,214,1
3,5553,1387,1
4,80,2775,0
...,...,...,...
125069,3249,15383,0
125070,5521,123,1
125071,6506,10906,1
125072,9290,2456,1


In [49]:
#Save the reorganized UnobservedSample training, validation and test frames as pkl files for future use
trainingDf.to_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Training.pkl")
validationDf.to_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Validation.pkl")
testDf.to_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Test.pkl")

In [50]:
#Load the Qualified BinaryLike UnobservedSample training data back from its pkl file
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Training.pkl")
#Display the frame as the cell output
trainingDf

Unnamed: 0,UserId,MovieId,Like
0,6741,2320,1
1,129,777,0
2,5175,8304,1
3,1133,1334,1
4,8109,2270,1
...,...,...,...
760748,2386,168,1
760749,696,1784,1
760750,9237,105,1
760751,7232,204,1


In [51]:
#Load the Qualified BinaryLike UnobservedSample validation data back from its pkl file
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Validation.pkl")
#Display the frame as the cell output
validationDf

Unnamed: 0,UserId,MovieId,Like
0,8039,20761,0
1,1504,121,1
2,6126,5947,1
3,3371,106,1
4,7280,578,1
...,...,...,...
125069,313,885,1
125070,4108,215,1
125071,5532,863,1
125072,4202,4403,1


In [52]:
#Load the Qualified BinaryLike UnobservedSample test data back from its pkl file
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/BinaryLike/UnobservedSample/Test.pkl")
#Display the frame as the cell output
testDf

Unnamed: 0,UserId,MovieId,Like
0,3969,156,1
1,6121,18850,1
2,7735,214,1
3,5553,1387,1
4,80,2775,0
...,...,...,...
125069,3249,15383,0
125070,5521,123,1
125071,6506,10906,1
125072,9290,2456,1


In [53]:
#Build df by stacking the training, validation and test rows, in that order
#ignore_index=True gives the result a fresh 0..n-1 index
df = pd.concat([trainingDf, validationDf, testDf], ignore_index = True)
df

Unnamed: 0,UserId,MovieId,Like
0,6741,2320,1
1,129,777,0
2,5175,8304,1
3,1133,1334,1
4,8109,2270,1
...,...,...,...
1010896,3249,15383,0
1010897,5521,123,1
1010898,6506,10906,1
1010899,9290,2456,1


In [54]:
#True if at least one cell of df is missing (NaN/None)
df.isna().to_numpy().any()

False

In [55]:
#True if any (UserId, MovieId) pair occurs more than once in df
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [56]:
#Count the distinct user ids present in the Qualified data
len(df['UserId'].unique())

10073

In [57]:
#Count the distinct movie ids present in the Qualified data
len(df['MovieId'].unique())

22033