# Quality of CategoricalLike Datasets

In this process, the quality of both CategoricalLike datasets (CategorizedOnly and NotCategorizedSample) is checked and, where necessary, improved.

Null values and duplicates will be checked

User and Movie size will be checked

The representativeness of the training dataset will be checked and improved.

The organized data will be saved as pkl files for future use.

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#Show which numpy and pandas releases this notebook runs against
print('numpy Version: ', np.__version__, sep='')
print('pandas Version: ', pd.__version__, sep='')

numpy Version: 1.16.5
pandas Version: 0.25.1


In [3]:
#Load the raw rating table from its pickle file and display it
#NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source
ratingDf = pd.read_pickle("../Data/pkl/1M/RawData/Rating.pkl")
ratingDf

Unnamed: 0,UserId,MovieId,Rating
0,0,0,3.5
1,0,1,3.5
2,0,2,1.5
3,0,3,4.5
4,0,4,4.5
...,...,...,...
1000888,10072,12665,3.0
1000889,10072,6417,3.0
1000890,10072,9689,3.0
1000891,10072,22032,3.0


In [4]:
#Check whether any cell of ratingDf holds a missing (NaN) value
ratingDf.isna().values.any()

False

In [5]:
#Check whether any (UserId, MovieId) pair occurs more than once in ratingDf
ratingDf.duplicated(subset=['UserId', 'MovieId']).any()

False

In [6]:
#Count the distinct users present in the raw rating data
userSize = len(ratingDf['UserId'].unique())
userSize

10073

In [7]:
#Count the distinct movies present in the raw rating data
movieSize = len(ratingDf['MovieId'].unique())
movieSize

22033

In [8]:
#Load the raw movie table from its pickle file and display it
movie = pd.read_pickle("../Data/pkl/1M/RawData/Movie.pkl")
movie

Unnamed: 0,MovieId,Title
0,0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
1,1,Kalifornia (1993)
2,2,Weekend at Bernie's (1989)
3,3,Better Off Dead... (1985)
4,4,Waiting for Guffman (1996)
...,...,...
22028,22028,London Paris New York (2012)
22029,22029,Wild Zero (2000)
22030,22030,Mr. Accident (2000)
22031,22031,Max Steel (2016)


In [9]:
#Cross-check movieSize: the movie table's row count should match it
len(movie)

22033

In [10]:
#Load the CategoricalLike CategorizedOnly training data from its pickle file and display it
trainingDf = pd.read_pickle("../Data/pkl/1M/CategoricalLike/CategorizedOnly/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,4527,690,1
1,1879,718,2
2,4460,773,2
3,4273,1732,2
4,213,1362,1
...,...,...,...
750664,8454,568,0
750665,927,221,1
750666,9406,1620,2
750667,1470,569,2


In [11]:
#Load the CategoricalLike CategorizedOnly validation data from its pickle file and display it
validationDf = pd.read_pickle("../Data/pkl/1M/CategoricalLike/CategorizedOnly/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Category
0,4546,668,2
1,8099,934,1
2,5750,287,2
3,378,601,2
4,9461,857,0
...,...,...,...
125107,5572,1323,1
125108,4491,2441,2
125109,490,5365,2
125110,772,121,2


In [12]:
#Load the CategoricalLike CategorizedOnly test data from its pickle file and display it
testDf = pd.read_pickle("../Data/pkl/1M/CategoricalLike/CategorizedOnly/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Category
0,7845,5355,1
1,535,1334,2
2,1896,126,2
3,7824,1327,2
4,5175,3729,1
...,...,...,...
125107,1190,1583,1
125108,6733,57,2
125109,3335,1681,1
125110,1533,1740,2


In [13]:
#Stack the validation rows followed by the test rows into one held-out dataframe (totalTestDf)
#ignore_index gives the combined frame a fresh 0..n-1 index
totalTestDf = pd.concat([validationDf, testDf], ignore_index = True)
totalTestDf

Unnamed: 0,UserId,MovieId,Category
0,4546,668,2
1,8099,934,1
2,5750,287,2
3,378,601,2
4,9461,857,0
...,...,...,...
250219,1190,1583,1
250220,6733,57,2
250221,3335,1681,1
250222,1533,1740,2


In [14]:
#Users should be better represented in the training set than in the held-out set.
#For every user with fewer training rows than held-out rows, enough of that user's
#first held-out rows are selected so that the training count ends up strictly larger.
#This cell only collects those rows in rowList and drops them from totalTestDf;
#they are appended to trainingDf in a later cell. totalTestDf is displayed as a check.
#Per-user counts and row labels are computed once up front instead of re-scanning
#both dataframes with a boolean mask for every user id.
trainingCounts = trainingDf['UserId'].value_counts().to_dict()
testRowLabels = totalTestDf.groupby('UserId').groups
removeIndex = []
for i in range(userSize):
    userTestLabels = testRowLabels.get(i)
    if userTestLabels is None:
        #No held-out rows for this user, so there is nothing to move
        continue
    elementTrainingSize = trainingCounts.get(i, 0)
    elementTestSize = len(userTestLabels)
    if elementTrainingSize < elementTestSize:
        #Smallest number of rows that leaves training with more rows than held-out
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        removeIndex.extend(userTestLabels[:transferItemSize].tolist())
#removeIndex holds index labels, so rows are selected by label (.loc), not by position (.iloc)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Category']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Category
0,4546,668,2
1,8099,934,1
2,5750,287,2
3,378,601,2
4,9461,857,0
...,...,...,...
249949,1190,1583,1
249950,6733,57,2
249951,3335,1681,1
249952,1533,1740,2


In [15]:
#Number of held-out rows collected in rowList for transfer to the training data (user pass)
len(rowList)

270

In [16]:
#Append the rows collected in rowList (under-represented users) to the training dataframe
transferredDf = pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Category'])
trainingDf = pd.concat([trainingDf, transferredDf], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,4527,690,1
1,1879,718,2
2,4460,773,2
3,4273,1732,2
4,213,1362,1
...,...,...,...
750934,9828,215,1
750935,9828,110,2
750936,9884,1575,1
750937,9927,101,2


In [17]:
#Count distinct users in the training data; it should equal userSize if every user has at least one row
len(trainingDf['UserId'].unique())

10073

In [18]:
#Movies should be better represented in the training set than in the held-out set.
#For every movie with fewer training rows than held-out rows, enough of that movie's
#first held-out rows are selected so that the training count ends up strictly larger.
#This cell only collects those rows in rowList and drops them from totalTestDf;
#they are appended to trainingDf in a later cell. totalTestDf is displayed as a check.
#Per-movie counts and row labels are computed once up front instead of re-scanning
#both dataframes with a boolean mask for every movie id.
trainingCounts = trainingDf['MovieId'].value_counts().to_dict()
testRowLabels = totalTestDf.groupby('MovieId').groups
removeIndex = []
for i in range(movieSize):
    movieTestLabels = testRowLabels.get(i)
    if movieTestLabels is None:
        #No held-out rows for this movie, so there is nothing to move
        continue
    elementTrainingSize = trainingCounts.get(i, 0)
    elementTestSize = len(movieTestLabels)
    if elementTrainingSize < elementTestSize:
        #Smallest number of rows that leaves training with more rows than held-out
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        removeIndex.extend(movieTestLabels[:transferItemSize].tolist())
#removeIndex holds index labels, so rows are selected by label (.loc), not by position (.iloc)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Category']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Category
0,4546,668,2
1,8099,934,1
2,5750,287,2
3,378,601,2
4,9461,857,0
...,...,...,...
247255,1190,1583,1
247256,6733,57,2
247257,3335,1681,1
247258,1533,1740,2


In [19]:
#Number of held-out rows collected in rowList for transfer to the training data (movie pass)
len(rowList)

2694

In [20]:
#Append the rows collected in rowList (under-represented movies) to the training dataframe
transferredDf = pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Category'])
trainingDf = pd.concat([trainingDf, transferredDf], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,4527,690,1
1,1879,718,2
2,4460,773,2
3,4273,1732,2
4,213,1362,1
...,...,...,...
753628,10037,22014,1
753629,10037,22015,0
753630,10037,22016,2
753631,10037,22020,1


In [21]:
#Shuffle the rows of trainingDf and renumber the index from 0.
#A fixed random_state makes the shuffle (and the pickle saved later) reproducible across reruns.
trainingDf = trainingDf.sample(frac=1, random_state=42).reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,9999,690,1
1,4532,1007,2
2,3969,726,2
3,1271,9157,1
4,7932,1201,1
...,...,...,...
753628,9556,1403,1
753629,4795,17453,1
753630,2116,434,1
753631,1910,4512,1


In [22]:
#Count distinct movies in the training data; it should equal movieSize if every movie has at least one row
len(trainingDf['MovieId'].unique())

22033

In [23]:
#Split totalTestDf at random into two halves: validation data and test data.
#A fixed random_state makes the split reproducible across reruns.
validationDf, testDf = train_test_split(totalTestDf, test_size=0.5, random_state=42)
#Renumber the validation rows from 0 (rebinding instead of mutating the split result in place)
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Category
0,238,315,1
1,2311,933,1
2,9877,866,2
3,234,424,2
4,8066,1669,2
...,...,...,...
123625,2038,280,1
123626,8717,154,2
123627,7348,1068,1
123628,1932,12061,1


In [24]:
#Renumber the test rows from 0 and display the resulting dataframe
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Category
0,1470,861,2
1,2736,789,2
2,8041,1101,2
3,3317,336,1
4,5490,2359,1
...,...,...,...
123625,3246,983,1
123626,8886,2413,2
123627,1487,776,0
123628,7680,97,1


In [25]:
#Save the organized training, validation and test dataframes as pkl files for future use
for splitDf, picklePath in (
    (trainingDf, "../Data/pkl/1M/Qualified/CategoricalLike/CategorizedOnly/Training.pkl"),
    (validationDf, "../Data/pkl/1M/Qualified/CategoricalLike/CategorizedOnly/Validation.pkl"),
    (testDf, "../Data/pkl/1M/Qualified/CategoricalLike/CategorizedOnly/Test.pkl"),
):
    splitDf.to_pickle(picklePath)

In [26]:
#Reload the saved Qualified CategoricalLike CategorizedOnly training data from its pickle file and display it
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/CategoricalLike/CategorizedOnly/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,9999,690,1
1,4532,1007,2
2,3969,726,2
3,1271,9157,1
4,7932,1201,1
...,...,...,...
753628,9556,1403,1
753629,4795,17453,1
753630,2116,434,1
753631,1910,4512,1


In [27]:
#Reload the saved Qualified CategoricalLike CategorizedOnly validation data from its pickle file and display it
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/CategoricalLike/CategorizedOnly/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Category
0,238,315,1
1,2311,933,1
2,9877,866,2
3,234,424,2
4,8066,1669,2
...,...,...,...
123625,2038,280,1
123626,8717,154,2
123627,7348,1068,1
123628,1932,12061,1


In [28]:
#Reload the saved Qualified CategoricalLike CategorizedOnly test data from its pickle file and display it
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/CategoricalLike/CategorizedOnly/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Category
0,1470,861,2
1,2736,789,2
2,8041,1101,2
3,3317,336,1
4,5490,2359,1
...,...,...,...
123625,3246,983,1
123626,8886,2413,2
123627,1487,776,0
123628,7680,97,1


In [29]:
#Stack training, validation and test rows into one dataframe (df) with a fresh 0..n-1 index
#df is used by the following cells to re-run the quality checks on the whole qualified dataset
df = pd.concat([trainingDf, validationDf, testDf], ignore_index = True)
df

Unnamed: 0,UserId,MovieId,Category
0,9999,690,1
1,4532,1007,2
2,3969,726,2
3,1271,9157,1
4,7932,1201,1
...,...,...,...
1000888,3246,983,1
1000889,8886,2413,2
1000890,1487,776,0
1000891,7680,97,1


In [30]:
#Check whether any cell of df holds a missing (NaN) value
df.isna().values.any()

False

In [31]:
#Check whether any (UserId, MovieId) pair occurs more than once in df
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [32]:
#Count the distinct users in the qualified data
len(df['UserId'].unique())

10073

In [33]:
#Count the distinct movies in the qualified data
len(df['MovieId'].unique())

22033

In [34]:
#Load the CategoricalLike NotCategorizedSample training data from its pickle file and display it
trainingDf = pd.read_pickle("../Data/pkl/1M/CategoricalLike/NotCategorizedSample/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,7953,1366,2
1,234,416,2
2,4431,133,2
3,2579,164,1
4,8945,1998,2
...,...,...,...
758170,6298,3160,2
758171,55,1124,2
758172,9790,4918,2
758173,4593,666,1


In [35]:
#Load the CategoricalLike NotCategorizedSample validation data from its pickle file and display it
validationDf = pd.read_pickle("../Data/pkl/1M/CategoricalLike/NotCategorizedSample/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Category
0,485,5405,2
1,1971,1043,1
2,5322,7248,1
3,4147,1851,1
4,3755,1457,1
...,...,...,...
126358,7636,2734,1
126359,6254,1942,1
126360,7375,400,2
126361,7377,1230,1


In [36]:
#Load the CategoricalLike NotCategorizedSample test data from its pickle file and display it
testDf = pd.read_pickle("../Data/pkl/1M/CategoricalLike/NotCategorizedSample/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Category
0,4072,1316,1
1,2798,5033,1
2,2584,767,2
3,9091,2994,1
4,10056,15014,2
...,...,...,...
126358,767,833,1
126359,4311,322,1
126360,7262,19696,2
126361,146,324,1


In [37]:
#Stack the validation rows followed by the test rows into one held-out dataframe (totalTestDf)
#ignore_index gives the combined frame a fresh 0..n-1 index
totalTestDf = pd.concat([validationDf, testDf], ignore_index = True)
totalTestDf

Unnamed: 0,UserId,MovieId,Category
0,485,5405,2
1,1971,1043,1
2,5322,7248,1
3,4147,1851,1
4,3755,1457,1
...,...,...,...
252721,767,833,1
252722,4311,322,1
252723,7262,19696,2
252724,146,324,1


In [38]:
#Users should be better represented in the training set than in the held-out set.
#For every user with fewer training rows than held-out rows, enough of that user's
#first held-out rows are selected so that the training count ends up strictly larger.
#This cell only collects those rows in rowList and drops them from totalTestDf;
#they are appended to trainingDf in a later cell. totalTestDf is displayed as a check.
#Per-user counts and row labels are computed once up front instead of re-scanning
#both dataframes with a boolean mask for every user id.
trainingCounts = trainingDf['UserId'].value_counts().to_dict()
testRowLabels = totalTestDf.groupby('UserId').groups
removeIndex = []
for i in range(userSize):
    userTestLabels = testRowLabels.get(i)
    if userTestLabels is None:
        #No held-out rows for this user, so there is nothing to move
        continue
    elementTrainingSize = trainingCounts.get(i, 0)
    elementTestSize = len(userTestLabels)
    if elementTrainingSize < elementTestSize:
        #Smallest number of rows that leaves training with more rows than held-out
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        removeIndex.extend(userTestLabels[:transferItemSize].tolist())
#removeIndex holds index labels, so rows are selected by label (.loc), not by position (.iloc)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Category']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Category
0,485,5405,2
1,1971,1043,1
2,5322,7248,1
3,4147,1851,1
4,3755,1457,1
...,...,...,...
252499,767,833,1
252500,4311,322,1
252501,7262,19696,2
252502,146,324,1


In [39]:
#Number of held-out rows collected in rowList for transfer to the training data (user pass)
len(rowList)

222

In [40]:
#Append the rows collected in rowList (under-represented users) to the training dataframe
transferredDf = pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Category'])
trainingDf = pd.concat([trainingDf, transferredDf], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,7953,1366,2
1,234,416,2
2,4431,133,2
3,2579,164,1
4,8945,1998,2
...,...,...,...
758392,9679,161,2
758393,10065,186,1
758394,10065,42,2
758395,10065,148,2


In [41]:
#Count distinct users in the training data; it should equal userSize if every user has at least one row
len(trainingDf['UserId'].unique())

10073

In [42]:
#Movies should be better represented in the training set than in the held-out set.
#For every movie with fewer training rows than held-out rows, enough of that movie's
#first held-out rows are selected so that the training count ends up strictly larger.
#This cell only collects those rows in rowList and drops them from totalTestDf;
#they are appended to trainingDf in a later cell. totalTestDf is displayed as a check.
#Per-movie counts and row labels are computed once up front instead of re-scanning
#both dataframes with a boolean mask for every movie id.
trainingCounts = trainingDf['MovieId'].value_counts().to_dict()
testRowLabels = totalTestDf.groupby('MovieId').groups
removeIndex = []
for i in range(movieSize):
    movieTestLabels = testRowLabels.get(i)
    if movieTestLabels is None:
        #No held-out rows for this movie, so there is nothing to move
        continue
    elementTrainingSize = trainingCounts.get(i, 0)
    elementTestSize = len(movieTestLabels)
    if elementTrainingSize < elementTestSize:
        #Smallest number of rows that leaves training with more rows than held-out
        transferItemSize = ((elementTrainingSize + elementTestSize) // 2) + 1 - elementTrainingSize
        removeIndex.extend(movieTestLabels[:transferItemSize].tolist())
#removeIndex holds index labels, so rows are selected by label (.loc), not by position (.iloc)
rowList = totalTestDf.loc[removeIndex, ['UserId', 'MovieId', 'Category']].values.tolist()
totalTestDf.drop(removeIndex, inplace=True)
totalTestDf.reset_index(drop=True, inplace=True)
totalTestDf

Unnamed: 0,UserId,MovieId,Category
0,485,5405,2
1,1971,1043,1
2,5322,7248,1
3,4147,1851,1
4,3755,1457,1
...,...,...,...
250097,767,833,1
250098,4311,322,1
250099,7262,19696,2
250100,146,324,1


In [43]:
#Number of held-out rows collected in rowList for transfer to the training data (movie pass)
len(rowList)

2402

In [44]:
#Append the rows collected in rowList (under-represented movies) to the training dataframe
transferredDf = pd.DataFrame(rowList, columns=['UserId', 'MovieId', 'Category'])
trainingDf = pd.concat([trainingDf, transferredDf], ignore_index=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,7953,1366,2
1,234,416,2
2,4431,133,2
3,2579,164,1
4,8945,1998,2
...,...,...,...
760794,9999,22005,2
760795,9999,22006,1
760796,4017,22009,0
760797,10037,22020,1


In [45]:
#Shuffle the rows of trainingDf and renumber the index from 0.
#A fixed random_state makes the shuffle (and the pickle saved later) reproducible across reruns.
trainingDf = trainingDf.sample(frac=1, random_state=42).reset_index(drop=True)
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,2628,195,2
1,4474,565,2
2,5175,9476,2
3,9268,1308,2
4,9210,117,2
...,...,...,...
760794,3289,1326,2
760795,4730,309,1
760796,3821,15052,2
760797,6935,174,1


In [46]:
#Count distinct movies in the training data; it should equal movieSize if every movie has at least one row
len(trainingDf['MovieId'].unique())

22033

In [47]:
#Split totalTestDf at random into two halves: validation data and test data.
#A fixed random_state makes the split reproducible across reruns.
validationDf, testDf = train_test_split(totalTestDf, test_size=0.5, random_state=42)
#Renumber the validation rows from 0 (rebinding instead of mutating the split result in place)
validationDf = validationDf.reset_index(drop=True)
validationDf

Unnamed: 0,UserId,MovieId,Category
0,7160,1089,2
1,3734,466,2
2,8077,315,1
3,8888,5353,1
4,7588,382,1
...,...,...,...
125046,5175,2366,1
125047,4289,246,2
125048,9167,4305,1
125049,490,1128,1


In [48]:
#Renumber the test rows from 0 and display the resulting dataframe
testDf = testDf.reset_index(drop=True)
testDf

Unnamed: 0,UserId,MovieId,Category
0,2462,2321,1
1,6121,2859,1
2,6835,2574,0
3,4628,1167,1
4,8252,2558,1
...,...,...,...
125046,221,1027,1
125047,7845,75,1
125048,8340,1370,2
125049,10072,344,1


In [49]:
#Save the organized training, validation and test dataframes as pkl files for future use
for splitDf, picklePath in (
    (trainingDf, "../Data/pkl/1M/Qualified/CategoricalLike/NotCategorizedSample/Training.pkl"),
    (validationDf, "../Data/pkl/1M/Qualified/CategoricalLike/NotCategorizedSample/Validation.pkl"),
    (testDf, "../Data/pkl/1M/Qualified/CategoricalLike/NotCategorizedSample/Test.pkl"),
):
    splitDf.to_pickle(picklePath)

In [50]:
#Reload the saved Qualified CategoricalLike NotCategorizedSample training data from its pickle file and display it
trainingDf = pd.read_pickle("../Data/pkl/1M/Qualified/CategoricalLike/NotCategorizedSample/Training.pkl")
trainingDf

Unnamed: 0,UserId,MovieId,Category
0,2628,195,2
1,4474,565,2
2,5175,9476,2
3,9268,1308,2
4,9210,117,2
...,...,...,...
760794,3289,1326,2
760795,4730,309,1
760796,3821,15052,2
760797,6935,174,1


In [51]:
#Reload the saved Qualified CategoricalLike NotCategorizedSample validation data from its pickle file and display it
validationDf = pd.read_pickle("../Data/pkl/1M/Qualified/CategoricalLike/NotCategorizedSample/Validation.pkl")
validationDf

Unnamed: 0,UserId,MovieId,Category
0,7160,1089,2
1,3734,466,2
2,8077,315,1
3,8888,5353,1
4,7588,382,1
...,...,...,...
125046,5175,2366,1
125047,4289,246,2
125048,9167,4305,1
125049,490,1128,1


In [52]:
#Reload the saved Qualified CategoricalLike NotCategorizedSample test data from its pickle file and display it
testDf = pd.read_pickle("../Data/pkl/1M/Qualified/CategoricalLike/NotCategorizedSample/Test.pkl")
testDf

Unnamed: 0,UserId,MovieId,Category
0,2462,2321,1
1,6121,2859,1
2,6835,2574,0
3,4628,1167,1
4,8252,2558,1
...,...,...,...
125046,221,1027,1
125047,7845,75,1
125048,8340,1370,2
125049,10072,344,1


In [53]:
#Stack training, validation and test rows into one dataframe (df) with a fresh 0..n-1 index
#df is used by the following cells to re-run the quality checks on the whole qualified dataset
df = pd.concat([trainingDf, validationDf, testDf], ignore_index = True)
df

Unnamed: 0,UserId,MovieId,Category
0,2628,195,2
1,4474,565,2
2,5175,9476,2
3,9268,1308,2
4,9210,117,2
...,...,...,...
1010896,221,1027,1
1010897,7845,75,1
1010898,8340,1370,2
1010899,10072,344,1


In [54]:
#Check whether any cell of df holds a missing (NaN) value
df.isna().values.any()

False

In [55]:
#Check whether any (UserId, MovieId) pair occurs more than once in df
df.duplicated(subset=['UserId', 'MovieId']).any()

False

In [56]:
#Count the distinct users in the qualified data
len(df['UserId'].unique())

10073

In [57]:
#Count the distinct movies in the qualified data
len(df['MovieId'].unique())

22033