# Data Compilation

## 1. Generation of the rating dataset

Write Python function rating_compiler to compile from the training_set.tar, the  17770  files and store the result into one data structure named  D

In [0]:
import os

# Looking throught every file in training_set folder 
# and storing file names into a list
file_names = []
for root, dirs, files in os.walk("training_set"):
    for filename in files:
        file_names.append("training_set\\" + filename)

In [0]:
len(file_names)

17770

In [0]:
import pandas as pd
import numpy as np

def add_movie_reviews(file_name):
    """
    Return a dataframe with containing CustumerIDs 
    and their ratings for the file's MovieID.
    
    Parameters: 
    file_name (string): File name from the dataset mv_film_number.

    Returns: 
    df (pd.DataFrame): Dataframe with containing CustumerIDs 
                        and their ratings for the file's MovieID.
    """
    
    f = open(file_name, "r")

    # Read MovieID
    MovieID = int(f.readline().rstrip()[:-1])
    
    # Read Movie's CustumerIDs and their Ratings
    data = []
    for line in f:
        rating = line.split(",")
        rating.insert(0,MovieID)
        rating[1] = int(rating[1])  # Cast CustomerID
        rating[2] = np.int8(rating[2])  # Cast Rating to save memory
        data.append(rating[:-1])  # Skip Date since we don't need it

    # Build the dataframe
    df = pd.DataFrame(data, columns = ['MovieID', 'CustomerID', 'Rating'])
    df.astype({'Rating': 'int8'})
    
    return df

In [0]:
add_movie_reviews(file_names[0]).head()

Unnamed: 0,MovieID,CustomerID,Rating
0,1,1488844,3
1,1,822109,5
2,1,885013,4
3,1,30878,4
4,1,823519,3


In [0]:
def rating_compiler(file_names):
    """
    Compiling all 17770 files into a .csv file.
    
    Parameters: 
    file_names (lsit): List of all file names from the dataset mv_film_number.

    """
    
    # Creating the result dataframe
    D = pd.DataFrame(columns = ['MovieID', 'CustomerID', 'Rating']) 
    
    # Fill the D dataframe
    for movie_file in file_names:
        print("Reading : "+ movie_file[13:], end="\r")  # To keep an eye on the process
        D = D.append(add_movie_reviews(movie_file))
        
    # Saving the D dataframe into a csv file
    D.to_csv("training_set_csv\\D.csv", index=False)  # Without an index to save memory

In [0]:
rating_compiler(file_names)

# This operation was done in another notebook, and it took more that 2 hours 

In [0]:
!dir training_set_csv

 Volume in drive C has no label.
 Volume Serial Number is 36BC-B5A1

 Directory of C:\Users\Oussama\Desktop\Projet Factorisation matricielle\training_set_csv

02/04/2020  01:06 AM    <DIR>          .
02/04/2020  01:06 AM    <DIR>          ..
02/03/2020  11:26 PM     1,601,678,799 D.csv
02/04/2020  01:37 AM     1,579,508,658 R.csv
02/04/2020  01:00 AM        22,170,168 T.csv
               3 File(s)  3,203,357,625 bytes
               2 Dir(s)  11,025,993,728 bytes free


In [0]:
# Read D as a dataframe from the csv file and counting execution time

import datetime
start = datetime.datetime.now()

D_df = pd.read_csv("training_set_csv\\D.csv", 
                    # Otherwise, every field's size would be 803844056 
                    dtype={'Rating': 'uint8', 'CustomerID': 'uint32', 'MovieID': 'uint16'}) 

print("D_df was read in : {}".format(datetime.datetime.now() - start))

D_df.describe()

D_df was read in : 0:00:34.046092


Unnamed: 0,MovieID,CustomerID,Rating
count,100235800.0,100235800.0,100235800.0
mean,9052.846,1322484.0,3.60427
std,5125.086,764538.7,1.085258
min,1.0,6.0,1.0
25%,4670.0,661192.0,3.0
50%,9049.0,1318995.0,4.0
75%,13614.0,1984455.0,4.0
max,17770.0,2649429.0,5.0


In [0]:
D_df.memory_usage(deep=True)

Index                80
MovieID       200471694
CustomerID    400943388
Rating        100235847
dtype: int64

## 2.Generation of training and test datasets

Extract from D the ratings corresponding to the users and movies described in the probe.txt and store the result into one data structure named  T

In [0]:
def create_T_df(D_df):
    """
    Returns T dataframe.
    
    Parameters: 
    D_df (pd.DataFrame): Dataframe generated from the 17770 files.

    Returns: 
    T_df (pd.DataFrame): Dataframe corresponding to the probe.txt file elements.
    """
    
    f = open("probe.txt", "r")
    
    current_movie = int(f.readline().rstrip()[0])
    data = []
    for line in f:
        if ':' in line: #if it's a new movieID
            current_movie = line[:-2]
        else:
            data.append([current_movie, line[:-1]])
    
    # Build a 2 columns T dataframe
    pre_T_df = pd.DataFrame(data, columns = ['MovieID', 'CustomerID']).astype({'CustomerID': 'uint32', 'MovieID': 'uint16'})
    
    # Join D_df and pre_T_df on MovieID and CustomerID to get the ratings
    # corresponding to the users and movies described in probe.txt and return
    T_df = pd.merge(D_df, pre_T_df, how='inner', on=['MovieID', 'CustomerID'])
    
    return T_df

In [0]:
# Building T as a dataframe and counting execution time
start = datetime.datetime.now()

T_df = create_T_df(D_df)

print("T_df was read in : {}".format(datetime.datetime.now() - start))

T_df.describe()

T_df was read in : 0:01:00.002281


Unnamed: 0,MovieID,CustomerID,Rating
count,1406267.0,1406267.0,1406267.0
mean,7007.113,1323325.0,3.673713
std,5252.483,765314.6,1.127476
min,1.0,6.0,1.0
25%,2342.0,659240.0,3.0
50%,5777.0,1321746.0,4.0
75%,11315.0,1986276.0,5.0
max,17770.0,2649429.0,5.0


In [0]:
T_df.memory_usage(deep=True)

Index         11250136
MovieID        2812534
CustomerID     5625068
Rating         1406267
dtype: int64

In [0]:
# Saving the T dataframe into a csv file
T_df.to_csv("training_set_csv\\T.csv", index=False) #Without an index to save memory

In [0]:
# Read T as a dataframe from the csv file and counting execution time
import datetime
start = datetime.datetime.now()

T_df = pd.read_csv("training_set_csv\\T.csv", 
                    #Otherwise, every field's size would be 803844056 
                    dtype={'Rating': 'uint8', 'CustomerID': 'uint32', 'MovieID': 'uint16'}) 

print("D_df was read in : {}".format(datetime.datetime.now() - start))

T_df.memory_usage(deep=True)

D_df was read in : 0:00:00.747039


Index              80
MovieID       2812534
CustomerID    5625068
Rating        1406267
dtype: int64

Construct training dataset  R  as  D  from which we remove entries present in  T

In [0]:
def create_R_dataframe(D_df, T_df):
    """
    Returns the R dataframe.
    
    Parameters: 
    D_df (pd.DataFrame): Dataframe generated from the 17770 files.
    T_df (pd.DataFrame): Dataframe corresponding to the probe.txt file elements.

    Returns: 
    R_df (pd.DataFrame): Dataframe corresponding elements in D_df that doesn't 
                         belong to T_df.
    """

    R_df = pd.concat([D_df, T_df, T_df]).drop_duplicates(keep=False)
    R_df = R_df.astype({'Rating': 'uint8', 'CustomerID': 'uint32', 'MovieID': 'uint16'})
    
    return R_df

In [0]:
# Building R as a dataframe and counting execution time
start = datetime.datetime.now()

R_df = create_R_dataframe(D_df, T_df)

print("R_df was read in : {}".format(datetime.datetime.now() - start))

R_df.memory_usage(deep=True)

R_df was read in : 0:01:12.387394


Index         790636640
MovieID       197659160
CustomerID    395318320
Rating         98829580
dtype: int64

In [0]:
# Saving the R dataframe into a csv file
R_df.to_csv("training_set_csv\\R.csv", index=False) #Without an index to save memory

In [0]:
# Read R as a dataframe from the csv file and counting execution time
import datetime
start = datetime.datetime.now()

R_df = pd.read_csv("training_set_csv\\R.csv", 
                    #Otherwise, every field's size would be 803844056 
                    dtype={'Rating': 'uint8', 'CustomerID': 'uint32', 'MovieID': 'uint16'}) 

print("D_df was read in : {}".format(datetime.datetime.now() - start))

R_df.memory_usage(deep=True)

D_df was read in : 0:00:41.402742


Index                80
MovieID       197659160
CustomerID    395318320
Rating         98829580
dtype: int64

D , T and R must all have the same format (for example users as rows and films as columns) 

In [0]:
from scipy.sparse import coo_matrix

def dataframe_to_matrix(df):
    """ Convert a dataframe into a Customer-Movie matrix.
    
    Parameters: 
    df (pd.DataFrame): Dataframe in the format [MovieID, CustomerID, Rating].

    Returns: 
    M (coo_matrix): Sparse Customer-Movie matrix. 
    """
    
    coo_val = np.array(df['Rating'].tolist(), dtype=np.float16)
    coo_col = np.array(df['MovieID'].tolist(), dtype=np.int16) - 1  # Indices should start from 0
    coo_row = np.array(df['CustomerID'].tolist())
    _, indices = np.unique(coo_row, return_inverse=True)

    M = coo_matrix((coo_val, (indices, coo_col)))
    
    return M

In [0]:
start = datetime.datetime.now()

D = dataframe_to_matrix(D_df)

print("D was read in : {}".format(datetime.datetime.now() - start))

D was read in : 0:01:50.913945


In [0]:
print("D sparce-matrix size : {} MB".format(D.data.nbytes//(1024*1024)))

D sparce-matrix size : 191 MB


In [0]:
start = datetime.datetime.now()

T = dataframe_to_matrix(T_df)

print("T was read in : {}".format(datetime.datetime.now() - start))

T was read in : 0:00:01.645597


In [0]:
print("T sparce-matrix size : {} MB".format(T.data.nbytes//(1024*1024)))

T sparce-matrix size : 2 MB


In [0]:
start = datetime.datetime.now()

R = dataframe_to_matrix(R_df)

print("R was read in : {}".format(datetime.datetime.now() - start))

R was read in : 0:01:36.340841


In [0]:
print("R sparce-matrix size : {} MB".format(R.data.nbytes//(1024*1024)))

R sparce-matrix size : 188 MB


In [0]:
from scipy.sparse import save_npz, load_npz, coo_matrix

def save_matrix(M, M_name):
    """ Save sparce matrix M to be used later.

    Parameters:
        M_name (string): Matrix name as saved in drive (R, T or D).
        M (coo_matrix): Matrix to be saved.

    """

    save_npz("training_set_csv\\"+M_name+".npz", M)

In [0]:
from scipy.sparse import save_npz, load_npz, coo_matrix

def load_matrix(M_name):
    """ Load previously saved sparce matrix M.

    Parameters:
        M_name (string): Matrix name as saved in drive (R, T or D).

    Returns:
        M (coo_matrix): Saved matrix.

    """

    M = load_npz("training_set_csv\\"+M_name+".npz")

    return M

In [0]:
start = datetime.datetime.now()

save_matrix(D, "D")

print("D was saved in : {}".format(datetime.datetime.now() - start))

D was saved in : 0:01:29.398792


In [0]:
start = datetime.datetime.now()

save_matrix(T, "T")

print("T was saved in : {}".format(datetime.datetime.now() - start))

T was saved in : 0:00:01.273700


In [0]:
start = datetime.datetime.now()

save_matrix(R, "R")

print("R was saved in : {}".format(datetime.datetime.now() - start))

In [0]:
start = datetime.datetime.now()

D = load_matrix("D")

print("D was loaded in : {}".format(datetime.datetime.now() - start))

D was loaded in : 0:00:11.771217


In [0]:
start = datetime.datetime.now()

T = load_matrix("T")

print("T was loaded in : {}".format(datetime.datetime.now() - start))

T was loaded in : 0:00:00.135644


In [0]:
start = datetime.datetime.now()

R = load_matrix("R")

print("R was loaded in : {}".format(datetime.datetime.now() - start))

R was loaded in : 0:00:09.765261
