In [4]:
import itertools
import pandas as pd
import numpy as np
import random
import csv
import time
import copy
import pandas as pd

import matplotlib.pyplot as plt


class DataPreprocessor():
    '''
    Load MovieLens ratings, build the rating history of every user and
    export that history as fixed-size slates split into train/test CSV files.

    Attributes
    ----------
    data :  DataFrame
            One row per rating (userId, itemId, rating, timestamp, itemName),
            with userId and itemId decremented by 1
    users : ndarray
            Distinct user ids, in order of first appearance in ``data``
    items : ndarray
            Distinct item ids, in order of first appearance in ``data``
    histo : List(DataFrame)
            Rating history of each user, aligned with ``users``
    '''

    def __init__(self, datapath, itempath):
        '''
        Load the MovieLens data, list the users and the items,
        and build the rating history of every user.

        Parameters
        ----------
        datapath :  string
                    path to the ratings file (userId, itemId, rating, timestamp)
        itempath :  string
                    path to the items file (itemId, itemName, ...)
        '''
        self.data = self.load_data(datapath, itempath)
        # Shift both id columns down by one. MovieLens ids are assumed to be
        # 1-based, which would make the shifted ids 0-based -- confirm against the data.
        self.data['userId'] = self.data['userId'] - 1
        self.data['itemId'] = self.data['itemId'] - 1
        self.users = self.data['userId'].unique()   # all users, first-seen order
        self.items = self.data['itemId'].unique()   # all items, first-seen order

        # a list containing the rating history of each user
        self.histo = self.gen_histo()


    def load_data(self, datapath, itempath):
        '''
        Load the ratings and attach the name of each movie.
        A row corresponds to a rating given by a user to a movie.

        Parameters
        ----------
        datapath :  string
                    path to the tab-separated ratings file
                    columns: userId, itemId, rating, timestamp
        itempath :  string
                    path to the '|'-separated items file (latin-1 encoded)
                    only its first two columns (itemId, itemName) are read
        Returns
        -------
        result :    DataFrame
                    All the ratings, with an extra itemName column
        '''
        data = pd.read_csv(datapath, sep='\t',
                       names=['userId', 'itemId', 'rating', 'timestamp'])
        movie_titles = pd.read_csv(itempath, sep='|', names=['itemId', 'itemName'],
                           usecols=range(2), encoding='latin-1')
        # Left join: every rating is kept, even if its item has no title.
        return data.merge(movie_titles, on='itemId', how='left')


    def gen_histo(self):
        '''
        Group the ratings by user and order each group from oldest to most recent.

        Returns
        -------
        result :    List(DataFrame)
                    One history per user, in the same order as ``self.users``,
                    each sorted by timestamp and re-indexed from 0
        '''
        # Group once instead of filtering the whole frame for every user.
        by_user = self.data.groupby('userId', sort=False)
        return [
            by_user.get_group(user)
                   .sort_values('timestamp')
                   .reset_index(drop=True)
            for user in self.users
        ]

    @staticmethod
    def _slate_rows(user_histo, pivot_rating, nb_states):
        '''Cut one user's history into consecutive, complete slates of nb_states items.'''
        if len(user_histo) == 0:
            return []
        user = user_histo['userId'].iloc[0]
        items = user_histo['itemId'].values.tolist()
        # Binarise the ratings: 1 when the rating reaches the pivot, else 0.
        liked = [1 if rating >= pivot_rating else 0 for rating in user_histo['rating']]
        # Only full slates are kept; a trailing partial slate is dropped.
        last_start = len(items) - nb_states
        return [
            (user, items[start:start + nb_states], liked[start:start + nb_states])
            for start in range(0, last_start + 1, nb_states)
        ]

    def write_csv(self, train_test_ratio=0.9, nb_states=5, pivot_rating=4,
                  train_path='./train_data.csv', test_path='./test_data.csv'):
        '''
        Turn every user's history into slates and write them to two CSV files.

        Each history is cut into consecutive slates of ``nb_states`` items
        (leftover items that do not fill a slate are dropped). For every user,
        the first ``int(train_test_ratio * number_of_slates)`` slates go to the
        train file and the remaining ones to the test file.

        Parameters
        ----------
        train_test_ratio :  float
                            share of each user's slates written to the train file
        nb_states :         int
                            number of items per slate
        pivot_rating :      number
                            ratings >= pivot_rating are encoded as 1, others as 0
        train_path :        string
                            destination of the train CSV
        test_path :         string
                            destination of the test CSV
        Raises
        ------
        ValueError
                    if nb_states < 1 or train_test_ratio is outside [0, 1]
        '''
        if nb_states < 1:
            raise ValueError('nb_states must be a positive integer')
        if not 0 <= train_test_ratio <= 1:
            raise ValueError('train_test_ratio must be between 0 and 1')

        train_rows = []
        test_rows = []
        for user_histo in self.histo:
            rows = self._slate_rows(user_histo, pivot_rating, nb_states)
            split_point = int(train_test_ratio * len(rows))
            train_rows.extend(rows[:split_point])
            test_rows.extend(rows[split_point:])

        # Build each frame once from the collected rows; an empty row list
        # still yields a file with the header line.
        columns = ['user', 'slate', 'action']
        pd.DataFrame(train_rows, columns=columns).to_csv(train_path, index=False)
        pd.DataFrame(test_rows, columns=columns).to_csv(test_path, index=False)





In [5]:
import os

# The MovieLens 100k files are looked up in <parent of the working directory>/data/ml-100k.
# os.path.join builds the paths with the platform's separator instead of a hard-coded '/'.
ml100k_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'ml-100k')
dg = DataPreprocessor(os.path.join(ml100k_dir, 'u.data'),
                      os.path.join(ml100k_dir, 'u.item'))

In [6]:
# Cut every user's history into slates and write ./train_data.csv and
# ./test_data.csv, using the default ratio, slate size and rating pivot.
dg.write_csv()

In [7]:
import pandas as pd
import numpy as np
import ast  

def read_file(filename):
    '''
    Read a slate CSV file and turn its stringified cells back into Python objects.

    Parameters
    ----------
    filename :  string
                path to a CSV file with the columns user, slate and action
    Returns
    -------
    result :    DataFrame
                Columns user, slate, action; every slate and action cell is the
                value obtained by evaluating the stored text as a Python literal
    '''
    raw = pd.read_csv(filename)

    def parse_column(name):
        # Cells are stored as text (e.g. "[1, 0, 1]"); evaluate them as literals.
        return [ast.literal_eval(cell) for cell in raw[name].values.tolist()]

    slates = parse_column('slate')
    users = raw['user'].values.tolist()
    actions = parse_column('action')

    # Columns are assigned one by one on an empty frame, in this order.
    return pd.DataFrame().assign(user=users, slate=slates, action=actions)


In [8]:
# Load the training slates written earlier; read_file parses the 'slate' and
# 'action' cells from their stored text form.
data = read_file('./train_data.csv')

In [9]:
# Preview the first rows of the parsed training data.
data.head()

Unnamed: 0,user,slate,action
0,195,"[241, 268, 285, 305, 339]","[0, 0, 1, 1, 0]"
1,195,"[1021, 250, 256, 1006, 1240]","[1, 0, 0, 1, 0]"
2,195,"[427, 380, 201, 284, 7]","[1, 1, 0, 1, 1]"
3,195,"[115, 654, 110, 152, 172]","[0, 1, 1, 1, 0]"
4,195,"[237, 69, 381, 392, 286]","[1, 0, 1, 1, 0]"
