## Transform

Before running the algorithms, we need to perform two operations on the data:
* Data_Preprocessing
* Data_Splitting

#### Data_Preprocessing

* Users: Convert to label encoded values
* Items:  Convert to label encoded values
* User Side Features: Convert to encoded feature values (TODO: specify the target encoding)
* Ratings: Convert the interaction to Explicit or Implicit signals


In [84]:
import sys
# Add the project directory to the module search path; presumably this is where
# the local `reco` package imported below lives (TODO confirm).
# NOTE(review): the path is relative, so it resolves against the current working directory.
sys.path.append("Documents/Recommender System/")

In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [86]:
# Read the three raw tables (users, items, ratings) from the local data/ folder,
# in that order, and bind each to its own name.
users, items, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ("data/users.csv", "data/items.csv", "data/ratings.csv")
)

In [87]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [88]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [89]:
items.columns

Index(['movie_id', 'movie_title', 'release_date', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-fi', 'Thriller', 'War', 'Western', 'year',
       'overview', 'original_language', 'runtime', 'vote_average',
       'vote_count'],
      dtype='object')

## Processing

In [116]:
from reco.preprocess import encode_user_item

In [117]:
# Store ratings as float32 and record the observed rating range.
ratings['rating'] = ratings['rating'].astype(np.float32)
# Series.min()/.max() are vectorized and skip NaN, whereas the Python builtins
# iterate element by element and give order-dependent results when NaN is present.
min_rating = ratings['rating'].min()
max_rating = ratings['rating'].max()

In [118]:
from reco.preprocess import random_split

In [119]:
# Encode the user and movie ids of the ratings table; the helper also prints the
# number of distinct users and items. The two extra return values are presumably
# the fitted encoders for mapping back to the original ids (TODO confirm).
encoded_ratings, user_encoder, item_encoder = encode_user_item(ratings, "user_id", "movie_id","rating","unix_timestamp")

Number of users:  943
Number of items:  1682


## Random Split

In [99]:
from reco.preprocess import random_split, user_split, sample_data

In [100]:
sampledf = sample_data()

In [101]:
# Split the sample data into train / validation / test using ratios 0.6 / 0.2 / 0.2.
train_random, val_random, test_random = random_split(sampledf, [0.6, 0.2, 0.2])

In [102]:
train_random

Unnamed: 0,user_index,item_index,rating,timestamp,split_index
0,1,1,4,2000-01-01,0
1,1,1,4,2000-01-01,0
2,1,2,3,2000-01-02,0
3,1,2,3,2000-01-02,0
4,1,2,3,2000-01-02,0
5,2,1,4,2000-01-01,0
6,2,2,5,2000-01-01,0
7,2,1,4,2000-01-03,0
8,2,2,5,2000-01-03,0


In [103]:
val_random

Unnamed: 0,user_index,item_index,rating,timestamp,split_index
9,2,3,5,2000-01-03,1
10,3,3,5,2000-01-01,1
11,3,3,5,2000-01-03,1


In [104]:
test_random

Unnamed: 0,user_index,item_index,rating,timestamp,split_index
12,3,3,5,2000-01-03,2
13,3,3,5,2000-01-03,2
14,3,1,4,2000-01-04,2


## User Chronological Split

In [105]:
def user_split(df, ratios, chrono=False):
    """Split interactions separately for every user and merge the results.

    The rows of each user (grouped on the ``"user_index"`` column) are handed to
    ``random_split`` with ``shuffle=False``. The per-user pieces are concatenated
    and then separated again by the value of their ``"split_index"`` column, so
    every user contributes rows to each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction data. Must contain a ``"user_index"`` column, and a
        ``"timestamp"`` column when ``chrono`` is truthy.
    ratios : sequence of float
        Split ratios forwarded unchanged to ``random_split``; one output frame
        is returned per ratio.
    chrono : bool, default False
        If truthy, rows are sorted by ``"timestamp"`` before grouping, so each
        user's rows reach ``random_split`` in chronological order.

    Returns
    -------
    list of pandas.DataFrame
        ``len(ratios)`` frames; frame ``i`` holds the rows whose
        ``"split_index"`` equals ``i``.
    """
    col_time = "timestamp"
    col_user = "user_index"

    if chrono:
        # A stable sort keeps rows with equal timestamps in their original
        # relative order, so the split is deterministic when timestamps tie.
        df = df.sort_values(col_time, kind="mergesort")

    splits = []
    # Iterating the groupby already yields each user's sub-frame; no need to
    # look it up again with get_group().
    for _, group in df.groupby(col_user):
        # NOTE(review): relies on random_split returning frames that carry a
        # "split_index" column - confirm against reco.preprocess.
        group_splits = random_split(group, ratios, shuffle=False)
        splits.append(pd.concat(group_splits))

    # Concatenate the splits of all users together.
    splits_all = pd.concat(splits)

    # Separate the combined frame again by split_index.
    return [splits_all[splits_all["split_index"] == x] for x in range(len(ratios))]


In [106]:
# chrono=True makes user_split sort the rows by timestamp before splitting each user;
# without it the default (chrono=False) is used and the split is not chronological,
# despite the section title and the *_chrono variable names.
train_chrono, val_chrono, test_chrono = user_split(sampledf, [0.6, 0.2, 0.2], chrono=True)

In [107]:
train_chrono

Unnamed: 0,user_index,item_index,rating,timestamp,split_index
0,1,1,4,2000-01-01,0
1,1,1,4,2000-01-01,0
2,1,2,3,2000-01-02,0
5,2,1,4,2000-01-01,0
6,2,2,5,2000-01-01,0
7,2,1,4,2000-01-03,0
10,3,3,5,2000-01-01,0
11,3,3,5,2000-01-03,0
12,3,3,5,2000-01-03,0


In [108]:
val_chrono

Unnamed: 0,user_index,item_index,rating,timestamp,split_index
3,1,2,3,2000-01-02,1
8,2,2,5,2000-01-03,1
13,3,3,5,2000-01-03,1


In [109]:
test_chrono


Unnamed: 0,user_index,item_index,rating,timestamp,split_index
4,1,2,3,2000-01-02,2
9,2,3,5,2000-01-03,2
14,3,1,4,2000-01-04,2


In [110]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
...,...,...,...,...
99995,880,476,3.0,880175444
99996,716,204,5.0,879795543
99997,276,1090,1.0,874795795
99998,13,225,2.0,882399156
