In [23]:
import os
import pandas as pd

In [24]:
file_path = os.path.join('.', 'Data')
ratings_file = 'ratings.dat'
users_file = 'users.dat'
movies_file = 'movies.dat'

In [25]:
users_csv = 'users.csv'
ratings_csv = 'ratings.csv'
movies_csv = 'movies.csv'

In [26]:
ratings = pd.read_csv(os.path.join(file_path, ratings_file), 
                     sep = '::', engine = 'python', 
                     names = ['user_id', 'movie_id', 'rating', 'time_stamp'])

In [27]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,time_stamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [28]:
max_user_id = ratings['user_id'].drop_duplicates().max()

max_movie_id = ratings['movie_id'].drop_duplicates().max()

## Preprocessing for Deep learning model

ratings.loc[:, 'user_embed_id'] = ratings['user_id'] - 1
ratings.loc[:, 'movie_embed_id'] = ratings['movie_id'] - 1

In [29]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,time_stamp,user_embed_id,movie_embed_id
0,1,1193,5,978300760,0,1192
1,1,661,3,978302109,0,660
2,1,914,3,978301968,0,913
3,1,3408,4,978300275,0,3407
4,1,2355,5,978824291,0,2354


In [30]:
ratings.shape

(1000209, 6)

In [32]:
ratings.to_csv(os.path.join(file_path, ratings_csv), 
              sep = '\t', 
              encoding = 'latin-1', header = True, index = False, 
              columns = ['user_id', 'movie_id', 'rating', 'time_stamp', 'user_embed_id', 'movie_embed_id'])

In [33]:
age_dict = {1 : 'Under 18', 
            18 : '18-24', 
           25 : '25-34', 
           35 : '35-44', 
           45 : '45-49', 
           50 : '50-55', 
           56 : 'Above 56'}

occupation_dict = {0 : 'Other or not specified', 
                  1 : 'Academic/Educator', 
                  2 : 'Artist', 
                  3 : 'Cleric/Admin', 
                  4 : 'College/Grad student', 
                  5 : 'Customer Service', 
                  6 : 'Doctor/Health care', 
                  7 : 'Executive/Manager', 
                  8 : 'Farmer', 
                  9 : 'Homemaker', 
                  10 : 'K-12 student', 
                  11 : 'Lawyer', 
                  12 : 'Programmer', 
                  13 : 'Retired', 
                  14 : 'Sales/Marketing', 
                  15 : 'Scientist', 
                  16 : 'Self-employed', 
                  17 : 'Technician/Engineer', 
                  18 : 'Tradesman/Craftsman', 
                  19 : 'Unemployed', 
                  20 : 'Writer'}

In [34]:
users = pd.read_csv(os.path.join(file_path, users_file), 
                   engine = 'python', sep = '::', 
                   encoding = 'latin-1', names = ['user_id', 'gender', 'age', 'occupation', 'zipcode'])

In [35]:
users.loc[:, 'age_range'] = users['age'].apply(lambda x : age_dict.get(x, x))
users.loc[:, 'occupation_decoded'] = users['occupation'].apply(lambda x : occupation_dict.get(x, x))

In [36]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_range,occupation_decoded
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,Above 56,Self-employed
2,3,M,25,15,55117,25-34,Scientist
3,4,M,45,7,2460,45-49,Executive/Manager
4,5,M,25,20,55455,25-34,Writer


In [37]:
users.shape

(6040, 7)

In [39]:
users.to_csv(os.path.join(file_path, users_csv), 
            sep = '\t', header = True, 
            encoding = 'latin-1', 
            index = False,
            columns = ['user_id', 'gender', 'age', 'occupation', 'zipcode', 'age_range', 'occupation_decoded'])

In [40]:
movies = pd.read_csv(os.path.join(file_path, movies_file), 
                    sep = '::', engine = 'python', 
                    encoding = 'latin-1', 
                    names = ['movie_id', 'title', 'genres'])

In [41]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
movies.shape

(3883, 3)

In [43]:
movies.loc[:, 'movie_embed_id'] = movies['movie_id'] - 1

In [44]:
movies.head()

Unnamed: 0,movie_id,title,genres,movie_embed_id
0,1,Toy Story (1995),Animation|Children's|Comedy,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [45]:
movies.to_csv(os.path.join(file_path, movies_csv), 
             sep = '\t', header = True, 
             encoding = 'latin-1', 
             index = False, 
             columns = ['movie_id', 'title', 'genres', 'movie_embed_id'])