## Data processing
This notebook converts the MovieLens `.dat` files into `.csv` files in the format required by the model.

In [1]:
# Import packages
import os
import pandas as pd


In [2]:

# Input locations: the folder that holds the raw .dat files, followed by
# the file name of each table inside that folder.
MOVIELENS_DIR = 'dat'
RATING_DATA_FILE = 'ratings.dat'
MOVIE_DATA_FILE = 'movies.dat'
TAGS_DATA_FILE = 'tags.dat'
# NOTE(review): no cell visible in this notebook reads the users file.
USER_DATA_FILE = 'users.dat'

In [3]:
# Lookup tables that translate the coded user attributes into labels.
# NOTE(review): no cell visible in this notebook uses either table.

# Age code -> age bracket label.
AGES = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Occupation code -> occupation label. The codes are the consecutive
# integers 0..20, so the position in this list is the code itself.
OCCUPATIONS = dict(enumerate([
    "other or not specified",
    "academic/educator",
    "artist",
    "clerical/admin",
    "college/grad student",
    "customer service",
    "doctor/health care",
    "executive/managerial",
    "farmer",
    "homemaker",
    "K-12 student",
    "lawyer",
    "programmer",
    "retired",
    "sales/marketing",
    "scientist",
    "self-employed",
    "technician/engineer",
    "tradesman/craftsman",
    "unemployed",
    "writer",
]))


In [3]:

# Output locations: one .csv file per table, written to the current
# working directory.
RATINGS_CSV_FILE = 'ratings.csv'
MOVIES_CSV_FILE = 'movies.csv'
TAGS_CSV_FILE = 'tags.csv'
# NOTE(review): no cell visible in this notebook writes the users file.
USERS_CSV_FILE = 'users.csv'

In [17]:
# Read the ratings file.
# Fields are separated by '::'. pandas only handles a separator longer than
# one character with its Python parsing engine, hence engine='python'.
# The file has no header row, so the column names are supplied here.
ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE),
                      sep='::',
                      engine='python',
                      encoding='latin-1',
                      names=['userId', 'movieId', 'rating', 'timestamp'])

# Largest userId and movieId that appear in the ratings.
# max() over the raw column returns the same value as de-duplicating first,
# without an extra pass over every row.
max_userid = ratings['userId'].max()
max_movieid = ratings['movieId'].max()

# Process ratings dataframe for Keras Deep Learning model:
# add user_emb_id == userId - 1 and movie_emb_id == movieId - 1.
# Presumably this turns ids that start at 1 into zero-based embedding
# indices -- confirm against the model's input code.
ratings['user_emb_id'] = ratings['userId'] - 1
ratings['movie_emb_id'] = ratings['movieId'] - 1

print(len(ratings), 'ratings loaded')

10000054 ratings loaded


In [18]:

# Write the ratings table, including the two *_emb_id columns, to
# RATINGS_CSV_FILE as comma-separated text with a header row.
# `index` is left at its default, so the DataFrame's row index is also
# written, as a leading column with an empty header.
ratings.to_csv(
    path_or_buf=RATINGS_CSV_FILE,
    columns=['userId', 'movieId', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id'],
    header=True,
    sep=',',
    encoding='latin-1',
)
print('Saved to', RATINGS_CSV_FILE)

Saved to ratings.csv


In [19]:
# Load the movies table: '::'-separated, no header row, three fields per line.
movies_path = os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE)
movies = pd.read_csv(
    movies_path,
    names=['movieId', 'title', 'genres'],
    sep='::',
    engine='python',
    encoding='latin-1',
)
# max_movieid comes from the ratings cell and is the largest movieId seen
# there (not a count of movies), so it can exceed the number of rows
# loaded here.
print(len(movies), 'descriptions of', max_movieid, 'movies loaded.')

10681 descriptions of 65133 movies loaded.


In [20]:
# Write the movies table to MOVIES_CSV_FILE as comma-separated text with a
# header row. No encoding is passed, so pandas' default is used; `index` is
# left at its default too, so the row index is written as a leading column
# with an empty header.
movies.to_csv(
    path_or_buf=MOVIES_CSV_FILE,
    columns=['movieId', 'title', 'genres'],
    header=True,
    sep=',',
)
print('Saved to', MOVIES_CSV_FILE)


Saved to movies.csv


In [21]:
# Load the tags table: '::'-separated, no header row, four fields per line.
tags_path = os.path.join(MOVIELENS_DIR, TAGS_DATA_FILE)
tags = pd.read_csv(
    tags_path,
    names=['userId', 'movieId', 'tags', 'timestamp'],
    sep='::',
    engine='python',
    encoding='latin-1',
)



In [22]:
# Write the tags table to TAGS_CSV_FILE (tags.csv) as comma-separated text
# with a header row. No encoding is passed, so pandas' default is used;
# `index` is left at its default too, so the row index is written as a
# leading column with an empty header.
tags.to_csv(
    path_or_buf=TAGS_CSV_FILE,
    columns=['userId', 'movieId', 'tags', 'timestamp'],
    header=True,
    sep=',',
)
print('Saved to', TAGS_CSV_FILE)

Saved to tags.csv
