In [1]:
import pandas as pd
import numpy as np

# Read data

In [2]:
train_data = pd.read_csv("data/raw/ml-100k/u1.base", sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])
test_data = pd.read_csv("data/raw/ml-100k/u1.test", sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])
all_data = pd.read_csv("data/raw/ml-100k/u.data", sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])


movies = pd.read_csv("data/raw/ml-100k/u.item", sep="|", header=None, encoding='latin-1')
movies.columns = ["movie id", "movie title", "release date", "video release date", "IMDb URL", "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]


users = pd.read_csv("data/raw/ml-100k/u.user", sep="|", header=None, names=["user_id", "age", "gender", "occupation", "zip_code"])

# Save to csv

In [3]:
train_data.to_csv("data/interim/train_data_row.csv")
test_data.to_csv("data/interim/test_data_row.csv")
all_data.to_csv("data/interim/data.csv")
movies.to_csv("data/interim/movies.csv")
users.to_csv("data/interim/users.csv")

In [4]:
# What's the distribution of highly rated movies?
print("Rating Distribution")
train_data.groupby(['rating'])['rating'].count()

Rating Distribution


rating
1     4719
2     9178
3    21963
4    27396
5    16744
Name: rating, dtype: int64

# Prepare data 
## Combine user info, movie info, rating

In [5]:
from tqdm import tqdm

def prepare_data(data, user_data, movie_data):
    src_data = []
    trg_data = []
    
    urs_len = len(user_data)
    mov_len = len(movie_data)
    
    skip_mov = ["movie title", "release date", "video release date", "IMDb URL", "zip_code"]
    
    for row in tqdm(data.iterrows()):
        user_id = row[1]['user_id']
        movie_id = row[1]['item_id']
        rating = row[1]['rating']
        
        user_row = user_data.loc[user_id - 1] # index start 0
        movie_row = movie_data.loc[movie_id - 1]
        
        assert user_row['user_id'] == user_id, "user id must be the same, but found"
        assert movie_row['movie id'] == movie_id, "movie id must be the same"
        
        src_row = []
        for col in user_row.index.tolist():
            if col in skip_mov:
                continue
            src_row.append(user_row[col])
        
        for col in movie_row.index.tolist():
            if col in skip_mov:
                continue
            
            src_row.append(movie_row[col])
        
        src_data.append(src_row)
        trg_data.append(rating)
    
    return src_data, trg_data

In [6]:
src_d, trg_d = prepare_data(train_data, users, movies)
data_columns = []
skip_mov = ["movie title", "release date", "video release date", "IMDb URL", "zip_code"]
for col in users.columns:
    if col in skip_mov:
        continue
    data_columns.append(col)

for col in movies.columns:
    if col in skip_mov:
        continue
    data_columns.append(col)

data = pd.DataFrame(src_d, columns=data_columns)
data['rating'] = trg_d

80000it [00:15, 5079.30it/s]


In [7]:
data.to_csv("data/interim/train_data.csv")

In [8]:
src_d, trg_d = prepare_data(test_data, users, movies)
data_columns = []
skip_mov = ["movie title", "release date", "video release date", "IMDb URL", "zip_code"]
for col in users.columns:
    if col in skip_mov:
        continue
    data_columns.append(col)

for col in movies.columns:
    if col in skip_mov:
        continue
    data_columns.append(col)

data = pd.DataFrame(src_d, columns=data_columns)
data['rating'] = trg_d

20000it [00:04, 4483.40it/s]


In [9]:
data.to_csv("data/interim/test_data.csv")