In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.sparse import csr_matrix

In [2]:
# Factor that turns a byte count into megabytes when multiplied (1 MB = 10**6 bytes).
BYTES_TO_MB_DIV = 0.000001


def mem_usage_df(df):
    """Print the memory footprint of ``df`` in megabytes.

    The per-entry byte counts reported by ``df.memory_usage()`` are summed,
    scaled by ``BYTES_TO_MB_DIV`` and rounded to three decimal places.
    The result is only printed; the function returns ``None``.
    """
    total_bytes = df.memory_usage().sum()
    megabytes = round(total_bytes * BYTES_TO_MB_DIV, 3)
    # print() joins its arguments with single spaces, giving the same line
    # as manual string concatenation.
    print("Memory usage is", str(megabytes), "MB")

In [3]:
# Read in the data
anime_list = pd.read_csv('anime.csv')
rating_list = pd.read_csv('rating.csv')

# Drop anime rows that have a missing value in any column (the original note
# describes these as un-watched/un-rated anime). Rebinding the name instead of
# using inplace=True keeps the step explicit.
# NOTE(review): rating_list is not filtered against anime_list in this cell, so
# titles dropped here can still show up as columns of anime_matrix - confirm
# that this is intended.
anime_list = anime_list.dropna()

# Drop users with fewer than MIN_RATINGS_PER_USER ratings.
# A vectorised per-row group size replaces groupby.filter(lambda ...), which
# calls a Python function once per user; the surviving rows, their order and
# their index are the same.
MIN_RATINGS_PER_USER = 50
ratings_per_user = rating_list.groupby('user_id')['user_id'].transform('size')
rating_list = rating_list[ratings_per_user >= MIN_RATINGS_PER_USER]

# New dataframe with users as rows and anime as columns, with ratings as values.
# pivot_table averages duplicate (user_id, anime_id) pairs (default aggfunc),
# and fillna(0) encodes "no rating" as 0, so 0 is indistinguishable from a
# genuine rating of 0 afterwards.
anime_matrix = rating_list.pivot_table(index='user_id', columns='anime_id', values='rating').fillna(0)
# To preview the frame, put `anime_matrix.head()` as the last line of its own
# cell; as a bare expression in the middle of a cell it displays nothing.

# Print memory usage of the dense matrix
mem_usage_df(anime_matrix)
# Print shape
print(anime_matrix.shape)

# Convert to sparse pandas dataframe: only entries different from the fill
# value 0 are stored.
anime_matrix_sparse = anime_matrix.astype(pd.SparseDtype("float", 0))
# Print memory usage of the sparse matrix
mem_usage_df(anime_matrix_sparse)

Memory usage is 3535.522 MB
(39466, 11197)
Memory usage is 86.311 MB
