In [1]:
import io
import os
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import _LRScheduler

In [2]:
plt.style.use('ggplot')

In [3]:
def set_random_seed(state=1):
    gens = (np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for set_state in gens:
        set_state(state)

In [4]:
RANDOM_STATE = 1
set_random_seed(RANDOM_STATE)

In [5]:
def try_download(url, download_path):
    archive_name = url.split('/')[-1]
    folder_name, _ = os.path.splitext(archive_name)
    
    try:
        r = urlopen(url)
    except URLError as e:
        print("Cannot download the data. Error: %s" % s)
        return
    
    assert r.status == 200
    data = r.read()
    
    with zipfile.ZipFile(io.BytesIO(data)) as arch:
        arch.extractall(download_path)
        
    print("The archive is extracted into folder: %s" % download_path)

In [28]:
def read_data(path):
    files = {}
    for filename in path.glob('*'):
        if filename.suffix == '.csv':
            files[filename.stem] = pd.read_csv(filename)
        elif filename.suffix == '.dat':
            if filename.stem == 'ratings':
                columns = ['userId', 'movieId', 'rating', 'timestamp']
            else:
                columns = ['movieId', 'title', 'genres']
            data = pd.read_csv(filename, sep='::', names=columns, engine='python')
            files[filename.stem] = data
    return files['ratings'], files['movies']

In [21]:
download_path = Path.cwd()

In [29]:
ratings, movies = read_data(download_path / 'ml-latest-small')

In [30]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [31]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [32]:
def tabular_preview(ratings, n=15):
    user_groups = ratings.groupby('userId')['rating'].count()
    top_users = user_groups.sort_values(ascending=False)[:15]
    
    movie_groups = ratings.groupby('movieId')['rating'].count()
    top_movies = movie_groups.sort_values(ascending=False)[:15]
    
    top = (ratings.join(top_users, rsuffix='_r', how='inner', on='userId').join(top_movies, rsuffix='_r', how='inner', on='movieId'))
    
    return pd.crosstab(top.userId, top.movieId, top.rating, aggfunc=np.sum)

In [33]:
tabular_preview(ratings, movies)

movieId,1,50,110,260,296,318,356,480,527,589,593,1196,2571,2858,2959
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
68,2.5,3.0,2.5,5.0,2.0,3.0,3.5,3.5,4.0,3.5,3.5,5.0,4.5,5.0,2.5
182,4.0,4.5,3.5,3.5,5.0,4.5,5.0,3.5,4.0,2.0,4.5,3.0,5.0,5.0,5.0
249,4.0,4.0,5.0,5.0,4.0,4.5,4.5,4.0,4.5,4.0,4.0,5.0,5.0,4.5,5.0
274,4.0,4.0,4.5,3.0,5.0,4.5,4.5,3.5,4.0,4.5,4.0,4.5,4.0,5.0,5.0
288,4.5,,5.0,5.0,5.0,5.0,5.0,2.0,5.0,4.0,5.0,4.5,3.0,,3.5
307,4.0,4.5,3.5,3.5,4.5,4.5,4.0,3.5,4.5,2.5,4.5,3.0,3.5,4.0,4.0
380,5.0,4.0,4.0,5.0,5.0,3.0,5.0,5.0,,5.0,5.0,5.0,4.5,,4.0
387,,4.5,3.5,4.5,5.0,3.5,4.0,3.0,,3.5,4.0,4.5,4.0,4.5,4.5
414,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0
448,5.0,4.0,,5.0,5.0,,3.0,3.0,,3.0,5.0,5.0,2.0,4.0,4.0


In [35]:
print(ratings)
print(ratings.groupby('userId'))
print(ratings.groupby('userId')['ratings'])
print(ratings.groupby('userId')['ratings'].count())

        userId  movieId  rating   timestamp
0            1        1     4.0   964982703
1            1        3     4.0   964981247
2            1        6     4.0   964982224
3            1       47     5.0   964983815
4            1       50     5.0   964982931
5            1       70     3.0   964982400
6            1      101     5.0   964980868
7            1      110     4.0   964982176
8            1      151     5.0   964984041
9            1      157     5.0   964984100
10           1      163     5.0   964983650
11           1      216     5.0   964981208
12           1      223     3.0   964980985
13           1      231     5.0   964981179
14           1      235     4.0   964980908
15           1      260     5.0   964981680
16           1      296     3.0   964982967
17           1      316     3.0   964982310
18           1      333     5.0   964981179
19           1      349     4.0   964982563
20           1      356     4.0   964980962
21           1      362     5.0 

KeyError: 'Column not found: ratings'

In [40]:
print(ratings.groupby('userId')['rating'].count())

userId
1       232
2        29
3        39
4       216
5        44
6       314
7       152
8        47
9        46
10      140
11       64
12       32
13       31
14       48
15      135
16       98
17      105
18      502
19      703
20      242
21      443
22      119
23      121
24      110
25       26
26       21
27      135
28      570
29       81
30       34
       ... 
581      40
582      56
583      56
584      83
585      61
586     208
587     165
588      56
589      40
590     728
591      54
592      94
593     103
594     232
595      20
596     411
597     443
598      21
599    2478
600     763
601     101
602     135
603     943
604     100
605     221
606    1115
607     187
608     831
609      37
610    1302
Name: rating, Length: 610, dtype: int64
