In [101]:
import ast
import gzip
import io
import os
import shutil
import urllib.request
import zipfile
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from scipy import stats
from tqdm.notebook import tqdm
tqdm.pandas()

# Download Datasets

In [3]:
# Remote locations of the raw datasets fetched below: two MovieLens zip
# archives, one gzip-compressed Steam review dump, and four Amazon CSV files
# (Video Games and Beauty, in their 2014 and 2018 releases).
movielens1m_zip_file_url = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
movielens25m_zip_file_url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
steam_gzip_file_url = "http://cseweb.ucsd.edu/~wckang/steam_reviews.json.gz"
videogames_2018_file_url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games.csv"
beauty_2018_file_url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv"
videogames_2014_file_url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Video_Games.csv"
beauty_2014_file_url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Beauty.csv"

In [4]:
# Directory that receives every downloaded raw file; created here if missing.
save_to_path = "../raw_data"
os.makedirs(save_to_path, exist_ok=True) 

In [5]:
def save_csv(file_url, save_to_path="./data/", name='dataset'):
    """
    Read a CSV from ``file_url`` and write it to ``<save_to_path>/<name>.csv``.

    The target directory is created when it does not exist yet. The file is
    parsed with the ``pd.read_csv`` defaults (first row used as header) and
    written with the ``DataFrame.to_csv`` defaults (index stored as the first
    column).

    NOTE(review): later cells read the same URLs with ``names=[...]``, which
    suggests the remote files have no header row; if so, the header inference
    here turns the first record into column names -- confirm before relying
    on the saved copy.

    Parameters
    ----------
    file_url : str
        URL or local path accepted by ``pd.read_csv``.
    save_to_path : str
        Directory to write into.
    name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    str or None
        Path of the written file, or ``None`` when reading or writing failed
        (the error is printed, not raised).
    """
    try:
        df = pd.read_csv(file_url)
        # to_csv refuses to write into a directory that does not exist.
        os.makedirs(save_to_path or ".", exist_ok=True)
        write_path = os.path.join(save_to_path, name + ".csv")
        df.to_csv(write_path)
        return write_path
    except Exception as e:
        # Best-effort download: report the problem and signal failure with None.
        print(e)
        return None

In [6]:
def save_zip(zip_file_url, save_to_path="./data/", name='dataset'):
    """
    Download a zip archive and extract all of its members into ``save_to_path``.

    Parameters
    ----------
    zip_file_url : str
        URL of the zip archive.
    save_to_path : str
        Directory to extract into (``ZipFile.extractall`` creates it if needed).
    name : str
        Unused; kept so the signature matches the other ``save_*`` helpers.

    Returns
    -------
    str or None
        The extraction directory, or ``None`` when the download or extraction
        failed (the error is printed, not raised).
    """
    try:
        r = requests.get(zip_file_url)
        # Fail on HTTP errors here instead of handing an error page to ZipFile,
        # which would surface as a misleading BadZipFile.
        r.raise_for_status()
        write_path = os.fspath(save_to_path)
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall(write_path)
        return write_path
    except Exception as e:
        # Best-effort download: report the problem and signal failure with None.
        print(e)
        return None

In [7]:
def save_gz(gz_file_url, save_to_path="./data/", name='dataset'):
    """
    Download a gzip file and store its decompressed content as
    ``<save_to_path>/<name>.txt``.

    Adapted from https://stackoverflow.com/a/61195974/9872987

    The payload is streamed to disk rather than held in memory, and is written
    to a temporary ``.part`` file first so a failed download never leaves a
    truncated file at the final path.

    Parameters
    ----------
    gz_file_url : str
        URL of the gzip file.
    save_to_path : str
        Directory to write into; created when missing.
    name : str
        File name without the ``.txt`` extension.

    Returns
    -------
    str or None
        Path of the written file, or ``None`` on failure (the error is
        printed, not raised). ``None`` replaces the former ``1``: an int is a
        valid file descriptor for ``open`` and the sibling helpers already
        signal failure with ``None``.
    """
    write_path = os.path.join(save_to_path, name + ".txt")
    partial_path = write_path + ".part"
    try:
        os.makedirs(save_to_path or ".", exist_ok=True)
        # Decompress on the fly while copying the response to disk.
        with urllib.request.urlopen(gz_file_url) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed:
                with open(partial_path, 'wb') as f:
                    shutil.copyfileobj(uncompressed, f)
        # Only expose the file under its final name once it is complete.
        os.replace(partial_path, write_path)
        return write_path

    except Exception as e:
        print(e)
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return None

In [8]:
# Download the 2018 Video Games CSV and save it as video_games_2018_raw.csv in save_to_path.
video_games_2018_path = save_csv(videogames_2018_file_url, save_to_path, 'video_games_2018_raw')

In [9]:
# Download the 2014 Video Games CSV and save it as video_games_2014_raw.csv in save_to_path.
video_games_2014_path = save_csv(videogames_2014_file_url, save_to_path, 'video_games_2014_raw')

In [10]:
# Download the 2014 Beauty CSV and save it as beauty_2014_raw.csv in save_to_path.
beauty_2014_path = save_csv(beauty_2014_file_url, save_to_path, 'beauty_2014_raw')

In [11]:
# Download the 2018 Beauty CSV and save it as beauty_2018_raw.csv in save_to_path.
beauty_2018_path = save_csv(beauty_2018_file_url, save_to_path, 'beauty_2018_raw')

In [12]:
# Download and unpack the MovieLens-1M archive; save_zip returns the
# extraction directory, so ml1m_path is save_to_path itself.
ml1m_path = save_zip(movielens1m_zip_file_url, save_to_path)

In [13]:
# Download and unpack the MovieLens-25M archive into the same directory.
ml25m_path = save_zip(movielens25m_zip_file_url, save_to_path)

In [14]:
# Download and decompress the Steam reviews; stored as steam_raw.txt in save_to_path.
steam_path = save_gz(steam_gzip_file_url, save_to_path, 'steam_raw')

# Pull in raw data

## Read ML-1M

In [15]:
# Load ratings.dat from the extracted ml-1m folder. Column names are supplied
# explicitly, and the multi-character "::" separator is why the python
# parsing engine is requested.
ml1m_df = pd.read_csv(
    os.path.join(ml1m_path, "ml-1m/ratings.dat"),
    names=["user", "item", "rating", "timestamp"],
    sep="::", 
    engine='python'
)

## Read Video 2018

In [56]:
# Read directly from the remote URL (not from the copy saved earlier).
# Columns are labelled item first, then user -- the reverse of the 2014 files.
video_2018_df = pd.read_csv(
    videogames_2018_file_url, 
    names=["item", "user", "rating", "timestamp"]
)

## Read Video 2014

In [17]:
# Read directly from the remote URL; columns are labelled user, item, rating, timestamp.
video_2014_df = pd.read_csv(
    videogames_2014_file_url, 
    names=["user", "item", "rating", "timestamp"]
)

## Read Beauty 2014

In [18]:
# Read directly from the remote URL; columns are labelled user, item, rating, timestamp.
beauty_2014_df = pd.read_csv(
    beauty_2014_file_url, 
    names=["user", "item", "rating", "timestamp"]
)

## Read Beauty 2018

In [57]:
# Read directly from the remote URL (not from the copy saved earlier).
# Columns are labelled item first, then user -- the reverse of the 2014 files.
beauty_2018_df = pd.read_csv(
    beauty_2018_file_url, 
    names=["item", "user", "rating", "timestamp"]
)

## Read Steam

In [30]:
def load_steam(path_to_steam):
    """
    Load the Steam dataset into a DataFrame, one row per line of the file.

    Every non-blank line is expected to hold one Python literal (a dict of
    review fields). Lines are parsed with ``ast.literal_eval`` instead of
    ``eval`` so that content of the downloaded file can never execute code.

    This will take some time!

    Parameters
    ----------
    path_to_steam : str
        Path of the decompressed Steam file.

    Returns
    -------
    pd.DataFrame
        One row per parsed line.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain Python literal.
    """
    data = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(path_to_steam, encoding="utf-8") as f:
        for line in tqdm(f):
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            data.append(ast.literal_eval(line))
    steam_df = pd.DataFrame(data)
    return steam_df

In [31]:
# Parse the saved Steam file into a DataFrame (slow on the full dataset).
steam_df = load_steam(steam_path)

|          | 0/? [00:00<?, ?it/s]

In [32]:
def preprocess_steam(frame, uid_col="username"):
    """
    Reduce the raw Steam frame to the columns ``user``, ``item``, ``timestamp``.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    frame : pd.DataFrame
        Raw Steam data with ``username``, ``product_id`` and ``date`` columns
        (plus ``user_id`` when ``uid_col == "user_id"``).
    uid_col : str
        ``"username"`` uses the username as the user key. ``"user_id"``
        builds the key as ``"<username>_<user_id>"``, with a missing
        ``user_id`` treated as an empty string.

    Returns
    -------
    pd.DataFrame
        Frame with columns ``user``, ``item``, ``timestamp``. For any other
        ``uid_col`` a message is printed and ``frame`` is returned unchanged.
    """
    renames = {"product_id": "item", "date": "timestamp"}
    if uid_col == "user_id":
        # Build the combined key from local Series so the caller's frame keeps
        # its NaN user_ids and gains no helper column. Vectorised string
        # concatenation replaces the row-wise "_".join; a missing username
        # propagates as a missing user.
        unique_username = frame["username"] + "_" + frame["user_id"].fillna("")

        # Standardize column names and keep just the columns we care about
        result = frame[["product_id", "date"]].rename(columns=renames)
        result.insert(0, "user", unique_username)
        return result
    elif uid_col == "username":
        renames["username"] = "user"
        return frame[["username", "product_id", "date"]].rename(columns=renames).copy()
    else:
        print("Not a good uid col")
    return frame

In [33]:
# Keep only user / item / timestamp, keyed by username (the default uid_col).
steam_df = preprocess_steam(steam_df)

In [102]:
steam_df.head()

Unnamed: 0,user,item,timestamp
0,Chaos Syren,725280,2017-12-17
1,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,2017-12-27
2,hello?<,328100,2017-10-16
3,Cyderine916,35140,2018-01-04
4,DarklyThinking,35140,2018-01-04


# Processing

In [36]:
def filter_by_interactions(frame, min_i=5, user_col='user', item_col='item'):
    """
    Keep the rows whose user AND item each occur at least ``min_i`` times.

    Counts are taken on the frame as passed in (a single pass), so the result
    may again contain users or items below the threshold.
    """
    user_counts = frame[user_col].value_counts()
    item_counts = frame[item_col].value_counts()

    # Values that reach the threshold in their own column.
    frequent_users = user_counts.index[user_counts >= min_i]
    frequent_items = item_counts.index[item_counts >= min_i]

    row_mask = frame[user_col].isin(frequent_users) & frame[item_col].isin(frequent_items)
    return frame[row_mask]

In [38]:
def n_core_frame(frame, n=5, verbose=False):
    """
    Reduce ``frame`` to its n-core: filter repeatedly until every remaining
    user and every remaining item has at least ``n`` interactions.

    A single filtering pass can push other users or items below the
    threshold, hence the loop. ``n`` is honoured both in the stopping
    condition and in the filter call (previously both were fixed at 5).

    Parameters
    ----------
    frame : pd.DataFrame
        Interactions with ``user`` and ``item`` columns.
    n : int
        Minimum number of interactions per user and per item.
    verbose : bool
        Print progress after every filtering pass.

    Returns
    -------
    pd.DataFrame
        The filtered frame; empty when no n-core exists.
    """
    iteration = 0
    # On an empty frame min() is NaN, both comparisons are False and the loop ends.
    while (frame["user"].value_counts().min() < n
           or frame["item"].value_counts().min() < n):
        iteration += 1
        if verbose:
            print("Iter: ", iteration)
        frame = filter_by_interactions(frame, min_i=n)
        if verbose:
            print("\n")
            print("users: ", frame["user"].nunique())
            print("items: ", frame["item"].nunique())
            print("min user:", frame["user"].value_counts().min())
            print("min item:", frame["item"].value_counts().min())
    return frame

In [40]:
def dataset_stats(frame):
    """
    Summarise an interaction frame (columns ``user`` and ``item``).

    Returns a dict with the number of interactions, users and items, the
    mean and median item count per user, the mean user count per item, and
    the density (interactions / (users * items)) as a scientific-notation
    string with two digits of precision.
    """
    n_interactions = len(frame)
    n_users = frame['user'].nunique()
    n_items = frame['item'].nunique()

    # Non-null counts of the other column within each group.
    items_per_user = frame.groupby(by='user').count()['item']
    users_per_item = frame.groupby(by='item').count()['user']

    return {
        "num_interactions": n_interactions,
        "num_users": n_users,
        "num_items": n_items,
        "mean_items_per_user": round(items_per_user.mean(), 3),
        "mean_user_per_items": round(users_per_item.mean(), 3),
        "density": np.format_float_scientific(n_interactions / (n_users * n_items), 2),
        "median_items_per_user": items_per_user.median(),
    }

In [41]:
def process_interactions(frame):
    """
    Clean an interaction frame and reduce it to its 5-core.

    Exact (user, item, timestamp) duplicates are dropped, rows are ordered by
    user and then timestamp, and the result is handed to ``n_core_frame``.
    The input frame is left untouched.
    """
    key_columns = ['user', 'item', 'timestamp']
    deduplicated = frame.drop_duplicates(subset=key_columns)
    ordered = deduplicated.sort_values(by=['user', 'timestamp'])
    return n_core_frame(ordered, n=5)

## Process ML-1M

In [43]:
# De-duplicate, sort and 5-core filter the ML-1M ratings.
cored_ml1m_frame = process_interactions(ml1m_df)

In [44]:
cored_ml1m_frame.head()

Unnamed: 0,user,item,rating,timestamp
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103


## Process Steam

In [45]:
# De-duplicate, sort and 5-core filter the Steam interactions.
cored_steam_frame = process_interactions(steam_df)

In [49]:
cored_steam_frame.head()

Unnamed: 0,user,item,timestamp
2697132,!,421670,2016-03-04
2371759,!,377160,2016-03-31
436692,!,227940,2016-07-21
499895,!,304240,2016-09-15
2211182,!,550650,2017-10-25


## Process Video 2018

In [58]:
# De-duplicate, sort and 5-core filter the 2018 Video Games ratings.
cored_video2018_frame = process_interactions(video_2018_df)

In [59]:
cored_video2018_frame.head()

Unnamed: 0,item,user,rating,timestamp
1161014,B00A878J5I,A0220159ZRNBTRKLG08H,5.0,1419897600
1652389,B00LSBNSJA,A0220159ZRNBTRKLG08H,5.0,1420329600
144283,B000084318,A0220159ZRNBTRKLG08H,5.0,1428710400
450508,B000X37732,A0220159ZRNBTRKLG08H,5.0,1428710400
904008,B004WLRQMI,A0220159ZRNBTRKLG08H,3.0,1428710400


## Process Video 2014

In [50]:
# De-duplicate, sort and 5-core filter the 2014 Video Games ratings.
cored_video2014_frame = process_interactions(video_2014_df)

In [51]:
cored_video2014_frame.head()

Unnamed: 0,user,item,rating,timestamp
1110943,A00263941WP7WCIL7AKWL,B008OSDHD2,5.0,1352160000
1114365,A00263941WP7WCIL7AKWL,B008SBZF4Y,5.0,1352160000
1111186,A00263941WP7WCIL7AKWL,B008OSDHZK,5.0,1352246400
1110885,A00263941WP7WCIL7AKWL,B008OSDGV0,5.0,1353715200
1114450,A00263941WP7WCIL7AKWL,B008SBZF5S,5.0,1353715200


## Process Beauty 2018

In [60]:
# De-duplicate, sort and 5-core filter the 2018 Beauty ratings.
cored_beauty2018_frame = process_interactions(beauty_2018_df)

In [61]:
cored_beauty2018_frame.head()

Unnamed: 0,item,user,rating,timestamp
9872,B00006L9LC,A1118RD3AJD5KH,5.0,1524614400
49455,B000URXP6E,A1118RD3AJD5KH,5.0,1524614400
56255,B0012Y0ZG2,A1118RD3AJD5KH,5.0,1524614400
80179,B001OHV1H4,A1118RD3AJD5KH,5.0,1524614400
324816,B00VG1AV5Q,A1118RD3AJD5KH,5.0,1524614400


## Process Beauty 2014

In [53]:
# De-duplicate, sort and 5-core filter the 2014 Beauty ratings.
cored_beauty2014_frame = process_interactions(beauty_2014_df)

In [55]:
cored_beauty2014_frame.head()

Unnamed: 0,user,item,rating,timestamp
1581545,A00414041RD0BXM6WK0GX,B007IY97U0,3.0,1405296000
1643683,A00414041RD0BXM6WK0GX,B00870XLDS,2.0,1405296000
1681280,A00414041RD0BXM6WK0GX,B008MIRO88,1.0,1405296000
1853091,A00414041RD0BXM6WK0GX,B00BQYYMN0,3.0,1405296000
1975026,A00414041RD0BXM6WK0GX,B00GRTQBTM,5.0,1405296000


# Dataset Stats

In [62]:
def dataset_stats(frame):
    """
    Compute summary statistics for an interaction frame with ``user`` and
    ``item`` columns.

    Note: a function with this name is also defined in an earlier cell; this
    later definition is the one in effect from here on.

    Returns
    -------
    dict
        ``num_interactions``, ``num_users``, ``num_items``,
        ``mean_items_per_user``, ``mean_user_per_items`` (both rounded to
        three decimals), ``density`` (scientific-notation string) and
        ``median_items_per_user``.
    """
    by_user = frame.groupby(by='user').count()['item']
    by_item = frame.groupby(by='item').count()['user']
    user_total = frame['user'].nunique()
    item_total = frame['item'].nunique()
    fill_ratio = len(frame) / (user_total * item_total)

    stats_ = dict(
        num_interactions=len(frame),
        num_users=user_total,
        num_items=item_total,
        mean_items_per_user=round(by_user.mean(), 3),
        mean_user_per_items=round(by_item.mean(), 3),
        density=np.format_float_scientific(fill_ratio, 2),
        median_items_per_user=by_user.median(),
    )
    return stats_

In [64]:
# Registry of the 5-core frames, keyed by the dataset name that is used for
# the printed reports and for the output file names further down.
cored_frames = {
    "beauty_2014": cored_beauty2014_frame, 
    "beauty_2018": cored_beauty2018_frame, 
    "ml-1m": cored_ml1m_frame, 
    "steam": cored_steam_frame, 
    "video_2014": cored_video2014_frame, 
    "video_2018": cored_video2018_frame
}

In [70]:
# Print the summary statistics of every 5-core dataset under its name.
for name, frame in cored_frames.items():
    ds_stats = dataset_stats(frame)
    print("\n")
    print(name)
    pprint(ds_stats)



beauty_2014
{'density': '7.34e-04',
 'mean_items_per_user': 8.876,
 'mean_user_per_items': 16.404,
 'median_items_per_user': 6.0,
 'num_interactions': 198502,
 'num_items': 12101,
 'num_users': 22363}


beauty_2018
{'density': '1.82e-01',
 'mean_items_per_user': 5.086,
 'mean_user_per_items': 27.429,
 'median_items_per_user': 5.0,
 'num_interactions': 768,
 'num_items': 28,
 'num_users': 151}


ml1m
{'density': '4.84e-02',
 'mean_items_per_user': 165.499,
 'mean_user_per_items': 292.626,
 'median_items_per_user': 96.0,
 'num_interactions': 999611,
 'num_items': 3416,
 'num_users': 6040}


steam
{'density': '1.06e-03',
 'mean_items_per_user': 12.632,
 'mean_user_per_items': 297.239,
 'median_items_per_user': 8.0,
 'num_interactions': 3555275,
 'num_items': 11961,
 'num_users': 281460}


video_2014
{'density': '8.94e-04',
 'mean_items_per_user': 9.537,
 'mean_user_per_items': 21.719,
 'median_items_per_user': 7.0,
 'num_interactions': 231780,
 'num_items': 10672,
 'num_users': 24303}



# Interval Stats

In [None]:
from scipy import stats

In [91]:
def interval_stats(time_list):
    """
    Summarise the gaps between consecutive timestamps of one user.

    Gaps are measured in whole days (``timedelta.days``) between neighbouring
    entries of ``time_list``, in the order given.

    The mode is computed with ``np.unique`` rather than ``scipy.stats.mode``:
    newer SciPy releases return scalars for 1-D input, which made the former
    ``mode[0]`` indexing fail. Ties resolve to the smallest interval, as
    before.

    Parameters
    ----------
    time_list : list
        Datetime-like objects supporting subtraction and ``.date()``.

    Returns
    -------
    dict
        ``num_ratings``, ``unique_days``, ``mean_interval`` (rounded to two
        decimals), ``mode_interval``, ``mode_interval_count`` and the raw
        ``intervals`` list. With fewer than two timestamps there are no
        intervals: mean and mode are NaN and the mode count is 0.
    """
    intervals = [(later - earlier).days
                 for earlier, later in zip(time_list, time_list[1:])]

    if intervals:
        # np.unique returns sorted values, so argmax picks the smallest of
        # the most frequent intervals.
        values, counts = np.unique(intervals, return_counts=True)
        top = counts.argmax()
        mode_value, mode_count = values[top], counts[top]
        mean = np.round(np.mean(intervals), 2)
    else:
        mode_value, mode_count, mean = np.nan, 0, np.nan

    countx = len(time_list)
    countx_unique = len({x.date() for x in time_list})

    outs = {
        "num_ratings": countx,
        "unique_days": countx_unique,
        "mean_interval": mean,
        "mode_interval": mode_value,
        "mode_interval_count": mode_count,
        "intervals": intervals
    }

    return outs

def interval_stats_frame(frame, datetime_units=None):
    """
    Compute per-user interval statistics for an interaction frame.

    The ``timestamp`` column is converted to datetimes, the datetimes are
    collected into one list per user (in row order), and ``interval_stats``
    is applied to each list.

    The conversion happens on a copy, so the caller's frame no longer
    receives a ``datetime`` column as a side effect.

    Parameters
    ----------
    frame : pd.DataFrame
        Interactions with ``user`` and ``timestamp`` columns.
    datetime_units : str or None
        Passed as ``unit`` to ``pd.to_datetime`` for numeric timestamps (for
        example ``"s"``); when falsy the timestamps are parsed without a unit.

    Returns
    -------
    pd.DataFrame
        One row per user with the ``timelists`` column, the fields returned
        by ``interval_stats`` and ``unique_rating_ratio``
        (``unique_days / num_ratings``).
    """
    if datetime_units:
        datetimes = pd.to_datetime(frame.timestamp, unit=datetime_units)
    else:
        datetimes = pd.to_datetime(frame.timestamp)
    # assign() returns a new frame; the input may be a filtered slice of
    # another frame and must not be written to.
    dated = frame.assign(datetime=datetimes)

    user_timelists = dated.groupby('user').progress_apply(lambda x: x.datetime.tolist())
    user_timelist_df = user_timelists.reset_index(name='timelists')

    interval_stats_df = pd.concat([
        user_timelist_df,
        pd.DataFrame(list(user_timelist_df.timelists.progress_apply(interval_stats)))],
        axis=1)

    interval_stats_df["unique_rating_ratio"] = interval_stats_df.unique_days / interval_stats_df.num_ratings

    return interval_stats_df

In [95]:
# Print descriptive statistics of the per-user interval table for every dataset.
# Steam timestamps are parsed without a unit (the preview above shows date
# strings such as 2017-12-17); all other datasets are converted with unit "s".
for name, frame in cored_frames.items():
    print("\n")
    print(name)
    if name in ["steam"]:
        interval_stats_out = interval_stats_frame(frame)
    else:
        interval_stats_out = interval_stats_frame(frame, datetime_units="s")     
    pprint(interval_stats_out.describe())



beauty_2014


  0%|          | 0/22363 [00:00<?, ?it/s]

  0%|          | 0/22363 [00:00<?, ?it/s]

        num_ratings   unique_days  mean_interval_days  mode_interval_days  \
count  22363.000000  22363.000000        22363.000000        22363.000000   
mean       8.876358      5.415195           83.775561            4.032017   
std        8.163819      5.729394           96.473753           19.460027   
min        5.000000      1.000000            0.000000            0.000000   
25%        5.000000      3.000000           16.320000            0.000000   
50%        6.000000      4.000000           53.430000            0.000000   
75%        9.000000      6.000000          115.250000            0.000000   
max      204.000000    151.000000          909.250000          744.000000   

       mode_interval_count  unique_rating_ratio  
count         22363.000000         22363.000000  
mean              3.621965             0.610316  
std               4.138630             0.255771  
min               1.000000             0.041667  
25%               1.000000             0.400000  
50%   

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

       num_ratings  unique_days  mean_interval_days  mode_interval_days  \
count   151.000000   151.000000          151.000000          151.000000   
mean      5.086093     1.218543            4.097748            0.211921   
std       0.461019     0.807416           20.519423            2.168284   
min       5.000000     1.000000            0.000000            0.000000   
25%       5.000000     1.000000            0.000000            0.000000   
50%       5.000000     1.000000            0.000000            0.000000   
75%       5.000000     1.000000            0.000000            0.000000   
max       8.000000     7.000000          191.750000           26.000000   

       mode_interval_count  unique_rating_ratio  
count           151.000000           151.000000  
mean              3.880795             0.237196  
std               0.711123             0.141650  
min               1.000000             0.166667  
25%               4.000000             0.200000  
50%               4.0000

  0%|          | 0/6040 [00:00<?, ?it/s]

  0%|          | 0/6040 [00:00<?, ?it/s]

              user  num_ratings  unique_days  mean_interval_days  \
count  6040.000000  6040.000000  6040.000000         6040.000000   
mean   3020.500000   165.498510     3.773013            0.598515   
std    1743.742145   192.543909     9.773912            1.978140   
min       1.000000    18.000000     1.000000            0.000000   
25%    1510.750000    44.000000     1.000000            0.000000   
50%    3020.500000    96.000000     1.000000            0.000000   
75%    4530.250000   207.250000     3.000000            0.140000   
max    6040.000000  2277.000000   263.000000           42.520000   

       mode_interval_days  mode_interval_count  unique_rating_ratio  
count              6040.0          6040.000000          6040.000000  
mean                  0.0           162.072517             0.027843  
std                   0.0           189.005208             0.031803  
min                   0.0            15.000000             0.000864  
25%                   0.0            

  0%|          | 0/281460 [00:00<?, ?it/s]

  0%|          | 0/281460 [00:00<?, ?it/s]

         num_ratings    unique_days  mean_interval_days  mode_interval_days  \
count  281460.000000  281460.000000       281460.000000       281460.000000   
mean       12.631546      10.378278          130.379019           14.661277   
std        22.722047      16.605294           95.487507           32.698855   
min         5.000000       1.000000            0.000000            0.000000   
25%         6.000000       5.000000           59.170000            0.000000   
50%         8.000000       7.000000          109.800000            1.000000   
75%        12.000000      10.000000          180.500000           14.000000   
max      1535.000000     873.000000          648.250000          842.000000   

       mode_interval_count  unique_rating_ratio  
count        281460.000000        281460.000000  
mean              2.930004             0.862830  
std               8.503358             0.204902  
min               1.000000             0.007353  
25%               1.000000            

  0%|          | 0/24303 [00:00<?, ?it/s]

  0%|          | 0/24303 [00:00<?, ?it/s]

        num_ratings   unique_days  mean_interval_days  mode_interval_days  \
count  24303.000000  24303.000000        24303.000000        24303.000000   
mean       9.537094      6.581286          141.134365            8.987368   
std       12.789697      9.497773          162.811280           32.978317   
min        5.000000      1.000000            0.000000            0.000000   
25%        5.000000      3.000000           29.430000            0.000000   
50%        7.000000      5.000000           86.000000            0.000000   
75%       10.000000      7.000000          193.000000            1.000000   
max      773.000000    511.000000         1279.250000          988.000000   

       mode_interval_count  unique_rating_ratio  
count         24303.000000         24303.000000  
mean              3.263630             0.689185  
std               5.649502             0.268763  
min               1.000000             0.032258  
25%               1.000000             0.500000  
50%   

  0%|          | 0/50677 [00:00<?, ?it/s]

  0%|          | 0/50677 [00:00<?, ?it/s]

        num_ratings   unique_days  mean_interval_days  mode_interval_days  \
count  50677.000000  50677.000000        50677.000000        50677.000000   
mean       8.969138      5.316988          155.240220            7.795193   
std       10.426619      7.449149          183.327107           34.382366   
min        5.000000      1.000000            0.000000            0.000000   
25%        5.000000      2.000000           26.540000            0.000000   
50%        6.000000      4.000000           96.750000            0.000000   
75%        9.000000      6.000000          214.500000            0.000000   
max      774.000000    544.000000         1584.750000          947.000000   

       mode_interval_count  unique_rating_ratio  
count         50677.000000         50677.000000  
mean              3.851866             0.603735  
std               5.440380             0.285724  
min               1.000000             0.012195  
25%               1.000000             0.375000  
50%   

# Save For SASRec

In [96]:
# Output directory for the exported interaction files; created here if missing.
sasrec_data_path = "../data"
os.makedirs(sasrec_data_path, exist_ok=True) 

In [97]:
def write_dataset(frame, dirr, name):
    """
    Write the interactions as space-separated ``uid iid`` pairs, one per line.

    Rows are ordered by user and then timestamp. Users and items are replaced
    by 1-based integer ids taken from the category codes of their columns.
    The file has no header and no index column.

    The caller's frame is not modified: sorting and id encoding happen on
    local objects instead of in place.

    Parameters
    ----------
    frame : pd.DataFrame
        Interactions with ``user``, ``item`` and ``timestamp`` columns.
    dirr : str
        Target directory.
    name : str
        Target file name, including its extension.
    """
    ordered = frame.sort_values(by=['user', 'timestamp'])

    # Category codes start at 0; +1 makes the ids 1-based.
    encoded = pd.DataFrame({
        "uid": pd.Categorical(ordered["user"]).codes + 1,
        "iid": pd.Categorical(ordered["item"]).codes + 1,
    })

    encoded[['uid', 'iid']].to_csv(os.path.join(dirr, name),
                                   sep=' ',
                                   header=False,
                                   index=False)

In [None]:
# Export every 5-core dataset to "<name>.txt" inside sasrec_data_path.
for name, frame in cored_frames.items():
    print(f"Writing {name}...")
    write_dataset(frame, sasrec_data_path, name+".txt")
    print("Done!")

In [105]:
def write_rating_dataset(frame, dirr, name):
    """
    Write the interactions as space-separated ``uid iid rating`` triples, one
    per line.

    Rows are ordered by user and then timestamp. Users and items are replaced
    by 1-based integer ids taken from the category codes of their columns;
    ratings are written as they are. The file has no header and no index
    column.

    The caller's frame is not modified: sorting and id encoding happen on
    local objects instead of in place.

    Parameters
    ----------
    frame : pd.DataFrame
        Interactions with ``user``, ``item``, ``rating`` and ``timestamp``
        columns.
    dirr : str
        Target directory.
    name : str
        Target file name, including its extension.
    """
    ordered = frame.sort_values(by=['user', 'timestamp'])

    # Category codes start at 0; +1 makes the ids 1-based.
    encoded = pd.DataFrame({
        "uid": pd.Categorical(ordered["user"]).codes + 1,
        "iid": pd.Categorical(ordered["item"]).codes + 1,
        "rating": ordered["rating"].to_numpy(),
    })

    encoded[['uid', 'iid', 'rating']].to_csv(os.path.join(dirr, name),
                                             sep=' ',
                                             header=False,
                                             index=False)

In [106]:
# Export ML-1M once more, this time with the rating as a third column.
write_rating_dataset(cored_ml1m_frame, sasrec_data_path, "ml-1m-rating.txt")