In [134]:
import requests
import zipfile 
import io
import os
import gzip
import urllib.request

import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

# Download Datasets

In [32]:
movielens1m_zip_file_url = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
movielens25m_zip_file_url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
steam_gzip_file_url = "http://cseweb.ucsd.edu/~wckang/steam_reviews.json.gz"
videogames_2018_file_url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games.csv"
beauty_2018_file_url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv"
videogames_2014_file_url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Video_Games.csv"
beauty_2014_file_url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Beauty.csv"

In [33]:
save_to_path = "../raw_data"
os.makedirs(save_to_path, exist_ok=True) 

In [34]:
def save_csv(file_url, save_to_path="./data/", name='dataset'):
    try:
        df = pd.read_csv(file_url)
        write_path = os.path.join(save_to_path, name+".csv")
        df.to_csv(write_path)
        return write_path
    except Exception as e:
        print(e)

In [35]:
def save_zip(zip_file_url, save_to_path="./data/", name='dataset'):
    try:
        r = requests.get(zip_file_url)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        write_path = os.path.join(save_to_path)
        z.extractall(write_path)
        return write_path
    except Exception as e:
        print(e)

In [38]:
def save_gz(gz_file_url, save_to_path="./data/", name='dataset'):
    """
    Adapted from https://stackoverflow.com/a/61195974/9872987
    """
    # Download archive
    try:
        # Read the file inside the .gz archive located at url
        with urllib.request.urlopen(gz_file_url) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed:
                file_content = uncompressed.read()
        # write to file in binary mode 'wb'
        write_path = os.path.join(save_to_path, name + ".txt")
        with open(write_path, 'wb') as f:
            f.write(file_content)
            return write_path

    except Exception as e:
        print(e)
        return 1

In [12]:
video_games_2018_path = save_csv(videogames_2018_file_url, save_to_path, 'video_games_2018_raw')

In [17]:
video_games_2014_path = save_csv(videogames_2014_file_url, save_to_path, 'video_games_2014_raw')

In [18]:
beauty_2014_path = save_csv(beauty_2014_file_url, save_to_path, 'beauty_2014_raw')

In [19]:
beauty_2018_path = save_csv(beauty_2018_file_url, save_to_path, 'beauty_2018_raw')

In [24]:
ml1m_path = save_zip(movielens1m_zip_file_url, save_to_path)

In [26]:
ml25m_path = save_zip(movielens25m_zip_file_url, save_to_path)

In [40]:
steam_path = save_gz(steam_gzip_file_url, save_to_path, 'steam_raw')

# Pull in raw data

## Read ML-1M

In [68]:
ml1m_df = pd.read_csv(
    os.path.join(ml1m_path, "ml-1m/ratings.dat"),
    names=["user", "item", "rating", "timestamp"],
    sep="::", 
    engine='python'
)

## Read Video 2018

In [50]:
video_2018_df = pd.read_csv(
    videogames_2018_file_url, 
    names=["user", "item", "rating", "timestamp"]
)

## Read Video 2014

In [51]:
video_2014_df = pd.read_csv(
    videogames_2014_file_url, 
    names=["user", "item", "rating", "timestamp"]
)

## Read Beauty 2014

In [53]:
beauty_2014_df = pd.read_csv(
    beauty_2014_file_url, 
    names=["user", "item", "rating", "timestamp"]
)

## Read Beauty 2018

In [54]:
beauty_2018_df = pd.read_csv(
    beauty_2018_file_url, 
    names=["user", "item", "rating", "timestamp"]
)

## Read Steam

In [89]:
import numpy as np

In [91]:
test = pd.DataFrame([[np.NaN, 1], [1, 1]])

In [94]:
test[0].fillna("fill")

0    fill
1       1
Name: 0, dtype: object

In [97]:
def load_steam(path_to_steam):
    """
    Load steam dataset and standardize format
    
    This will take some time!
    """
    data = []
    with open(path_to_steam) as f:
        for i, line in tqdm(enumerate(f)):
            line = eval(line)
            data.append(line)
    steam_df = pd.DataFrame(data)    
    return steam_df

In [98]:
steam_df = load_steam(steam_path)

|          | 0/? [00:00<?, ?it/s]

In [127]:
uids_counts = steam_df[["username", "user_id"]].groupby("username").count()

In [160]:
sum((uids_counts == 0).reset_index().user_id)

1259028

In [124]:
sum(steam_df.user_id.isna()) / len(steam_df)

0.5924297603421708

In [110]:
steam_df.username.nunique()

2567538

In [111]:
steam_df.user_id.nunique()

1485611

In [115]:
steam_df[steam_df.user_id=="76561198120382965"]

Unnamed: 0,username,hours,products,product_id,page_order,date,text,early_access,page,found_funny,compensation,user_id
49267,123,4.6,243.0,35140,3,2016-06-30,"Great game, with a good story. Lots to explore...",False,242,,,76561198120382965
2852849,123,2.8,243.0,51010,6,2016-06-30,"Love the game, I used to play the free version...",False,2,,,76561198120382965
2881877,123,1.3,243.0,360870,8,2015-12-03,10/10 great way to pass the time when your boa...,False,28,1.0,,76561198120382965
3196136,123,20.7,243.0,270370,2,2016-06-30,"Really fun to play with friends, combine is de...",False,46,,,76561198120382965
6937125,123,2.1,243.0,225540,9,2017-07-02,The gliding challenges are pretty dank.,False,332,,,76561198120382965


In [109]:
steam_df[steam_df.username == "123"]

Unnamed: 0,username,hours,products,product_id,page_order,date,text,early_access,page,found_funny,compensation,user_id
29045,123,97.9,103.0,1610,1,2014-02-02,One of the best space games ive ever played,False,9,,,76561198021444245
29510,123,484.1,69.0,271590,1,2017-12-30,"Don't buy. They don't bother fixing bugs, only...",False,107,,,76561198072738316
47279,123,7.9,19.0,427730,8,2016-12-29,the game is boring without freinds but with th...,False,205,,,76561198141516113
49267,123,4.6,243.0,35140,3,2016-06-30,"Great game, with a good story. Lots to explore...",False,242,,,76561198120382965
51651,123,68.5,145.0,350280,5,2017-08-14,I enjoy games again.,False,239,,,76561198194763874
...,...,...,...,...,...,...,...,...,...,...,...,...
7775065,123,75.1,135.0,252490,6,2014-04-16,This game is awsome and still in beta there ar...,True,8421,1.0,,76561198084641085
7776153,123,606.4,99.0,252490,4,2014-04-03,much fun. worth much money. much fun with frie...,True,8530,,,
7778239,123,1413.7,22.0,252490,0,2014-03-10,good game,True,8739,,,76561198125774672
7779101,123,77.2,23.0,252490,2,2014-03-03,I strongly reccomend this game I have had it f...,True,8825,,,


In [108]:
steam_df.username.value_counts()

123                 2045
Alex                1776
Nick                1465
Chris               1458
Tom                 1365
                    ... 
pixlewizard            1
weekly5112             1
ShinichiTheSlayr       1
CrossEyedSam           1
✪euphoria -iwnl-       1
Name: username, Length: 2567538, dtype: int64

In [116]:
example_steam = steam_df.copy()
example_steam.dropna(subset=['user_id'], inplace=True)

In [118]:
example_steam.user_id.nunique()

1485611

In [120]:
example_steam = example_steam[['user_id', 'product_id', 'date']].rename(
        columns={"user_id":"user", "product_id": "item", "date":"timestamp"})

In [99]:
def preprocess_steam(frame)
    # Fill NAN values of user_id with empty string
    frame.user_id.fillna("", inplace=True)
    
    # Create unique user identifier
    frame['unique_username'] = frame[["username", "user_id"]].apply(lambda x: "_".join(x), axis=1)
    
    # Standardize column names and keep just the columns we care about
    frame = frame[['unique_username', 'product_id', 'date']].rename(
        columns={"unique_username":"user", "product_id": "item", "date":"timestamp"}).copy()
    return frame

SyntaxError: invalid syntax (4170182245.py, line 1)

In [82]:
steam_df

Index(['user', 'item', 'timestamp'], dtype='object')

# Processing

In [60]:
def filter_by_interactions(frame, min_i=5, user_col='user', item_col='item'):
    
    keep_users = set(frame[frame[user_col].map(
        frame[user_col].value_counts()) >= min_i][user_col].unique())
    
    keep_items = set(frame[frame[item_col].map(
        frame[item_col].value_counts()) >= min_i][item_col].unique())
    
    frame = frame[frame[user_col].isin(keep_users) & frame[item_col].isin(keep_items)]
    return frame

In [61]:
def n_core_frame(frame, n=5, verbose=False):
    i = 1
    while frame.user.value_counts().min() <= 5 or frame.item.value_counts().min() <= 5:
        i += 1
        if verbose:
            print("Iter: ", i)
        frame = filter_by_interactions(frame)
        if verbose:
            print("\n")
            print("users: ", frame.user.nunique())
            print("items: ", frame.item.nunique())
            print("min user:", frame.user.value_counts().min())
            print("min item:", frame.item.value_counts().min())
        if frame.user.value_counts().min() >= 5 and frame.item.value_counts().min() >= 5:
            break            
    return frame

In [62]:
def dataset_stats(frame):
    mean_items_per_user = frame.groupby(by='user').count().item.mean()
    mean_user_per_items = frame.groupby(by='item').count().user.mean()
    stats = {
        "num_interactions": len(frame),
        "num_users": frame.user.nunique(),
        "num_items": frame.item.nunique(),
        "mean_items_per_user": round(mean_items_per_user, 3),
        "mean_user_per_items": round(mean_user_per_items, 3)
    }
    return stats

In [63]:
dataframes = [ml1m_df, beauty_2014_df]

In [73]:
def process_interactions(frame):
    frame = frame.copy()
    frame.drop_duplicates(subset=['user', 'item', 'timestamp'], inplace=True)
    frame.sort_values(by=['user', 'timestamp'], inplace=True)
    cored_frame = n_core_frame(frame, n=5)
    stats = dataset_stats(cored_frame)
    return cored_frame, stats

In [74]:
cored_ml1m_frame, ml1m_cored_stats = process_interactions(ml1m_df)

In [75]:
cored_ml1m_frame

Unnamed: 0,user,item,rating,timestamp
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


In [76]:
cored_steam_frame, steam_cored_stats = process_interactions(steam_df)

In [125]:
cored_steam_frame

Unnamed: 0,user,item,timestamp
2697132,!,421670,2016-03-04
2371759,!,377160,2016-03-31
436692,!,227940,2016-07-21
499895,!,304240,2016-09-15
2211182,!,550650,2017-10-25
...,...,...,...
1469646,󰀕 Africa's Population 󰀕,242680,2015-03-23
2265710,󰀕 Africa's Population 󰀕,99900,2015-09-16
4269027,󰀕 Africa's Population 󰀕,268910,2017-10-11
5451032,󰀕 Africa's Population 󰀕,252610,2017-11-22


In [80]:
cored_steam_frame.user.value_counts()

Alex                 1535
123                  1490
Nick                 1327
Chris                1302
Saint Aaron          1226
                     ... 
-tejp                   5
Siepen                  5
kursku                  5
The Regal Seagull       5
annainthedark           5
Name: user, Length: 281460, dtype: int64

In [79]:
steam_cored_stats

{'num_interactions': 3555275,
 'num_users': 281460,
 'num_items': 11961,
 'mean_items_per_user': 12.632,
 'mean_user_per_items': 297.239}

In [121]:
cored_es_frame, es_cored_stats = process_interactions(example_steam)

In [122]:
cored_es_frame.user.value_counts()

76561198011965365    1126
76561198094803808     820
76561197969749884     723
76561198094321628     609
76561198073092169     508
                     ... 
76561198005408205       5
76561197973609295       5
76561197970643396       5
76561198052573383       5
76561198085653640       5
Name: user, Length: 91591, dtype: int64

In [123]:
es_cored_stats

{'num_interactions': 892224,
 'num_users': 91591,
 'num_items': 8542,
 'mean_items_per_user': 9.741,
 'mean_user_per_items': 104.451}

In [None]:
cored_es_frame

## Dataset Stats

In [128]:
def dataset_stats(frame):
    mean_items_per_user = frame.groupby(by='user').count().item.mean()
    median_items_per_user = frame.groupby(by='user').count().item.median()
    mean_user_per_items = frame.groupby(by='item').count().user.mean()
    density = np.format_float_scientific(len(frame) / (frame.user.nunique() * frame.item.nunique()), 2)
    stats = {
        "num_interactions": len(frame),
        "num_users": frame.user.nunique(),
        "num_items": frame.item.nunique(),
        "mean_items_per_user": round(mean_items_per_user, 3),
        "mean_user_per_items": round(mean_user_per_items, 3),
        "density": density,
        "median_items_per_user":median_items_per_user
    }
    
    return stats

In [129]:
dataset_stats(cored_es_frame)

{'num_interactions': 892224,
 'num_users': 91591,
 'num_items': 8542,
 'mean_items_per_user': 9.741,
 'mean_user_per_items': 104.451,
 'density': '1.14e-03',
 'median_items_per_user': 7.0}

In [148]:
from scipy import stats

In [149]:
def interval_stats(time_list):        
    
    intervals = [(time_list[i+1] - time_list[i]).days for i in range(len(time_list)-1)]    
        
    mode, mode_count = stats.mode(intervals)
    mean = np.round(np.mean(intervals), 2)
    countx = len(time_list)
    countx_unique = len(set([x.date() for x in time_list]))
    
    outs = {
        "num_ratings": countx, 
        "unique_days": countx_unique, 
        "mean_interval_days": mean,
        "mode_interval_days": mode[0],
        "mode_interval_count": mode_count[0],
        "intervals": intervals
    }
        
    return outs

def interval_stats_frame(frame):
    
    user_timelists = frame.groupby('user').progress_apply(lambda x: x.datetime.tolist())
    user_timelist_df = user_timelists.reset_index(name='timelists')
    
    interval_stats_df = pd.concat([
        user_timelist_df, 
        pd.DataFrame(list(user_timelist_df.timelists.progress_apply(interval_stats)))], 
        axis=1)
    
    interval_stats_df["unique_rating_ratio"] = interval_stats_df.unique_days / interval_stats_df.num_ratings
    
    return interval_stats_df

In [150]:
cored_es_frame['datetime'] = pd.to_datetime(cored_es_frame['timestamp'])

In [151]:
es_intervals = interval_stats_frame(cored_es_frame)

  0%|          | 0/91591 [00:00<?, ?it/s]

  0%|          | 0/91591 [00:00<?, ?it/s]

# Save