In [1]:
import csv
import os

import numpy as np

In [7]:
def load_csv(filename, delimeter='\t', encoding='UTF-8', base_dir=None):
    """
    Read a delimited text file from the data folder into a list of rows.

    :param filename: path inside data folder
    :param delimeter: field separator passed to ``csv.reader`` (the misspelled
        name is kept so existing keyword callers keep working)
    :param encoding: text encoding used to open the file
    :param base_dir: directory from which ``../../data`` is resolved; when
        omitted, the notebook directory that used to be hard-coded is used
    :return: list of rows, each row being a list of strings
    """
    if base_dir is None:
        # Historical default; pass base_dir to run from another checkout/machine.
        base_dir = '/home/damian/Pulpit/Warsztat/CoML/recomm_project/customized/notebooks'
    path = os.path.join(base_dir, '../../data/{}'.format(filename))

    # The csv module expects files opened with newline='' so that it can do
    # its own line-ending handling (bare '\r' endings break with newline='\n').
    with open(path, newline='', encoding=encoding) as csvfile:
        return list(csv.reader(csvfile, delimiter=delimeter))

def preprocess(data):
    """
    Convert rows of numeric strings into an integer array with shifted ids.

    Every value is passed through ``int``; then 1 is subtracted from columns
    0 and 1 while columns 2 and 3 are left as they are (the offset vector has
    four entries, so rows are expected to have four columns). Presumably this
    turns 1-based user/item ids into 0-based indices.

    The input is not modified: the conversion builds a fresh nested list
    instead of overwriting the caller's rows.

    :param data: sequence of rows whose values are accepted by ``int``
    :return: 2-D numpy array of the converted, shifted values
    """
    int_rows = [[int(value) for value in row] for row in data]
    return np.array(int_rows) - np.array([1, 1, 0, 0])

def get_user_item_num(data, user_idx=0, item_idx=1):
    """
    Return the number of users and items implied by the largest ids in ``data``.

    Each count is the maximum value of the corresponding column plus one,
    i.e. ids are treated as zero-based indices.

    :param data: 2-D numpy array of ratings
    :param user_idx: column holding the user ids
    :param item_idx: column holding the item ids
    :return: tuple ``(user_num, item_num)``
    """
    # A plain maximum is enough; sorting the whole column is unnecessary.
    user_num = np.max(data[:, user_idx]) + 1
    item_num = np.max(data[:, item_idx]) + 1

    return user_num, item_num
    

def get_avg_ratings(data, user_num, user_idx=0):
    """
    Summarise how many ratings each user has.

    Counts the rows per user id for ids ``0 .. user_num - 1`` (users without
    any row count as 0; ids outside that range are ignored) and returns
    statistics over those counts.

    :param data: 2-D numpy array of ratings with integer user ids
    :param user_num: number of users to consider
    :param user_idx: column holding the user ids
    :return: tuple ``(avg, std, min, max)`` of the per-user rating counts
    """
    user_ids = data[:, user_idx]
    # Keep only ids the per-user loop would have matched, then count them in
    # one pass instead of building one boolean mask per user.
    in_range = user_ids[(user_ids >= 0) & (user_ids < user_num)]
    ratings_per_user = np.bincount(in_range, minlength=user_num)

    avg, std = np.mean(ratings_per_user), np.std(ratings_per_user)
    lowest, highest = np.min(ratings_per_user), np.max(ratings_per_user)

    return avg, std, lowest, highest
        
    

In [8]:
# Paths are given relative to the data folder; load_csv resolves them.
ml100k_filename, train_filename, test_filename = (
    'ml-100k/u.data',
    'ml-100k/ua.base',
    'ml-100k/ua.test',
)

# Load the three files in the same order: full set, train split, test split.
ml100k, train, test = (
    load_csv(path)
    for path in (ml100k_filename, train_filename, test_filename)
)

In [9]:
# Replace the raw string rows with shifted integer arrays.
# NOTE: the same names are rebound, so this cell is not idempotent --
# re-running it without re-running the load cell shifts the ids again.
ml100k, train, test = map(preprocess, (ml100k, train, test))

In [10]:
def print_stats(data, name):
    """
    Print summary statistics for one ratings array.

    Prints the total number of rows, the user/item counts reported by
    ``get_user_item_num`` and the per-user rating statistics reported by
    ``get_avg_ratings``.

    :param data: 2-D numpy array of ratings
    :param name: label shown in the header line
    """
    header = '==== Stats for {} data'.format(name)
    print(header)

    total = len(data)
    print('total number of ratings: {}'.format(total))

    n_users, n_items = get_user_item_num(data)
    print('user = {}, item = {}'.format(n_users, n_items))

    mean_cnt, std_cnt, min_cnt, max_cnt = get_avg_ratings(data, n_users)
    summary = 'avg number of user ratings: {}, std: {}, min: {}, max: {}'.format(
        mean_cnt, std_cnt, min_cnt, max_cnt
    )
    print(summary)

In [11]:
# Report the full data set first, then the train and test splits.
for dataset, label in ((ml100k, 'ml100k'), (train, 'train'), (test, 'test')):
    print_stats(dataset, label)

==== Stats for ml100k data
total number of ratings: 100000
user = 943, item = 1682
avg number of user ratings: 106.04453870625663, std: 100.87821227051644, min: 20, max: 737
==== Stats for train data
total number of ratings: 90570
user = 943, item = 1682
avg number of user ratings: 96.04453870625663, std: 100.87821227051644, min: 10, max: 727
==== Stats for test data
total number of ratings: 9430
user = 943, item = 1664
avg number of user ratings: 10.0, std: 0.0, min: 10, max: 10
