## Preprocessing
1. Remove the timestamps.
2. Movie IDs are not consecutive, so we need to remap them to consecutive integers.
3. User IDs do not start from 0, so we subtract 1 to make them start from 0.
4. Then, make a "very small data set" from the K most common users and movies.

In [None]:
import os
import sys

import inspect
import pandas as pd

# Resolve the project root from the notebook's working directory.
# os.getcwd() returns the same path as the IPython-only `%pwd` magic, but it is
# plain Python, so this cell also runs when the notebook is exported to a script.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

# Define the paths to the raw CSV file and to the cached, preprocessed copy
csv_path = parentdir+'/movielens_20/rating.csv'
movie_lens_path = parentdir+"/movielens_20"
edited_csv_path = movie_lens_path + "/edited_rating.csv"

if os.path.exists(edited_csv_path):
    # Fast path: reuse the preprocessed file written by a previous run
    print(f'Reading edited csv')
    df = pd.read_csv(edited_csv_path)
    print(f'Successfully read edited csv')
else:
    print(f"creating edited csv")
    # might take a while
    df = pd.read_csv(csv_path)
    df = df.drop("timestamp", axis=1)

    # reassign movie ids to consecutive integers 0..N-1
    movie_ids = set(df.movieId)
    movie_id_lookup = {movie_id: i for i, movie_id in enumerate(movie_ids)}
    # Series.map performs the dictionary lookup on the whole column at once,
    # instead of building a Series object per row as df.apply(axis=1) does.
    df["movieId"] = df["movieId"].map(movie_id_lookup)
    # shift user ids down by one (see the preprocessing notes: they do not start from 0)
    df.userId = df.userId - 1
    # index=False: otherwise the row index is written as an extra unnamed column
    # and the cached branch above would load a different frame than this branch builds.
    df.to_csv(edited_csv_path, index=False)
    print(f'Successfully generated edited csv')


small_edited_csv_path = parentdir+"/movielens_20/small_edited_rating.csv"
if os.path.exists(small_edited_csv_path):
    # Fast path: reuse the small data set written by a previous run
    print(f'reading small csv for training and testing')
    small_df = pd.read_csv(small_edited_csv_path)
else:
    print(f'generating small csv for training and testing')
    from collections import Counter
    # Keep only the 100 most active users and the 20 most rated movies
    user_counts = Counter(df.userId)
    movie_counts = Counter(df.movieId)
    common_user_ids = [u for u, c in user_counts.most_common(100)]
    common_movie_ids = [m for m, c in movie_counts.most_common(20)]

    # Carve the data out.
    # Boolean indexing may hand back a view-like object, so take an explicit
    # copy before the columns are overwritten below.
    small_df = df[df.userId.isin(common_user_ids) & df.movieId.isin(common_movie_ids)].copy()

    def remap_to_consecutive_numbers(df, cln_name):
        """Rewrite column `cln_name` of `df` in place with ids 0..K-1.

        Args:
            df: data frame to modify (note: shadows the notebook-level `df`).
            cln_name: name of the id column to remap.

        Returns:
            dict mapping each old id to its new consecutive id.
        """
        lookup = {old_id: i for i, old_id in enumerate(set(df[cln_name]))}
        # Column-wise lookup; assigning through df[cln_name] (not a local alias
        # of the column) is what actually updates the frame.
        df[cln_name] = df[cln_name].map(lookup)
        return lookup

    # remap movie and user IDs so both are dense in the small data set
    remap_to_consecutive_numbers(small_df, "userId")
    remap_to_consecutive_numbers(small_df, "movieId")
    # index=False keeps the cached file's columns identical to the frame built here
    small_df.to_csv(small_edited_csv_path, index=False)
print(f'small df loaded')

## Runtime Behaviors
1. From the very small data set, split the rows into training and test sets.

In [None]:
from sklearn.utils import shuffle
from collections import defaultdict
import pickle


user2movies_path = parentdir+"/movielens_20/user2movies.pkl"
movies2user_path = parentdir+"/movielens_20/movies2user.pkl"
ratings_path = parentdir+"/movielens_20/ratings.pkl"
train_df_path = parentdir+"/movielens_20/train_df.csv"
test_df_path = parentdir+"/movielens_20/test_df.csv"

# Re-split whenever ANY cached artifact is missing, so that the pickles and the
# CSV files always originate from the same shuffle.
DIVIDE_DATA = not all(
    os.path.exists(path)
    for path in (user2movies_path, movies2user_path, ratings_path, train_df_path, test_df_path)
)

if DIVIDE_DATA:
    #TODO Remember to remove
    print(f'Dividing and saving data ...')
    small_df = shuffle(small_df)
    # NOTE(review): despite its name, this is the fraction of rows that ends up
    # in the TRAINING set (rows before the cutoff); the name is kept in case
    # other cells reference it.
    TEST_PERCENTAGE = 0.8
    cutoff = int(len(small_df) * TEST_PERCENTAGE)
    train_df = small_df.iloc[:cutoff]
    test_df = small_df.iloc[cutoff:]

    # user id -> set of movie ids the user rated (training rows only)
    user2movies = defaultdict(set)
    # movie id -> set of user ids that rated it (training rows only)
    movies2user = defaultdict(set)
    # (user id, movie id) -> rating (training rows only)
    ratings = {}
    # Iterate the three columns in lockstep; this avoids building one Series
    # per row, which is what DataFrame.apply(axis=1) would do.
    for user_id, movie_id, rating in zip(train_df.userId, train_df.movieId, train_df.rating):
        user_id, movie_id = int(user_id), int(movie_id)
        user2movies[user_id].add(movie_id)
        movies2user[movie_id].add(user_id)
        ratings[(user_id, movie_id)] = rating
    # Freeze into plain dicts so this branch leaves the same types behind as
    # the loading branch below (and a missing key raises instead of silently
    # creating an empty set).
    user2movies = dict(user2movies)
    movies2user = dict(movies2user)

    with open(user2movies_path, "wb") as f:
        pickle.dump(user2movies, f)
    with open(movies2user_path, "wb") as f:
        pickle.dump(movies2user, f)
    with open(ratings_path, "wb") as f:
        pickle.dump(ratings, f)
    # index=False keeps the reloaded frames free of an extra unnamed index column
    train_df.to_csv(train_df_path, index=False)
    test_df.to_csv(test_df_path, index=False)
else:
    #TODO Remember to remove
    print(f'Loading training and test data ... ')
    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open(user2movies_path, "rb") as f:
        user2movies = pickle.load(f)
    with open(movies2user_path, "rb") as f:
        movies2user = pickle.load(f)
    with open(ratings_path, "rb") as f:
        ratings = pickle.load(f)
    train_df = pd.read_csv(train_df_path)
    test_df = pd.read_csv(test_df_path)
#TODO Remember to remove
print(f'Training and testing data loaded')
    
    

In [None]:
from collections import namedtuple
import numpy as np
from typing import Set
import heapq
import time

# number of neighbor slots kept per user
NEIGHBOR_NUM = 25
# threshold on number of common movies for two users to be considered neighbors
COMMON_MOVIE_THRE = 5
DEBUG_USER_THRE = 3
# Heap entry: `value` is the similarity weight, `name` is the neighbor's user id
Weight = namedtuple("Weight",["value", "name"])

class User2UserTraining:
    """User-user collaborative filtering with checkpointed training.

    The class reads the notebook-level globals `user2movies`, `movies2user`,
    `ratings`, `train_df` and `movie_lens_path`.

    Model state (all lists are indexed by user id):
        weight_heapqs: per-user min-heap of `Weight` entries, NEIGHBOR_NUM long.
        means / stds: per-user rating mean and standard deviation
            (float('-inf') means "not computed yet").
        movie_devs: per-user dict movie id -> (rating - user mean).
        compared: set of frozenset({user, other_user}) pairs already handled.
        trained_users: set of users whose training iteration has completed.
    """

    # number of objects written by checkpoints that predate `trained_users`
    _LEGACY_OBJ_COUNT = 5

    def __init__(self):
        print(f'Loading Training model')
        self.training_results_path = os.path.join(movie_lens_path, "user2user_training_results.pkl")
        self.training_results_index = 1
        self.debug_count = 0
        self.loaded_objs = []
        self.trained_users = set()

    @staticmethod
    def _load_checkpoint(path):
        """Return the list of objects pickled in `path`, or None if unusable.

        A checkpoint may be truncated when the kernel is killed mid-save; such
        files are reported and skipped instead of aborting the load.
        """
        objs = []
        try:
            # NOTE: pickle.load can execute arbitrary code; only load
            # checkpoints that were written by this notebook.
            with open(path, "rb") as f:
                while True:
                    try:
                        objs.append(pickle.load(f))
                    except EOFError:
                        break
        except (OSError, pickle.UnpicklingError, AttributeError, ImportError, IndexError) as err:
            print(f'Skipping unreadable training results {path}: {err}')
            return None
        if len(objs) < User2UserTraining._LEGACY_OBJ_COUNT:
            print(f'Skipping incomplete training results {path}')
            return None
        return objs

    def initialize_model(self):
        """Load the newest usable checkpoint, or create an empty model.

        Both alternating checkpoint files (PATH.1 and PATH.0) are considered,
        most recently modified first. Missing or damaged files are skipped, so
        a first run falls through to a fresh model instead of raising.
        """
        candidates = [
            self.training_results_path + "." + str(i)
            for i in range(self.training_results_index, -1, -1)
        ]
        candidates = [path for path in candidates if os.path.exists(path)]
        candidates.sort(key=os.path.getmtime, reverse=True)

        for training_results_path in candidates:
            loaded = self._load_checkpoint(training_results_path)
            if loaded is None:
                continue
            self.loaded_objs = loaded
            (self.weight_heapqs, self.means, self.stds,
             self.movie_devs, self.compared) = loaded[:self._LEGACY_OBJ_COUNT]
            # Older checkpoints have no sixth object: treat every user as not
            # yet trained; pairs in `compared` are still skipped during train().
            if len(loaded) > self._LEGACY_OBJ_COUNT:
                self.trained_users = loaded[self._LEGACY_OBJ_COUNT]
            else:
                self.trained_users = set()
            print(f'Training model loaded successfully, from {training_results_path}')
            return

        # User data are stored in lists, because the indices have been normalized.
        # Size by the largest id (not the count) so a user id missing from the
        # training split cannot push a later id out of range.
        num_users = max(user2movies, default=-1) + 1
        self.weight_heapqs = [[Weight(float('-inf'), "") for _ in range(NEIGHBOR_NUM)] for _ in range(num_users)]
        self.means = [float('-inf') for _ in range(num_users)]
        self.stds = [float('-inf') for _ in range(num_users)]
        # deviations of each user. A movie deviation is rating - mean
        self.movie_devs = [{} for _ in range(num_users)]
        # set that stores frozenset(user_i, user_i')
        self.compared = set()
        self.trained_users = set()
        print(f'No training model found, initialized model objects')

    def save_model(self):
        """
        Save the model, alternating between two files: PATH.0 and PATH.1.

        Alternating keeps the previous checkpoint intact if the process dies
        while the current one is being written.
        """
        if self.training_results_index % 2 == 0:
            self.training_results_index = 0
        training_results_path = self.training_results_path+"."+str(self.training_results_index)
        with open(training_results_path, "wb") as f:
            for obj in (self.weight_heapqs, self.means, self.stds,
                        self.movie_devs, self.compared, self.trained_users):
                pickle.dump(obj, f)
        self.training_results_index += 1

    def get_and_update_mean_std(self,user: int, movies: Set[int]):
        """Return (mean, std) of the user's ratings, computing them once.

        Args:
            user: user id (index into `means` / `stds`).
            movies: ids of the movies the user rated.
        """
        # Recompute only while either statistic is still the -inf sentinel.
        if self.means[user] == float('-inf') or self.stds[user] == float('-inf'):
            user_all_ratings = np.array([
                ratings[(int(user), int(movie_id))] for movie_id in movies])
            self.means[user] = np.mean(user_all_ratings)
            # standard deviation of all user ratings
            self.stds[user] = np.std(user_all_ratings)
        return self.means[user], self.stds[user]

    def get_and_update_movie_devs(self, mean: float, user: int, movies: Set[int]):
        """Return the user's movie deviations (rating - mean), computing them once.

        Args:
            mean: the user's mean rating.
            user: user id (index into `movie_devs`).
            movies: ids of the movies the user rated.
        """
        if not self.movie_devs[user]:
            self.movie_devs[user] = {
                movie: ratings[(int(user), int(movie))] - mean for movie in movies}
        return self.movie_devs[user]

    def has_been_trained(self, user: int):
        """
        Return True when the user's own training iteration has completed.

        `movie_devs` cannot serve as the marker: it is also filled for every
        neighbor a user is compared against, which would make train() skip
        almost all users after the first one.
        """
        return user in self.trained_users

    def train(self):
        """Compute pairwise weights for all users and fill the neighbor heaps.

        A checkpoint is written every 10 users and once more at the end.
        """
        start_time = time.perf_counter()
        for user, movies in user2movies.items():
            if self.debug_count % 10 == 0:
                self.save_model()
                print(f"User count: {self.debug_count}, time elapsed: {time.perf_counter() - start_time}s")
            self.debug_count += 1
            if self.has_been_trained(user):
                continue
            mean, std = self.get_and_update_mean_std(user, movies)
            movie_devs_dict = self.get_and_update_movie_devs(mean, user, movies)
            for movie_id in movies:
                for another_user in movies2user[movie_id]:
                    if another_user == user:
                        continue
                    # See if we have compared
                    key_compared = frozenset((user, another_user))
                    if key_compared in self.compared:
                        continue
                    self.compared.add(key_compared)
                    another_user_movies = user2movies[another_user]
                    common_movies = movies & another_user_movies

                    if len(common_movies) < COMMON_MOVIE_THRE:
                        continue

                    # calculate weight:
                    # HACK: the stds and deviations are taken over ALL movies of each user,
                    # not only the common ones; the numerator averages the products of the
                    # deviations over the common movies. This may be inaccurate when the
                    # spread on the common movies differs a lot from the overall spread.
                    another_mean, another_std = self.get_and_update_mean_std(another_user, another_user_movies)
                    another_movie_devs_dict = self.get_and_update_movie_devs(another_mean, another_user, another_user_movies)
                    denominator = another_std * std
                    if denominator == 0.0:
                        # One of the users gave identical ratings everywhere, so
                        # the weight is undefined; skip rather than reuse a stale value.
                        continue
                    numerator = sum(movie_devs_dict[m] * another_movie_devs_dict[m] for m in common_movies)/len(common_movies)
                    w_ij = numerator/denominator

                    # add to heapq: pushing then popping the smallest keeps the largest weights
                    heapq.heappushpop(self.weight_heapqs[user], Weight(w_ij, another_user))
                    heapq.heappushpop(self.weight_heapqs[another_user], Weight(w_ij, user))
            self.trained_users.add(user)
        # persist the users processed since the last periodic checkpoint
        self.save_model()

    def _predict(self, user_id: int, movie_id: int):
        """Predict one rating as the user's mean plus the weighted neighbor deviation.

        Only neighbors that actually rated `movie_id` contribute to both the
        weighted sum and the normalizing weight total. If no neighbor
        contributes, the user's mean is returned.
        """
        total_weights = 0.0
        total_weighted_dev = 0.0
        for neighbor in self.weight_heapqs[user_id]:
            if neighbor.value == float('-inf'):
                # unfilled heap slot (its name is "", not a user id)
                continue
            dev = self.movie_devs[neighbor.name].get(movie_id)
            if dev is None:
                continue
            total_weights += abs(neighbor.value)
            total_weighted_dev += neighbor.value * dev

        if total_weights != 0:
            return self.means[user_id] + total_weighted_dev / total_weights
        return self.means[user_id]

    def predict(self):
        """Fill `predicted_results[user][movie]` for every row of `train_df`."""
        self.predicted_results = [{} for _ in range(len(self.weight_heapqs))]
        for _, row in train_df.iterrows():
            user_id, movie_id = int(row.userId), int(row.movieId)
            predicted_rating = self._predict(user_id=user_id, movie_id=movie_id)
            self.predicted_results[user_id][movie_id] = predicted_rating

# Load the persisted user-user model (training is toggled off) and predict a
# rating for every row of the training frame.
t = User2UserTraining()
t.initialize_model()
# t.train()
t.predict()

# Flatten the per-user prediction dicts into (user, movie, prediction) triples;
# users without any prediction simply contribute nothing.
flat_predictions = (
    (user_id, movie_id, prediction)
    for user_id, per_user in enumerate(t.predicted_results)
    for movie_id, prediction in per_user.items()
)
for user_id, movie_id, prediction in flat_predictions:
    #TODO Remember to remove
    print(f'User {user_id} actual: {ratings[(user_id, movie_id)]}, predicted: {prediction}, avg {t.means[user_id]}') 

In [None]:
from collections import namedtuple
import numpy as np
from typing import Set
import heapq
import time

# number of neighbor slots kept per user
NEIGHBOR_NUM = 25
# threshold on number of common movies for two users to be considered neighbors
COMMON_MOVIE_THRE = 5
DEBUG_USER_THRE = 3
# Heap entry: `value` is the similarity weight, `name` is the neighbor's user id
Weight = namedtuple("Weight",["value", "name"])

class Item2ItemTraining:
    """Collaborative filtering with checkpointed training.

    NOTE(review): apart from the checkpoint file name, the logic below still
    compares USERS with each other (it iterates `user2movies`), exactly like
    the user-user model; an item-item variant would have to swap the roles of
    users and movies.

    The class reads the notebook-level globals `user2movies`, `movies2user`,
    `ratings`, `train_df` and `movie_lens_path`.

    Model state (all lists are indexed by user id):
        weight_heapqs: per-user min-heap of `Weight` entries, NEIGHBOR_NUM long.
        means / stds: per-user rating mean and standard deviation
            (float('-inf') means "not computed yet").
        movie_devs: per-user dict movie id -> (rating - user mean).
        compared: set of frozenset({user, other_user}) pairs already handled.
        trained_users: set of users whose training iteration has completed.
    """

    # number of objects written by checkpoints that predate `trained_users`
    _LEGACY_OBJ_COUNT = 5

    def __init__(self):
        print(f'Loading Training model')
        self.training_results_path = os.path.join(movie_lens_path, "item2item_training_results.pkl")
        self.training_results_index = 1
        self.debug_count = 0
        self.loaded_objs = []
        self.trained_users = set()

    @staticmethod
    def _load_checkpoint(path):
        """Return the list of objects pickled in `path`, or None if unusable.

        A checkpoint may be truncated when the kernel is killed mid-save; such
        files are reported and skipped instead of aborting the load.
        """
        objs = []
        try:
            # NOTE: pickle.load can execute arbitrary code; only load
            # checkpoints that were written by this notebook.
            with open(path, "rb") as f:
                while True:
                    try:
                        objs.append(pickle.load(f))
                    except EOFError:
                        break
        except (OSError, pickle.UnpicklingError, AttributeError, ImportError, IndexError) as err:
            print(f'Skipping unreadable training results {path}: {err}')
            return None
        if len(objs) < Item2ItemTraining._LEGACY_OBJ_COUNT:
            print(f'Skipping incomplete training results {path}')
            return None
        return objs

    def initialize_model(self):
        """Load the newest usable checkpoint, or create an empty model.

        Both alternating checkpoint files (PATH.1 and PATH.0) are considered,
        most recently modified first. Missing or damaged files are skipped, so
        a first run falls through to a fresh model instead of raising.
        """
        candidates = [
            self.training_results_path + "." + str(i)
            for i in range(self.training_results_index, -1, -1)
        ]
        candidates = [path for path in candidates if os.path.exists(path)]
        candidates.sort(key=os.path.getmtime, reverse=True)

        for training_results_path in candidates:
            loaded = self._load_checkpoint(training_results_path)
            if loaded is None:
                continue
            self.loaded_objs = loaded
            (self.weight_heapqs, self.means, self.stds,
             self.movie_devs, self.compared) = loaded[:self._LEGACY_OBJ_COUNT]
            # Older checkpoints have no sixth object: treat every user as not
            # yet trained; pairs in `compared` are still skipped during train().
            if len(loaded) > self._LEGACY_OBJ_COUNT:
                self.trained_users = loaded[self._LEGACY_OBJ_COUNT]
            else:
                self.trained_users = set()
            print(f'Training model loaded successfully, from {training_results_path}')
            return

        # User data are stored in lists, because the indices have been normalized.
        # Size by the largest id (not the count) so a user id missing from the
        # training split cannot push a later id out of range.
        num_users = max(user2movies, default=-1) + 1
        self.weight_heapqs = [[Weight(float('-inf'), "") for _ in range(NEIGHBOR_NUM)] for _ in range(num_users)]
        self.means = [float('-inf') for _ in range(num_users)]
        self.stds = [float('-inf') for _ in range(num_users)]
        # deviations of each user. A movie deviation is rating - mean
        self.movie_devs = [{} for _ in range(num_users)]
        # set that stores frozenset(user_i, user_i')
        self.compared = set()
        self.trained_users = set()
        print(f'No training model found, initialized model objects')

    def save_model(self):
        """
        Save the model, alternating between two files: PATH.0 and PATH.1.

        Alternating keeps the previous checkpoint intact if the process dies
        while the current one is being written.
        """
        if self.training_results_index % 2 == 0:
            self.training_results_index = 0
        training_results_path = self.training_results_path+"."+str(self.training_results_index)
        with open(training_results_path, "wb") as f:
            for obj in (self.weight_heapqs, self.means, self.stds,
                        self.movie_devs, self.compared, self.trained_users):
                pickle.dump(obj, f)
        self.training_results_index += 1

    def get_and_update_mean_std(self,user: int, movies: Set[int]):
        """Return (mean, std) of the user's ratings, computing them once.

        Args:
            user: user id (index into `means` / `stds`).
            movies: ids of the movies the user rated.
        """
        # Recompute only while either statistic is still the -inf sentinel.
        if self.means[user] == float('-inf') or self.stds[user] == float('-inf'):
            user_all_ratings = np.array([
                ratings[(int(user), int(movie_id))] for movie_id in movies])
            self.means[user] = np.mean(user_all_ratings)
            # standard deviation of all user ratings
            self.stds[user] = np.std(user_all_ratings)
        return self.means[user], self.stds[user]

    def get_and_update_movie_devs(self, mean: float, user: int, movies: Set[int]):
        """Return the user's movie deviations (rating - mean), computing them once.

        Args:
            mean: the user's mean rating.
            user: user id (index into `movie_devs`).
            movies: ids of the movies the user rated.
        """
        if not self.movie_devs[user]:
            self.movie_devs[user] = {
                movie: ratings[(int(user), int(movie))] - mean for movie in movies}
        return self.movie_devs[user]

    def has_been_trained(self, user: int):
        """
        Return True when the user's own training iteration has completed.

        `movie_devs` cannot serve as the marker: it is also filled for every
        neighbor a user is compared against, which would make train() skip
        almost all users after the first one.
        """
        return user in self.trained_users

    def train(self):
        """Compute pairwise weights for all users and fill the neighbor heaps.

        A checkpoint is written every 10 users and once more at the end.
        """
        start_time = time.perf_counter()
        for user, movies in user2movies.items():
            if self.debug_count % 10 == 0:
                self.save_model()
                print(f"User count: {self.debug_count}, time elapsed: {time.perf_counter() - start_time}s")
            self.debug_count += 1
            if self.has_been_trained(user):
                continue
            mean, std = self.get_and_update_mean_std(user, movies)
            movie_devs_dict = self.get_and_update_movie_devs(mean, user, movies)
            for movie_id in movies:
                for another_user in movies2user[movie_id]:
                    if another_user == user:
                        continue
                    # See if we have compared
                    key_compared = frozenset((user, another_user))
                    if key_compared in self.compared:
                        continue
                    self.compared.add(key_compared)
                    another_user_movies = user2movies[another_user]
                    common_movies = movies & another_user_movies

                    if len(common_movies) < COMMON_MOVIE_THRE:
                        continue

                    # calculate weight:
                    # HACK: the stds and deviations are taken over ALL movies of each user,
                    # not only the common ones; the numerator averages the products of the
                    # deviations over the common movies. This may be inaccurate when the
                    # spread on the common movies differs a lot from the overall spread.
                    another_mean, another_std = self.get_and_update_mean_std(another_user, another_user_movies)
                    another_movie_devs_dict = self.get_and_update_movie_devs(another_mean, another_user, another_user_movies)
                    denominator = another_std * std
                    if denominator == 0.0:
                        # One of the users gave identical ratings everywhere, so
                        # the weight is undefined; skip rather than reuse a stale value.
                        continue
                    numerator = sum(movie_devs_dict[m] * another_movie_devs_dict[m] for m in common_movies)/len(common_movies)
                    w_ij = numerator/denominator

                    # add to heapq: pushing then popping the smallest keeps the largest weights
                    heapq.heappushpop(self.weight_heapqs[user], Weight(w_ij, another_user))
                    heapq.heappushpop(self.weight_heapqs[another_user], Weight(w_ij, user))
            self.trained_users.add(user)
        # persist the users processed since the last periodic checkpoint
        self.save_model()

    def _predict(self, user_id: int, movie_id: int):
        """Predict one rating as the user's mean plus the weighted neighbor deviation.

        Only neighbors that actually rated `movie_id` contribute to both the
        weighted sum and the normalizing weight total. If no neighbor
        contributes, the user's mean is returned.
        """
        total_weights = 0.0
        total_weighted_dev = 0.0
        for neighbor in self.weight_heapqs[user_id]:
            if neighbor.value == float('-inf'):
                # unfilled heap slot (its name is "", not a user id)
                continue
            dev = self.movie_devs[neighbor.name].get(movie_id)
            if dev is None:
                continue
            total_weights += abs(neighbor.value)
            total_weighted_dev += neighbor.value * dev

        if total_weights != 0:
            return self.means[user_id] + total_weighted_dev / total_weights
        return self.means[user_id]

    def predict(self):
        """Fill `predicted_results[user][movie]` for every row of `train_df`."""
        self.predicted_results = [{} for _ in range(len(self.weight_heapqs))]
        for _, row in train_df.iterrows():
            user_id, movie_id = int(row.userId), int(row.movieId)
            predicted_rating = self._predict(user_id=user_id, movie_id=movie_id)
            self.predicted_results[user_id][movie_id] = predicted_rating

# NOTE(review): this cell sits below the Item2ItemTraining definition but
# instantiates User2UserTraining, so it loads the user-user checkpoint —
# confirm whether Item2ItemTraining was intended here.
t = User2UserTraining()  
t.initialize_model()
# t.train()
t.predict()

# Walk users in id order and report each stored prediction next to the actual
# rating and the user's mean; users with no predictions produce no output.
per_user_predictions = list(enumerate(t.predicted_results))
for user_id, predictions in per_user_predictions:
    if not predictions:
        continue
    for movie_id in predictions:
        prediction = predictions[movie_id]
        #TODO Remember to remove
        print(f'User {user_id} actual: {ratings[(user_id, movie_id)]}, predicted: {prediction}, avg {t.means[user_id]}') 


In [None]:
# TUTORIALS
# Scratch cell with small pandas / collections / heapq usage examples.
# accessing a column (both forms are equivalent)
df["rating"]
df.rating
# how to filter
df[df.rating == 3.0]
# You need to specify which axis to drop: 0 (rows, the default) or 1 (columns)
# df.drop("timestamp", axis=1)
# length of dataframe
print(f"length of dataframe: {len(df)}")
# access a row by integer position:
print(f"integer access: {df.iloc[2]}")


# subtract a number from the column:
df.userId - 1
# count common occurrences:
from collections import Counter
counter_set = Counter([1,2,2,3,3])
# most_common returns (value, count) pairs, not bare values
common_user_ids = counter_set.most_common(2)
print(f'Rico, common_user_ids: {common_user_ids}')
test_pd = pd.DataFrame([u for u,c in common_user_ids], columns=["test_column"])
# show the data frame that was just built (not the list it was built from)
print(f'Rico, pd dataframe common_user_ids: {test_pd}')

# select rows based on 2 conditions:
# isin needs the bare ids, so strip the counts from the (value, count) pairs first
common_ids = [u for u, c in common_user_ids]
df[df.userId.isin(common_ids) & df.movieId.isin(common_ids)]

# shuffle a list / dataframe
from sklearn.utils import shuffle
ls = shuffle([1,2,3,4,5])
#TODO Remember to remove
print(f'Rico: shuffled ls: {ls}')

# named tuple
from collections import namedtuple
Subscriber = namedtuple("some_name", ["addr", "name"])
sub = Subscriber("123 st", "Jo")
# _replace returns a new tuple; the original is left untouched
sub_new = sub._replace(addr="456st")
print("field: ", sub_new.addr) 
print(sub, sub_new)
print("converted to dictionary: ", sub_new._asdict())
# like tuple, after construction, cannot be modified
# sub.name = 30
# print(f'sub after change: {sub}')
#TODO Remember to remove
print(f'named_tuples can also be accessed through indexing: {sub[0]}')


# heapq
li = [Subscriber(0, "NULL")]*5
import heapq
# push then pop keeps the list length constant while discarding the smallest entry
for i in range(1, 7):
    heapq.heappush(li, Subscriber(i, "Rico"+str(i)))
    heapq.heappop(li)
# heappushpop does both steps at once: the smallest value is the one popped
heapq.heappushpop(li, Subscriber(1, "Rico1"))
print(f'{li}')

#TODO Remember to remove
print(f'smallest int: {float("-inf")}')

# dictionary length:
di = {1:2, 3:4, 5:6}
#TODO Remember to remove
print(f'len(di): {len(di)}')

    