# First attempt at learning

In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
import os
import random
import math
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
import pymc3 as pm
import theano
theano.config.compute_test_value = 'raise'
%matplotlib inline

# Directory holding the input CSV files (relative path).
SELECTED_DATA_DIR = "../selected-data/"
# File name of the movie table read into `movies`.
MOVIES_FILE = "best_movie_ratings_features_engineered.csv"
# File name of the per-user ratings table read into `users`.
USERS_FILE = "users_ratings.csv"

## Read data

In [None]:
# Load the movie table, using the first CSV column as the index.
movies = pd.read_csv(SELECTED_DATA_DIR + MOVIES_FILE, index_col=0)
# Divide ratings by 10 (presumably a 0-10 scale brought down to 0-1 -- TODO confirm).
movies.rating = movies.rating/10
# Show one random row as a quick sanity check.
movies.sample()

In [None]:
# Load the user ratings table, using the first CSV column as the index.
users = pd.read_csv(SELECTED_DATA_DIR + USERS_FILE, index_col=0)
# Divide ratings by 10, the same rescaling that is applied to `movies.rating`.
users.rating = users.rating/10
# Show one random row as a quick sanity check.
users.sample()

## Learning

In [None]:
def utility(user_features, movie_features, epoch, s=1):
    """Score a movie for a user, scaled by a saturating function of ``epoch``.

    The raw score is the dot product of the two feature vectors. It is
    multiplied by ``1 - exp(-epoch / s)``, which is 0 when ``epoch`` is 0 and
    tends towards 1 as ``epoch`` grows; ``s`` controls how quickly it
    saturates.
    """
    affinity = user_features.dot(movie_features)
    saturation = 1 - math.exp(-epoch / s)
    return affinity * saturation

def get_movie_features(movie):
    """Return the feature values of a movie.

    The features are the last 50 entries of a movie row.

    Args:
        movie: a ``pd.Series`` holding one movie row, or a ``pd.DataFrame``,
            in which case only its first row is used.

    Returns:
        pd.Series: the last 50 values of the selected row.

    Raises:
        TypeError: if ``movie`` is neither a Series nor a DataFrame.
        IndexError: if ``movie`` is a DataFrame without rows.
    """
    if isinstance(movie, pd.DataFrame):
        # Take the first row by position. A label lookup on the first index
        # label returns a DataFrame again when that label is duplicated,
        # which made the previous recursive implementation loop forever.
        movie = movie.iloc[0]
    elif not isinstance(movie, pd.Series):
        raise TypeError("{} should be a Series or DataFrame".format(movie))
    # Explicit positional slice so integer-labelled rows behave the same way.
    return movie.iloc[-50:]
    
def best_recommandation(user_features, movies, epoch):
    """Pick the row(s) of ``movies`` with the highest utility.

    Every movie is scored with ``utility`` using ``epoch - last_t`` as its
    time argument, so ``movies`` must carry a ``last_t`` column. The result
    is a DataFrame holding every row whose index label equals the label of
    the top-scoring movie.
    """
    scores = np.fromiter(
        (
            utility(user_features, get_movie_features(row), epoch - row.last_t)
            for _, row in movies.iterrows()
        ),
        dtype=float,
        count=movies.shape[0],
    )
    top_label = movies.index[scores.argmax()]
    return movies[movies.index == top_label]

def all_recommandation(user_features, movies):
    """Score every movie and return them ordered by ascending utility.

    Works on a copy of ``movies``; the scores are stored in a new
    ``utilities`` column. Each score is computed with a fixed epoch of 1000.
    """
    def score_row(row):
        return utility(user_features, get_movie_features(row), 1000)

    ranked = movies.copy()
    ranked['utilities'] = ranked.apply(score_row, axis=1)
    return ranked.sort_values(by="utilities")


def greedy_choice(user_features, movies, epoch):
    """Epsilon-greedy pick with an exploration rate that decays over time.

    With probability ``1 / sqrt(epoch + 1)`` a random movie is returned;
    otherwise the movie with the best utility at ``epoch`` is returned.
    """
    explore_probability = 1 / math.sqrt(epoch + 1)
    exploit = random.random() > explore_probability
    if not exploit:
        return movies.sample()
    return best_recommandation(user_features, movies, epoch)

def greedy_choice_no_t(user_features, movies, epsilon=0.5):
    """Epsilon-greedy pick with a fixed exploration rate and no time decay.

    Args:
        user_features: the user's preference vector.
        movies: DataFrame of candidate movies; no ``last_t`` column is needed.
        epsilon: probability of returning a random movie instead of the best.

    Returns:
        pd.DataFrame: a random row with probability ``epsilon``, otherwise
        the row(s) whose index label matches the highest-utility movie.
    """
    if random.random() <= epsilon:  # explore
        return movies.sample()

    # Exploit. The previous code called best_recommandation() without its
    # required `epoch` argument, which raised TypeError. Rank the movies here
    # instead, with a time-independent utility: a large fixed epoch (the same
    # value all_recommandation uses) saturates the time factor to 1.
    saturated_epoch = 1000
    utilities = np.array([
        utility(user_features, get_movie_features(movie), saturated_epoch)
        for _, movie in movies.iterrows()
    ])
    return movies[movies.index == movies.index[utilities.argmax()]]
        
def iterative_mean(old, new, t):
    """Update a running mean with its ``t``-th sample.

    ``old`` is the mean of the previous ``t - 1`` samples and ``new`` is the
    sample being added; the result weights them by ``(t - 1) / t`` and
    ``1 / t`` respectively.
    """
    weight_old = (t - 1) / t
    weight_new = 1 / t
    return weight_old * old + weight_new * new
    
def update_features(user_features, movie_features, rating, t):
    """Fold one rated movie into the user's preference vector.

    The movie features are scaled by ``rating`` and averaged into
    ``user_features`` through ``iterative_mean``, using ``t + 1`` as the
    sample count.
    """
    rated_features = movie_features * rating
    sample_count = t + 1
    return iterative_mean(user_features, rated_features, sample_count)

def train_user(user, movies, n_epochs=1000):
    """Learn a user's preference vector by repeatedly recommending movies.

    At every step a movie is chosen with ``greedy_choice``, the user's
    rating for it is looked up, and the preference vector is updated with
    ``update_features``. The step at which each movie was last recommended
    is tracked in a ``last_t`` column on a private copy of ``movies``.

    Args:
        user: DataFrame of one user's ratings, indexed like ``movies`` and
            holding a ``rating`` column.
        movies: DataFrame of candidate movies; it is not modified.
        n_epochs: number of recommendation steps to run (default 1000).

    Returns:
        The learned preference vector.
    """
    # NOTE(review): assumes exactly two non-feature columns, so that this
    # length matches what get_movie_features returns -- confirm.
    user_features = np.zeros(movies.shape[1] - 2)
    movies = movies.copy()
    # Inserted at position 0 so the trailing feature columns stay last.
    movies.insert(0, 'last_t', np.ones(movies.shape[0]).astype(np.int64))
    for t in tqdm(range(n_epochs)):
        recommandation = greedy_choice(user_features, movies, t)
        recommandation_features = get_movie_features(recommandation)
        # DataFrame.get_value was removed in pandas 1.0; .at is the
        # equivalent scalar lookup by row label and column name.
        user_rating = user.at[recommandation.index[0], "rating"]
        user_features = update_features(user_features, recommandation_features, user_rating, t)
        # Remember when this movie was last shown.
        movies.loc[movies.index.isin(recommandation.index), 'last_t'] = t
    return user_features

def test_user(user, user_features, movies):
    """Evaluate learned preferences against the user's actual ratings.

    Prints the Spearman rank correlation between predicted utilities and
    actual ratings, and returns the root mean squared error between them.

    Args:
        user: DataFrame of one user's ratings with a ``rating`` column.
        user_features: preference vector to evaluate.
        movies: DataFrame of movies to score.

    Returns:
        float: RMSE between predicted utilities and the user's ratings.
    """
    # Both series are sorted by index label so they line up position by
    # position. NOTE(review): this relies on each movie appearing exactly
    # once in both tables -- confirm.
    predicted = all_recommandation(user_features, movies).utilities.sort_index()
    # The former sort_values(by="rating") was immediately overridden by
    # sort_index(), so it is omitted.
    actual = user[user.index.isin(movies.index)].rating.sort_index()
    print(spearmanr(predicted.rank(), actual.rank()))
    # Series.as_matrix was removed in pandas 1.0; .values yields the same
    # NumPy array.
    return math.sqrt(mean_squared_error(predicted.values, actual.values))

## One user

### Selection

In [None]:
# Draw one random row of the ratings table and keep every row sharing its
# user id. Because a row is sampled (not a distinct id), users with more
# ratings are more likely to be picked.
user = users[users.user.isin(users.user.sample())]
user.shape

In [None]:
# get only movies that this user rated
# (rows of `movies` whose index label also appears in the user's index)
movies_user = movies[movies.index.isin(user.index)]
movies_user.shape

### Split Train/Test

In [None]:
# Random split: each movie independently lands in `train` with probability
# 0.8, so the split sizes vary from run to run.
msk = np.random.rand(movies_user.shape[0]) < 0.8
train = movies_user.loc[msk]
test = movies_user.loc[~msk]

In [None]:
# Learn this user's preference vector on the training movies and display it.
user_features = train_user(user, train)
user_features

### Score

In [None]:
# Score on the held-out movies: test_user prints the Spearman correlation
# and returns the RMSE, which is displayed here.
error = test_user(user, user_features, test)
error

## Multiple users

In [None]:
# Repeat the single-user experiment for N_USER randomly drawn users and
# collect each user's test RMSE.
N_USER=10
res_score = []
for i in tqdm(range(N_USER)):
    # Same steps as in the single-user cells: pick a user, keep the movies
    # they rated, split roughly 80/20, train, then score.
    user = users[users.user.isin(users.user.sample())]
    movies_user = movies[movies.index.isin(user.index)]
    msk = np.random.rand(movies_user.shape[0]) < 0.8
    train = movies_user.loc[msk]
    test = movies_user.loc[~msk]
    user_features = train_user(user, train)
    error = test_user(user, user_features, test)
    res_score.append(error)

# Mean RMSE over the sampled users.
sum(res_score)/len(res_score)    

## Try Bayesian inference

In [None]:
# Toy data for the Bayesian model: two random vectors of length `size`,
# drawn uniformly from [0, 1).
size = 10
theta_prime = np.random.rand(size)
x = np.random.rand(size)

# Hyper-parameters consumed by the model cell below:
# c0 scales the covariance of `theta`; d0 and e0 are passed to the Gamma
# prior on `s`; f0 and g0 are passed to the InverseGamma prior on `sigma`.
c0 = 10
d0 = 3
e0 = 0.01
f0 = 0.001
g0 = 0.001

# Time value plugged into the (1 - exp(-t/s)) factor of the model.
t = 1

# Identity matrix used as the base of the covariance of `theta`.
I = np.eye(size)
print(theta_prime)
print(x)

In [None]:
# Build the PyMC3 model and sample from it with a Metropolis step.
with pm.Model():
    # Priors: `s` is the time-scale of the saturation factor, `sigma` the
    # noise level, `theta` a vector with covariance c0 * sigma * I.
    s = pm.Gamma('s', d0, e0)
    sigma = pm.InverseGamma('sigma', f0, g0)
    theta = pm.MvNormal('theta', mu=0.5, cov=c0 * sigma * I)

    # Expected value of outcome
    # NOTE(review): this uses the fixed array `theta_prime`, not the random
    # variable `theta`, so `theta` does not enter the likelihood -- confirm
    # whether that is intended.
    mu = theta_prime.dot(x) * (1 - np.exp(-t/s))

    # Likelihood (sampling distribution) of observations
    # NOTE(review): the observed data passed here is `theta_prime` itself,
    # not a separate set of ratings -- confirm.
    rating = pm.Normal('rating', mu=mu, sd=sigma, observed=theta_prime)
    
    # Draw 1000 samples using the Metropolis step method.
    step = pm.Metropolis()
    trace = pm.sample(1000, step=step)

In [None]:
# Summarise the sampled trace, then draw its trace plot.
pm.summary(trace)
pm.traceplot(trace)

In [None]:
# Draw a random value from the distribution attached to `rating`.
rating.distribution.random()

In [None]:
# Draw a random value from the distribution attached to `theta`.
theta.distribution.random()