In [1]:
import random
import re
from os import sep

import numpy as np
import pandas as pd
import requests
import tmdbsimple as tmdb
from scipy.stats import pearsonr
from surprise import SVD, Dataset, KNNBasic, Reader
from surprise.model_selection import cross_validate
from collections import defaultdict, Counter
from itertools import islice
import collections

In [2]:
tmdb.REQUESTS_SESSION = requests.Session()
tmdb.API_KEY = '1e11e7d4c5f3aad6e459fc0f63bfb0f5'
tmdb.REQUESTS_TIMEOUT = 5

In [3]:
users_df = pd.read_csv(
    "data/users.txt", names=["user_id", "age", "gender", "occupation"], sep="\t"
)

In [6]:
users_df[users_df.user_id == 32]['user_id'].tolist()

[32]

In [7]:
users_df

Unnamed: 0,user_id,age,gender,occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other
...,...,...,...,...
939,940,32,M,administrator
940,941,20,M,student
941,942,48,F,librarian
942,943,22,M,student


In [54]:
def get_user_type(gender, age, occupation):
    res = 0
    group1 = ['doctor', 'healthcare', 'entertainment']
    group2 = ['engineer', 'programmer', 'scientist', 'technician']
    group3 = ['artist', 'writer', 'librarian', 'homemaker']
    group4 = ['none', 'other', 'student', 'retired']
    group5 = ['executive', 'lawyer', 'administrator', 'salesman', 'marketing']

    if occupation in group1:
        if age < 23:
            res = 1 if gender == 'M' else 2
        elif 24 < age < 40:
            res = 3 if gender == 'M' else 4
        elif 41 < age < 60:
            res = 5 if gender == 'M' else 6
        else:
            res = 7 if gender == 'M' else 8
    elif occupation in group2:
        if age < 23:
            res = 9 if gender == 'M' else 9
        elif 24 < age < 40:
            res = 11 if gender == 'M' else 12
        elif 41 < age < 60:
            res = 13 if gender == 'M' else 14
        else:
            res = 15 if gender == 'M' else 16
    elif occupation in group3:
        if age < 23:
            res = 17 if gender == 'M' else 18
        elif 24 < age < 40:
            res = 19 if gender == 'M' else 20
        elif 41 < age < 60:
            res = 21 if gender == 'M' else 22
        else:
            res = 23 if gender == 'M' else 24
    elif occupation in group4:
        if age < 23:
            res = 25 if gender == 'M' else 26
        elif 24 < age < 40:
            res = 27 if gender == 'M' else 28
        elif 41 < age < 60:
            res = 29 if gender == 'M' else 30
        else:
            res = 31 if gender == 'M' else 32
    elif occupation in group5:
        if age < 23:
            res = 33 if gender == 'M' else 34
        elif 24 < age < 40:
            res = 35 if gender == 'M' else 36
        elif 41 < age < 60:
            res = 37 if gender == 'M' else 38
        else:
            res = 39 if gender == 'M' else 40

    return res - 1 if res > 9 else res

def movie_votes_demographic(grupos, ratings, movie_id, grupo):
    usuarios = [k for k, v in grupos.items() if v == grupo]
    return len(ratings[(ratings.user_id.isin(usuarios)) & (ratings.movie_id == movie_id)].index), ratings[(ratings.user_id.isin(usuarios)) & (ratings.movie_id == movie_id)].mean()['rating'], ratings[ratings.movie_id == movie_id].mean()['rating']

def genre_seen(films_df, ratings, user_id, genre_name):
    scores = []
    for film_id in ratings[ratings['user_id'] == user_id].movie_id.tolist():
        if films_df[films_df['movie_id'] == film_id][genre_name].tolist()[0] == 1:
            scores.append(ratings[(ratings['movie_id'] == film_id) & (ratings['user_id'] == user_id)]['rating'].tolist()[0])

    return sum(scores)/len(scores) if scores != [] else 0

def take(n, iterable):
    res = {}
    i = 0
    for k, v in iterable.items():
        if i >= n:
            break
        res[k] = v
        i += 1
        
    return res

def get_genres_score(users, generos, films_df, ratings):
    best_genres = {}
    for u in users:
        for i in generos['genre_name'].tolist():
            puntos = genre_seen(films_df, ratings, u, i)
            if puntos > 2.5:
                if i not in best_genres.keys():
                    best_genres[i] = puntos/len(users)
                else:
                    best_genres[i] += puntos/len(users)

    best_genres = {k: v for k, v in sorted(best_genres.items(), key=lambda item: item[1])}
    best_genres = take(6, best_genres)

    pref = []

    for genero in generos['genre_name'].tolist():
        add = False
        for k, v in best_genres.items():
            if genero == k:
                add = True
                break
        if add:
            pref.append(1)
        else:
            pref.append(0)

    return pref
    

def obtener_vecinos(preferencias, user, k=1):
    vecinos = [0]*k
    vecinos_score = [0]*k
    pref = preferencias[user]
    for i in range(0, preferencias.shape[0]):
        if i == user:
            continue
        pref_comp = np.matrix([pref, preferencias[i]])
        score = sum([1 if pref_comp[0,j] == pref_comp[1,j] else 0 for j in range(0, pref_comp.shape[1])])
        if score > min(vecinos_score):
            vecinos[vecinos_score.index(min(vecinos_score))] = i + 1
            vecinos_score[vecinos_score.index(min(vecinos_score))] = score
            
            
    return vecinos, vecinos_score

def get_dg_pref(dg, generos, films_df, ratings):
    res = []
    for x in set(dg.values()):
        u_dg = [k for k, v in dg.items() if v == x]
        scores = get_genres_score(u_dg, generos, films_df, ratings)
        res.append(scores)

    return np.matrix(res)
        
def get_recommendation(user, ratings, preferencias):
    vecino = obtener_vecinos(preferencias, 0, 1)
    pelis_user = ratings[ratings.user_id == user]['movie_id'].tolist()
    pelis_vecino = ratings[ratings.user_id == vecino[0][0]][['movie_id', 'rating']].sort_values(by=['rating'], ascending=False)['movie_id'].tolist()
    return [x for x in pelis_vecino if x not in pelis_user][:5]


In [8]:
grupos_demograficos = {}
for user_id in users_df.user_id.unique().tolist():
    user = users_df[users_df.user_id == user_id]
    user_demo_group = get_user_type(user.gender.tolist()[0], user.age.tolist()[0], user.occupation.tolist()[0])
    grupos_demograficos[user_id] = user_demo_group

In [8]:
ratings = pd.read_csv(
    "data/u1_base.txt", names=["user_id", "movie_id", "rating"], sep="\t"
)
generos = pd.read_csv("data/genre.txt", names=["genre_id", "genre_name"], sep="\t")
all_genre = generos.genre_name.values.tolist()
all_genre = ["movie_id"] + all_genre + ["title"]
films_df = pd.read_csv("data/items.txt",encoding="iso-8859-1" ,names=all_genre, sep="\t")

In [9]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
79995,943,1067,2
79996,943,1074,4
79997,943,1188,3
79998,943,1228,3


In [94]:
preferencias = []
for u in ratings.user_id.unique().tolist():
    best_genres = []
    for i in generos['genre_name'].tolist():
        puntos = genre_seen(films_df, ratings, u, i)
        if puntos > 2.5:
            best_genres.append((i, puntos))

    best_genres.sort(key=lambda a: a[1], reverse=True)
    best_genres = best_genres[:6]
    pref = []
    
    for genero in generos['genre_name'].tolist():
        add = False
        for bg in best_genres:
            if genero == bg[0]:
                add = True
                break
        if add:
            pref.append(1)
        else:
            pref.append(0)

    preferencias.append(pref)

In [15]:
pref_hyb = np.load('data/preferencias_hibrido.npz')['a']
pref_dg = np.load('data/preferencias_demografico.npz')['a']
preferencias = np.load('data/preferencias.npz')['a']

In [59]:
get_recommendation(0, ratings, preferencias)

[249, 50, 134, 302, 246]

In [8]:
obtener_vecinos(preferencias, 100, 3)

([698, 834, 45], [19, 19, 17])

In [37]:
pref_dg = get_dg_pref(grupos_demograficos, generos, films_df, ratings)

In [76]:
pref_hyb = []
for u in ratings.user_id.unique().tolist():
    pref = np.round(np.sum(np.matrix([preferencias['a'][u-1], pref_dg['a'][grupos_demograficos[u]-1]]), axis=0)/2, 0).tolist()[0]
    pref_hyb.append(pref)

pref_hyb = np.matrix(pref_hyb)

In [None]:
puntuaciones={}
for user_id in users_df.user_id.unique().tolist():
    puntuaciones[user_id] = {}
    print(user_id)
    dem_group = grupos_demograficos[user_id]
    for movie_id in films_df.movie_id.unique().tolist():
        v, r, c = movie_votes_demographic(grupos_demograficos, ratings, movie_id, dem_group)
        puntuaciones[user_id][movie_id] = v/((v+5)*r) + ((5/(5+v)) * c)

In [None]:
from recomendador import Recomendador
Recomendador()