In [93]:
# Recomendação - Rosie
## Ensinando
### Não sei

In [94]:
import pandas as pd
import os
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import argparse
import gc
import time
import numpy as np
from fuzzywuzzy import fuzz

In [95]:
# Preparação de dados

def _prep_data(df_movies,df_ratings,movie_rating_thres, user_rating_thres):
    df_movies = pd.read_csv('movies.csv',usecols=['movieId', 'title'],dtype={'movieId': 'int32', 'title': 'str'})
    df_ratings = pd.read_csv('ratings.csv',usecols=['userId', 'movieId', 'rating'],dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
    # conta o numero de avaliações do filme
    df_movies_cnt = pd.DataFrame(df_ratings.groupby('movieId').size(),columns=['count'])
    


#          defina o limite de frequência de classificação para filtrar filmes menos conhecidos e
#          usuários menos ativos
#          Parâmetros
#          ----------
#          movie_rating_thres: int, número mínimo de classificações recebidas pelos usuários
#          user_rating_thres: int, número mínimo de classificações que um usuário fornece



    # noqa
    popular_movies = list(set(df_movies_cnt.query('count >= @movie_rating_thres').index))          
    movies_filter = df_ratings.movieId.isin(popular_movies).values

    df_users_cnt = pd.DataFrame(df_ratings.groupby('userId').size(),columns=['count'])
    
    # noqa
    active_users = list(set(df_users_cnt.query('count >= @user_rating_thres').index))  
    
    users_filter = df_ratings.userId.isin(active_users).values

    df_ratings_filtered = df_ratings[movies_filter & users_filter]

    # pivot and create movie-user matrix
    movie_user_mat = df_ratings_filtered.pivot(index='movieId', columns='userId', values='rating').fillna(0)
    # create mapper from movie title to index


    hashmap = {
        movie: 
        i for i, movie in enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
        }
        
    # for i,movie in enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title)):
    #     if movie == 'Toy Story (1995)':
    #         print(i,movie)
        
    # print(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
    # transform matrix to scipy sparse matrix
    movie_user_mat_sparse = csr_matrix(movie_user_mat.values)    

    # clean up
    del df_movies, df_movies_cnt, df_users_cnt
    del df_ratings, df_ratings_filtered, movie_user_mat
    gc.collect()

    return movie_user_mat_sparse, hashmap



In [96]:
# Inferência
def _inference(distances,indices):

    print('Sistema de recomendação começa a fazer inferência')
    print('......\n')
    t0 = time.time()

    # get list of raw idx of recommendations
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]

    print('O meu sistema {: .2f} s fez inferência \n\
              '.format(time.time() - t0))

    return raw_recommends


# Recomendação
def make_recommendations(hashmap,raw_recommends):
    filmesRecomendados = []
    
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    
    for i, (idx, dist) in enumerate(raw_recommends):      
        print('{0}: {1}, with distance '
         'of {2}'.format(i+1, reverse_hashmap[idx], dist))

        filmesRecomendados.append(reverse_hashmap[idx])

    return filmesRecomendados

In [97]:
# KNN
def Knn(data,idx,n_recommendations):
    model = NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine')
    model.fit(data)
    distances, indices = model.kneighbors(data[idx],n_neighbors=n_recommendations+1)
    return [distances,indices]

# Fuzzy , se o filme existe
def _fuzzy_matching(hashmap, fav_movie):
    match_tuple = []
    # get match
    for title, idx in hashmap.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
        # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
    else:
        return match_tuple[0][1]
        


## Knn Recommender

In [117]:
# args = parse_args()

PATH = os.getcwd()

movies = pd.read_csv(PATH+'/movies.csv')
ratings = pd.read_csv(PATH+'/ratings.csv')
tags = pd.read_csv(PATH+'/tags.csv')

top_n = 20 # top n movie recommendations
movie_name = 'Nixon (1995)'

movie_user_mat_sparse, hashmap = _prep_data(movies,ratings,1,1)
#hashmap
#m
idx = _fuzzy_matching(hashmap,movie_name)
idx
distances, indices = Knn(movie_user_mat_sparse,idx,top_n)

raw_recommends = _inference(distances,indices)
#raw_recommends
movies_recommender = make_recommendations(hashmap,raw_recommends)
#movies_recommender




Sistema de recomendação começa a fazer inferência
......

O meu sistema  0.00 s fez inferência 
              
1: Single Girl, A (Fille seule, La) (1995), with distance of 0.697386264801
2: Last Klezmer: Leopold Kozlowski, His Life and Music, The (1994), with distance of 0.697386264801
3: Hamlet (1948), with distance of 0.697386205196
4: Cobb (1994), with distance of 0.688944339752
5: Very Brady Sequel, A (1996), with distance of 0.687203228474
6: Restoration (1995), with distance of 0.686795949936
7: Screamers (1995), with distance of 0.686735868454
8: Mr. Holland's Opus (1995), with distance of 0.686266899109
9: Up Close and Personal (1996), with distance of 0.679286241531
10: Eraser (1996), with distance of 0.678920030594
11: Mother (1996), with distance of 0.671063065529
12: Sense and Sensibility (1995), with distance of 0.67001247406
13: Anne Frank Remembered (1995), with distance of 0.658816874027
14: Time to Kill, A (1996), with distance of 0.655674099922
15: Crucible, The (1996

## Knn Recommender Tag

In [99]:
#filmes_tags = pd.DataFrame(data=filmes_tags,columns=['movieId','tags'])
def _prep_data2(movies_recommender,movie):
    movies_recommender.append(movie)
    
    df_tags = pd.read_csv('tags.csv',usecols=['movieId', 'tag'],dtype={'movieId': 'int32', 'tag': 'str'})
    df_movies = pd.read_csv('movies.csv',usecols=['movieId', 'title'],dtype={'movieId': 'int32', 'title': 'str'})
    
    filmes_tags = []
    
    for movie in movies_recommender:
        idfilme = df_movies[df_movies['title'] == movie].loc[:, 'movieId'].to_numpy()
        # print('tags',df_tags[df_tags['movieId']==idfilme[0]])
        for ii in df_tags[df_tags['movieId'] == idfilme[0]].to_numpy():
            filmes_tags.append(ii)
    return filmes_tags


In [100]:


def org(filmes_tags):
    ft = np.array(filmes_tags)
    z  = np.zeros((len(np.unique(ft[:, 0])), len(np.unique(ft[:, 1]))))

    df = pd.DataFrame(data=z, index=np.unique(ft[:, 0]), columns=np.unique(ft[:, 1]))
    
    return df


# Inferência
def inferenceTags(distances,indices):

    print('Sistema de recomendação começa a fazer inferência')
    print('......\n')
    t0 = time.time()


    raw_recommend = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
   
    return raw_recommend




In [101]:
 # Fuzzy , se o filme existe
def encontrarId(hashmap, fav_movie):  
     
    for movieId, idx in hashmap.items():
        if(movieId==fav_movie):
            print(idx,movieId)
            return idx               
# KNN
def KnnTag(data,idx,n_recommendations):
    model = NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine')
    model.fit(data)
    distances, indices = model.kneighbors(data[idx],n_neighbors=n_recommendations+1)
    return [distances,indices]


In [108]:
def contem(filmes_tags,df):
    filmes_tags = pd.DataFrame(data=filmes_tags,columns=['movieId','tags'])
    t = filmes_tags
    hashmapTags=[]
    
    for i in t.index:       
        try:
            df.loc[t.loc[i]['movieId'],t.loc[i]['tags']] += 1
            #print("tem",df.loc[t.loc[i]['movieId'],t.loc[i]['tags']])
        except:            
            print("nao tem", df.loc[t.loc[i]['movieId'],t.loc[i]['tags']])                    




    a= filmes_tags.groupby('movieId').apply(list)
    hashmapTags = {
        movieId: 
        i for i, movieId in enumerate(a.keys())
        }     

    # hashmapTags = {
    #     movieId: 
    #     i for i, movieId in enumerate(list(filmes_tags.movieId),0)
    #     }     
        
    matriz_tags_filmes = csr_matrix(df.values)            
    
    return hashmapTags,df,matriz_tags_filmes



In [103]:
# Recomendação
def lista_recomendacao(hashmap,raw_recommend,df_movies):
    filmesRecomendados = []
    
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    # print(reverse_hashmap)
    for i, (idx, dist) in enumerate(raw_recommend):
        name_filme= df_movies.loc[df_movies['movieId'] == reverse_hashmap[idx]]['title']      
        print('{0}: {1}, distancia '
          '{2}'.format(i+1,str(name_filme), dist))        

        #filmesRecomendados.append(reverse_hashmap[idx])

   # return filmesRecomendados

In [118]:
df_movies = pd.read_csv('movies.csv',usecols=['movieId', 'title'],dtype={'movieId': 'int32', 'title': 'str'})
#movie_name = 'Iron Man'
top_n_tag = 5 # top n movie recommendations
filmes_tags = _prep_data2(movies_recommender,movie_name)

df = org(filmes_tags)

idMovie = df_movies.loc[df_movies['title'] == movie_name]['movieId']

filmes_tags = pd.DataFrame(data=filmes_tags,columns=['movieId','tags'])

hashmapTags,dd, matriz_tags_filmes = contem(filmes_tags,df)
hashmapTags
idx=encontrarId(hashmapTags,float(idMovie))

distances, indices = KnnTag(matriz_tags_filmes,idx,top_n_tag)
distances, indices

raw_recommend = inferenceTags(distances,indices)
raw_recommend
lista_recomendacao(hashmapTags,raw_recommend,df_movies)
#movies_recommender

(0, 14L)
Sistema de recomendação começa a fazer inferência
......

1: 533    Primal Fear (1996)
Name: title, dtype: object, distancia 0.958481733783
2: 33    Dead Man Walking (1995)
Name: title, dtype: object, distancia 0.943863910858
3: 639    Very Brady Sequel, A (1996)
Name: title, dtype: object, distancia 0.942833804952
4: 39    Restoration (1995)
Name: title, dtype: object, distancia 0.914250707429
5: 88    City Hall (1996)
Name: title, dtype: object, distancia 0.87035925529
