# Description Based Recommender

In [1]:
import pandas as pd
import numpy as np

- Representar texto como quantidades matemáticas
    - CountVectorizer 
    - TfidfVectorizer
- Métricas de similaridade
    - Exemplos: Distância Euclidiana, Correlação de Pearson, Similaridade de Cosseno.
     <img src="../../imgs/cosseno.png">

In [2]:
#lê  dados limpos
df = pd.read_csv('../../data/metadata_clean1.csv')
df.head(3)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995


In [3]:
#Lê dados originais
orig_df = pd.read_csv('../../data/movies_metadata.csv', low_memory=False).head(20000)

#adicionar features uteis ao dados limpos
df['overview'], df['id'] = orig_df['overview'], orig_df['id']

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Definir um TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN com string vazia
df['overview'] = df['overview'].fillna('')

#Construir matrix TFIDF
tfidf_matrix = tfidf.fit_transform(df['overview'])

#shape da tfidf_matrix
tfidf_matrix.shape

(20000, 47487)

In [5]:
from sklearn.metrics.pairwise import linear_kernel

# Computar a similaridade de coseno
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
#Construir um mapeamento dos indices e titulos de filmes
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [9]:
# Função para recomendar filmes baseado no título
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # Obtem o indice do filme do titulo de entrada
    idx = indices[title]

    # obter similaridade de todos os filmes com a entrada
    sim_scores = list(enumerate(cosine_sim[idx]))

    # ordenar baseado no score da similaridade de coseno
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # obter scores dos 10 filmes mais similares
    sim_scores = sim_scores[1:11]

    # obtém os índices
    movie_indices = [i[0] for i in sim_scores]

    # retorna os dez mais similares
    return df['title'].iloc[movie_indices]

In [10]:
#obtém recommendações
content_recommender('The Lion King')

9353                         The Lion King 1½
9115           The Lion King 2: Simba's Pride
17041                            African Cats
6094                                Born Free
3203                         The Waiting Game
14402    Michael Jackson: Life of a Superstar
6574            Once Upon a Time in China III
3293                                 The Bear
2779                    Napoleon and Samantha
11507                     David and Bathsheba
Name: title, dtype: object

# Metadata Based Recommender

In [11]:
# Ler arquivos de informações de pessoal do file e informações de sub-categorias
cred_df = pd.read_csv('../../data/credits.csv')
key_df = pd.read_csv('../../data/keywords.csv')

In [12]:
cred_df.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


In [13]:
key_df.head(3)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [14]:
# Converter IDs não-inteiros para NaN
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

df['id'] = df['id'].apply(clean_ids)

df = df[df['id'].notnull()]

In [15]:
# Converter IDs para inteiro
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

# Merge dos dados
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')

df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df['id'].astype('int')


Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [16]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [19]:
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [20]:
# Extrair nome do diretor
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

In [21]:
#Nova feature com nome do diretor
df['director'] = df['crew'].apply(get_director)

df['director'].head(3)

0    John Lasseter
1     Joe Johnston
2    Howard Deutch
Name: director, dtype: object

In [22]:
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Retorna no maximo 3 pessoas do cast
        if len(names) > 3:
            names = names[:3]
        return names
    return []

In [23]:
#Apply a generate_list function ao cast e keywords
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)

In [26]:
df['keywords'].head(3)

0                                 [jealousy, toy, boy]
1    [board game, disappearance, based on children'...
2         [fishing, best friend, duringcreditsstinger]
Name: keywords, dtype: object

In [27]:
#Considera no maximo 3 tipos
df['genres'] = df['genres'].apply(lambda x: x[:3])
df['genres'].head(3)

0     [animation, comedy, family]
1    [adventure, fantasy, family]
2               [romance, comedy]
Name: genres, dtype: object

In [28]:
df[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[animation, comedy, family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[adventure, fantasy, family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[romance, comedy]"


In [29]:
# Função remove espaços and converte para lowercase
def sanitize(x):
    if isinstance(x, list):
        #Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)

In [30]:
#junta todas strings como um texto
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

df['soup'] = df.apply(create_soup, axis=1)

df.iloc[0]['soup']

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

#Computa a similaridade cosseno(equivalente ao produto  to dot product for tf-idf vectors)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [33]:
# Reseta indice e construi mapeamento
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])

In [34]:
content_recommender('The Lion King', cosine_sim2, df, indices2)

15147    Spiderman: The Ultimate Villain Showdown
16543                   Cirque du Soleil: Varekai
3328                            Creature Comforts
3489                                 Time Masters
3716                Thomas and the Magic Railroad
7058                                Teacher's Pet
1014                          So Dear to My Heart
2779                                   Thumbelina
4933                        The Flight of Dragons
16915                    The House of Small Cubes
Name: title, dtype: object