In [17]:
import numpy as np
import pandas as pd
import re

In [18]:
movies_df = pd.read_csv("../movies.csv")
# ratings_df = pd.read_csv("../ratings.csv")
links_df = pd.read_csv("../links.csv")
tags_df = pd.read_csv("../tags.csv")
genome_tags = pd.read_csv("../genome-tags.csv")
genome_score = pd.read_csv("../genome-scores.csv")

In [19]:
genome_tags = genome_tags[~genome_tags['tag'].isin(['original', 'sequel', 'good sequel','sequels'])]

In [20]:
merged = pd.merge(genome_score, genome_tags, on='tagId')

# Group the data by 'movieId' and apply a lambda function to get the top 2 tags for each movie
top_tags = merged.groupby('movieId').apply(lambda x: x.nlargest(5, 'relevance')['tag'].tolist())

# Convert the result to a dataframe
top_tags_df = top_tags.reset_index(name='top_relevance')

In [21]:
movies = pd.merge(top_tags_df, movies_df[['movieId', 'title','genres']], on='movieId')
movies = movies.fillna(' ')

In [22]:
movies['genres'] = movies['genres'].apply(lambda x: re.sub(r'\|', ' ', x).lower())
movies['top_relevance'] = movies['top_relevance'].apply(lambda x: ' '.join(x).replace('-',' '))

In [90]:
movies[movies['title'] == 'Zombieland: Double Tap (2019)']

Unnamed: 0,movieId,top_relevance,title,genres,combine_relevant,clean_title
13811,205072,dumb but funny friendship runaway great movie ...,Zombieland: Double Tap (2019),action comedy horror,dumb but funny friendship runaway great movie ...,Zombieland Double Tap 2019


In [24]:
movies['combine_relevant'] = movies['top_relevance']+" "+movies['genres']

In [25]:
def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [28]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

counter_vec = CountVectorizer(stop_words='english',max_features=1500)
counter_vec.fit(movies['combine_relevant'])

gerne_vec_tags = counter_vec.transform(movies['top_relevance'])*0.3 # change the importance
gerne_vec_geners = counter_vec.transform(movies['genres'])*1 

gerne_vec = gerne_vec_tags+gerne_vec_geners

cos_similar = cosine_similarity(gerne_vec,gerne_vec)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [56]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results['title'].iloc[0]

In [54]:
movie_title_series = pd.Series(movies.index,movies['title'])

def get_recommend(title,cosine_sim = cos_similar):
    title = search(str(title))
    movie_name = movie_title_series[title]
    
    sim_scores = list(enumerate(cosine_sim[movie_name]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]  

In [58]:
results = search(str("Avengers"))
results

'Avengers: Age of Ultron (2015)'

In [61]:
get_recommend("Toy story") 

8583                                 Ant Bully, The (2006)
10290    Shrek Forever After (a.k.a. Shrek: The Final C...
4407                                 Monsters, Inc. (2001)
11932            Toy Story Toons: Hawaiian Vacation (2011)
8717                                     Happy Feet (2006)
10680                                        Cars 2 (2011)
0                                         Toy Story (1995)
2807                                    Toy Story 2 (1999)
8939                                Shrek the Third (2007)
11933                    Toy Story Toons: Small Fry (2011)
Name: title, dtype: object

In [None]:
movies[movies['title']=='Father of the Bride Part II (1995)']

Unnamed: 0,movieId,top_relevance,title,genres,combine_relevant
4,5,father daughter relationship pregnancy midlife...,Father of the Bride Part II (1995),comedy,father daughter relationship pregnancy midlife...


In [86]:
user_list_movie=['Zombieland: Double Tap (2019)','Zombieland (2009)','Shaun of the Dead (2004)','Cockneys vs Zombies (2012)']

In [88]:
def get_recommend_by_user_list(user_list):
    feature_names = list(counter_vec.vocabulary_.keys())
    user_vec = np.zeros((1, len(feature_names)))
    for movie_name in user_list:
        vector_tmp = counter_vec.transform( movies[movies['title'] == movie_name]['combine_relevant'])
        user_vec+=vector_tmp.toarray()

    cosine = cosine_similarity(gerne_vec,user_vec)

    sim_scores = list(enumerate(cosine))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[0:len(user_list)+11]

    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]

In [87]:
get_recommend_by_user_list(user_list_movie)

11258    Juan of the Dead (Juan de los Muertos) (2011)
11210                       Cockneys vs Zombies (2012)
9966                                 Zombieland (2009)
12548     Scouts Guide to the Zombie Apocalypse (2015)
7098                          Shaun of the Dead (2004)
5249               Chopper Chicks in Zombietown (1989)
4075          Return of the Living Dead Part II (1988)
10053                                  Doghouse (2009)
12563                                   Cooties (2015)
10499                     Tucker & Dale vs Evil (2010)
11957                Dead Snow 2: Red vs. Dead (2014) 
10898                             Revenant, The (2009)
2327                                 Idle Hands (1999)
13811                    Zombieland: Double Tap (2019)
13364                               Little Evil (2017)
Name: title, dtype: object

In [89]:
def precision_at_k(actual, predicted, k):
    actual_set = set(actual)
    predicted_k = predicted[:k]
    correct_predictions = len(actual_set.intersection(predicted_k))
    precision = correct_predictions / k if k > 0 else 0
    return precision

def recall_at_k(actual, predicted, k):
    actual_set = set(actual)
    predicted_k = predicted[:k]
    correct_predictions = len(actual_set.intersection(predicted_k))
    recall = correct_predictions / len(actual_set) if len(actual_set) > 0 else 0
    return recall

def f1_score(precision, recall):
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

recommended_list = get_recommend_by_user_list(user_list_movie)
# Set the value of K for precision at K and recall at K
k_value = 5

precision = precision_at_k(user_list_movie, recommended_list, k_value)
recall = recall_at_k(user_list_movie, recommended_list, k_value)
f1 = f1_score(precision, recall)

print("Evaluation of list movie that user want recommendations")
print(f"Precision at {k_value}: {precision:.2f}")
print(f"Recall at {k_value}: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Evaluation of list movie that user want recommendations
Precision at 5: 0.60
Recall at 5: 0.75
F1 Score: 0.67
