In [44]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Raw tables read from the Kaggle "the-movies-dataset" input folder:
# cast/crew credits, plot keywords and per-movie metadata.
credits = pd.read_csv('/kaggle/input/the-movies-dataset/credits.csv')
keywords = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')

movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
# Metadata columns that none of the cells below use.
movies = movies.drop(
    columns=['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video']
)
# NOTE(review): three rows are removed by index label. Presumably these are
# malformed records that would break the int64 cast of `id` in the next cell;
# confirm against the CSV.
movies = movies.drop(index=[19730, 29503, 35587])

In [46]:
# Cast the join key to int64 before merging (presumably so it matches the
# dtype of `id` in the keywords/credits tables - verify).
movies['id'] = movies['id'].astype('int64')

# merge() defaults to an inner join: only movies present in all three tables survive.
df = movies.merge(keywords, on='id')
df = df.merge(credits, on='id')

# Give the optional fields neutral defaults first, so the dropna() below only
# removes rows that are missing a value in some other column.
df = df.fillna({'original_language': '', 'runtime': 0, 'tagline': ''})

df = df.dropna()

In [47]:
from ast import literal_eval

def get_text(text, obj='name'):
    """Pull one field out of a stringified list of dicts and flatten it.

    `text` is parsed with `literal_eval`, then the value stored under key
    `obj` is read from every entry.

    Returns the lone value unchanged when there is exactly one entry;
    otherwise the values joined with ', ' (so an empty list gives '').
    """
    entries = literal_eval(text)

    if len(entries) == 1:
        # Single entry: hand the value back as-is, without going through join().
        return next(iter(entries))[obj]

    return ', '.join([entry[obj] for entry in entries])

# Replace each stringified list-of-dicts column with a comma-separated string
# of the entries' names.
df = df.assign(**{
    column: df[column].apply(get_text)
    for column in ('genres', 'production_companies', 'production_countries',
                   'crew', 'spoken_languages', 'keywords')
})

# New columns: `cast` is split into the character names and the actor names.
df = df.assign(
    characters=df['cast'].apply(get_text, obj='character'),
    actors=df['cast'].apply(get_text),
)

df = df.drop(columns='cast')
# Keep only the first row for every original_title, then renumber the rows.
df = df.loc[~df['original_title'].duplicated()].reset_index(drop=True)

In [48]:
# Preview the first three rows after the list columns were flattened to text.
df.head(3)

Unnamed: 0,adult,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,...,revenue,runtime,spoken_languages,tagline,vote_average,vote_count,keywords,crew,characters,actors
0,False,30000000,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,United States of America,...,373554033.0,81.0,English,,7.7,5415.0,"jealousy, toy, boy, friendship, friends, rival...","John Lasseter, Joss Whedon, Andrew Stanton, Jo...","Woody (voice), Buzz Lightyear (voice), Mr. Pot...","Tom Hanks, Tim Allen, Don Rickles, Jim Varney,..."
1,False,65000000,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,...,262797249.0,104.0,"English, Français",Roll the dice and unleash the excitement!,6.9,2413.0,"board game, disappearance, based on children's...","Larry J. Franco, Jonathan Hensleigh, James Hor...","Alan Parrish, Samuel Alan Parrish / Van Pelt, ...","Robin Williams, Jonathan Hyde, Kirsten Dunst, ..."
2,False,0,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"Warner Bros., Lancaster Gate",United States of America,...,0.0,101.0,English,Still Yelling. Still Fighting. Still Ready for...,6.5,92.0,"fishing, best friend, duringcreditsstinger, ol...","Howard Deutch, Mark Steven Johnson, Mark Steve...","Max Goldman, John Gustafson, Ariel Gustafson, ...","Walter Matthau, Jack Lemmon, Ann-Margret, Soph..."


In [49]:
# Parse release dates and cast budget / popularity to float64.
df = df.assign(release_date=pd.to_datetime(df['release_date']))
df = df.astype({'budget': 'float64', 'popularity': 'float64'})

In [50]:
# Vote-weighted rating: each movie's mean vote R is pulled toward the global
# mean C. m (the 80th percentile of vote counts) controls how many votes v a
# movie needs before its own average outweighs the global one.
v = df['vote_count']
R = df['vote_average']
C = R.mean()
m = v.quantile(0.8)

df['weighted_average'] = (R*v + C*m)/(v+m)

In [51]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the two ranking signals so they can be blended on a common scale.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df.loc[:, ['popularity', 'weighted_average']])
weighted_df = pd.DataFrame(data=scaled, columns=['popularity', 'weighted_average'])

# The scaled rows are in df's row order, so they can be labelled with df's titles.
weighted_df.index = df['original_title']

In [52]:
# Blended ranking score: 60% scaled popularity + 40% scaled vote-weighted rating.
weighted_df['score'] = (
    0.6 * weighted_df['popularity'].astype('float64')
    + 0.4 * weighted_df['weighted_average']
)

In [53]:
# Best blended score first.
weighted_df_sorted = weighted_df.sort_values('score', ascending=False)

In [54]:
# Columns that feed the content-based "bag of words".
# Take an explicit copy: later cells assign cleaned columns into this frame,
# and writing into a bare column selection of `df` triggers pandas'
# SettingWithCopyWarning (hidden here only because warnings are globally ignored).
hybrid_df = df[['original_title', 'adult', 'genres', 'overview', 'production_companies', 'tagline', 'keywords', 'crew', 'characters', 'actors']].copy()

In [55]:
def separate(text):
    """Collapse a comma-separated list of names into space-separated tokens.

    For every comma-delimited item: a parenthesised part is removed (greedily,
    from the item's first '(' to its last ')'), digits, spaces and punctuation
    are deleted, and the remainder is lower-cased - "Tom Hanks" becomes
    "tomhanks". The cleaned items are joined with single spaces.

    Parameters
    ----------
    text : str
        Comma-separated names, e.g. "Woody (voice), Buzz Lightyear (voice)".

    Returns
    -------
    str
        One token per input item, separated by a space.

    Notes
    -----
    Uses the module-level `re` and `string` imports.
    """
    # Raw string: '\(' inside a normal literal is an invalid escape sequence,
    # which newer Python versions report as a warning.
    parenthesised = re.compile(r'\(.*\)')
    # Built once instead of once per item; deletes digits, spaces and punctuation.
    strip_table = str.maketrans('', '', string.digits + ' ' + string.punctuation)

    return ' '.join(
        parenthesised.sub('', item).translate(strip_table).lower()
        for item in text.split(',')
    )

def remove_punc(text):
    """Return `text` lower-cased with punctuation and digits deleted.

    Whitespace is left alone, so word boundaries survive.
    """
    unwanted = string.punctuation + string.digits
    return text.translate(str.maketrans('', '', unwanted)).lower()

In [56]:
import string
import re

# Free-text columns: drop punctuation and digits but keep the words apart.
hybrid_df = hybrid_df.assign(**{
    name: hybrid_df[name].apply(remove_punc)
    for name in ('adult', 'genres', 'overview', 'tagline')
})
# Name-list columns: squash every comma-separated name into a single token.
hybrid_df = hybrid_df.assign(**{
    name: hybrid_df[name].apply(separate)
    for name in ('production_companies', 'keywords', 'crew', 'characters', 'actors')
})

# The empty placeholder column exists before the row-wise join below, so it
# takes part in that join (as a trailing empty field).
hybrid_df = hybrid_df.assign(bag_of_words='')
hybrid_df = hybrid_df.assign(
    bag_of_words=hybrid_df['original_title'] + " "
    + hybrid_df[hybrid_df.columns[1:]].apply(lambda row: ' '.join(row), axis=1)
)

# Index by title and keep only the combined text.
hybrid_df = hybrid_df.set_index('original_title')[['bag_of_words']]
hybrid_df.head()

Unnamed: 0_level_0,bag_of_words
original_title,Unnamed: 1_level_1
Toy Story,Toy Story false animation comedy family led by...
Jumanji,Jumanji false adventure fantasy family when si...
Grumpier Old Men,Grumpier Old Men false romance comedy a family...
Waiting to Exhale,Waiting to Exhale false comedy drama romance c...
Father of the Bride Part II,Father of the Bride Part II false comedy just ...


In [57]:
# Book table; the first CSV column becomes the row index.
books = pd.read_csv(
    '/kaggle/input/top2k-books-with-descriptions/top2k_book_descriptions.csv',
    index_col=0,
)

In [58]:
# Second handle on the full book table. `books` is later filtered and reduced
# to a single bag_of_words column, while predict_book still reads `title`
# through `b`.
# NOTE: this is an alias, not a copy. In-place column assignments made on
# `books` before it is rebound (e.g. the `tag_name` parsing in the next cell)
# are visible through `b` too; row filters that rebind `books` are not.
b = books

In [59]:
# Parse each stringified tag list once; an empty (falsy) result counts as missing.
books['tag_name'] = books['tag_name'].apply(lambda raw: literal_eval(raw) or np.nan)
# Keep books that have a description or at least one tag, then blank out the
# remaining gaps so the text columns can be concatenated.
books = books[books['description'].notna() | books['tag_name'].notna()].fillna('')

In [60]:
def book_cat(x):
    """Build the combined text for one book row.

    Joins, with single spaces: title, original title, description, the tags
    (themselves space-joined) and authors. `x` is a row-like mapping holding
    those five fields.
    """
    tags = " ".join(x['tag_name'])
    pieces = [x['title'], x["original_title"], x["description"], tags, x["authors"]]
    return " ".join(pieces)

In [61]:
# One combined text string per book, built row by row with book_cat.
books = books.assign(bag_of_words=books.apply(book_cat, axis=1))

In [62]:
# Index the books by original title and keep only the combined text.
books = books.set_index('original_title')[['bag_of_words']]

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [64]:
# TfidfVectorizer is used below but is not imported in any visible cell (the
# import cell only brings in CountVectorizer), so a fresh-kernel
# "Restart & Run All" would stop here with a NameError.
from sklearn.feature_extraction.text import TfidfVectorizer

# Book-to-book similarity: TF-IDF over each book's bag of words (English stop
# words removed, terms found in fewer than 5 books ignored), compared with
# cosine similarity. cos_simB[i, j] relates the i-th and j-th rows of `books`.
tfidfB = TfidfVectorizer(stop_words='english', min_df=5)
tfidfB_matrix = tfidfB.fit_transform(books['bag_of_words'])
cos_simB = cosine_similarity(tfidfB_matrix)

In [65]:
# Keep the 10,000 best-scored movies and attach each one's bag of words
# (left join on the shared title index).
hybrid_df = weighted_df_sorted.iloc[:10000].merge(
    hybrid_df, how='left', left_index=True, right_index=True
)

# Movie-to-movie similarity, same TF-IDF settings as used for the books.
tfidf = TfidfVectorizer(stop_words='english', min_df=5)
tfidf_matrix = tfidf.fit_transform(hybrid_df['bag_of_words'])
cos_sim = cosine_similarity(tfidf_matrix)

In [66]:
# Fit one vocabulary on movies and books together so both are vectorised in
# the same term space.
soups = pd.concat([hybrid_df['bag_of_words'], books['bag_of_words']], ignore_index=True)
count = CountVectorizer(stop_words="english").fit(soups)

movies_matrix = count.transform(hybrid_df['bag_of_words'])
books_matrix = count.transform(books['bag_of_words'])
# cosine[i, j]: similarity between the i-th movie row and the j-th book row.
cosine = cosine_similarity(movies_matrix, books_matrix)

In [91]:
def predict_book(title):
    """Recommend books for a movie title.

    The movie is looked up case-insensitively among the rows of `hybrid_df`.
    The single most similar book is taken from the movie-vs-book `cosine`
    matrix, and every book whose TF-IDF similarity (`cos_simB`) to that book
    is at least 0.3 is returned.

    Parameters
    ----------
    title : str
        Movie title as stored in `original_title`; letter case is ignored.

    Returns
    -------
    pandas.DataFrame or str
        A frame indexed by book `title` with one `similarity` column, sorted
        from most to least similar; or the string "No books available" when
        no book reaches the 0.3 threshold.

    Raises
    ------
    KeyError
        If no movie carries that title.
    ValueError
        If the book metadata in `b` cannot be matched row-for-row to the
        book similarity matrix.
    """
    wanted = title.lower()
    known_titles = hybrid_df.reset_index()['original_title']
    matches = [
        position
        for position, name in enumerate(known_titles)
        if (name.lower() if isinstance(name, str) else "") == wanted
    ]
    if not matches:
        raise KeyError(title)
    # Titles can collide once lower-cased; use the first matching row instead
    # of selecting several rows at once.
    idx = matches[0]

    # Closest book = position of the first maximum in this movie's row.
    # The position is used directly: translating it to an `original_title`
    # label and back is ambiguous when labels repeat (e.g. '' for books
    # without an original title).
    index_book = int(np.argmax(cosine[idx]))
    similarity = cos_simB[index_book]

    # `cos_simB` has one entry per row of the filtered `books` frame, whereas
    # `b` is the frame as loaded from the CSV. Match them by position, not by
    # index label.
    catalog = b
    if len(catalog) != len(similarity):
        # Rows were filtered out of `books` after `b` was taken; re-apply the
        # same description/tag filter.
        # NOTE(review): relies on `b['tag_name']` holding NaN for empty tag
        # lists (set in place through the `b = books` alias) - confirm.
        catalog = b[b['description'].notnull() | b['tag_name'].notnull()]
    if len(catalog) != len(similarity):
        raise ValueError(
            "book metadata has %d rows but the similarity matrix covers %d books"
            % (len(catalog), len(similarity))
        )

    final_df = catalog.reset_index(drop=True).assign(similarity=similarity)
    final_df_filtered = final_df[final_df['similarity'] >= 0.3]

    if final_df_filtered.empty:
        return "No books available"

    final_df_sorted = final_df_filtered.sort_values(by='similarity', ascending=False)
    return final_df_sorted.set_index('title')[['similarity']]

In [92]:
def predict_movie(title, similarity_weight=0.7, top_n=10):
    """Recommend movies similar to `title`, re-ranked by overall score.

    Every movie gets
    ``final_score = score * (1 - similarity_weight) + similarity * similarity_weight``
    where `score` is the blended popularity/rating column of `hybrid_df` and
    `similarity` is the TF-IDF cosine similarity to the requested movie.

    Parameters
    ----------
    title : str
        Movie title as stored in `original_title`. An exact match is
        preferred; if there is none, a case-insensitive match is used.
    similarity_weight : float, default 0.7
        Share of `final_score` contributed by content similarity.
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The `top_n` movies ordered by `final_score`, indexed by
        `original_title`. Only the `similarity` column is returned, so the
        displayed values need not be in descending order.

    Raises
    ------
    ValueError
        If no movie carries that title.
    """
    data = hybrid_df.reset_index()
    titles = data['original_title']

    matches = [position for position, name in enumerate(titles) if name == title]
    if not matches:
        # Same forgiving lookup as predict_book: ignore letter case.
        wanted = title.lower()
        matches = [
            position
            for position, name in enumerate(titles)
            if isinstance(name, str) and name.lower() == wanted
        ]
    if not matches:
        raise ValueError("Movie title not found: %r" % (title,))

    # Use one row position (the first match) so `similarity` is a flat vector
    # with exactly one value per movie.
    index_movie = matches[0]
    similarity = cos_sim[index_movie]

    final_df = data.assign(similarity=similarity)
    final_df['final_score'] = final_df['score']*(1-similarity_weight) + final_df['similarity']*similarity_weight

    final_df_sorted = final_df.sort_values(by='final_score', ascending=False).head(top_n)
    return final_df_sorted.set_index('original_title')[['similarity']]

In [93]:
# Demo seed title; the predict_book call in the next cell reuses it.
just_finished = 'The Hunger Games'
# Rows are ranked by the blended final_score, but only the similarity column
# is returned, so the values shown are not necessarily in descending order.
predict_movie(just_finished, similarity_weight=0.7, top_n=10)

Unnamed: 0_level_0,similarity
original_title,Unnamed: 1_level_1
The Hunger Games,1.0
The Hunger Games: Catching Fire,0.404598
The Hunger Games: Mockingjay - Part 1,0.217419
The Hunger Games: Mockingjay - Part 2,0.265177
Minions,0.004104
Indie Game: The Movie,0.190304
Guardians of the Galaxy Vol. 2,0.023355
火垂るの墓,0.095664
Baby Driver,0.01125
Big Hero 6,0.000637


In [94]:
# Books related to the book that best matches the seed movie (similarity >= 0.3).
predict_book(just_finished)

Unnamed: 0_level_0,similarity
title,Unnamed: 1_level_1
"The Hunger Games (The Hunger Games, #1)",1.0
"The One (The Selection, #3)",0.621038
Revival,0.577312
"Mockingjay (The Hunger Games, #3)",0.518975
The Choice,0.359137
"Catching Fire (The Hunger Games, #2)",0.337412


In [95]:
# Second demo seed title; the predict_book call in the next cell reuses it.
just_finished = 'Harry Potter and the Chamber of Secrets'
# Ranked by final_score; only the similarity column is displayed.
predict_movie(just_finished, similarity_weight=0.7, top_n=10)

Unnamed: 0_level_0,similarity
original_title,Unnamed: 1_level_1
Harry Potter and the Chamber of Secrets,1.0
Harry Potter and the Philosopher's Stone,0.632853
Harry Potter and the Prisoner of Azkaban,0.604326
Harry Potter and the Goblet of Fire,0.504787
Harry Potter and the Order of the Phoenix,0.500463
Harry Potter and the Half-Blood Prince,0.472871
Harry Potter and the Deathly Hallows: Part 2,0.426642
Harry Potter and the Deathly Hallows: Part 1,0.403559
Minions,0.007622
A Very Potter Musical,0.148926


In [96]:
# Books related to the book that best matches the seed movie (similarity >= 0.3).
predict_book(just_finished)

Unnamed: 0_level_0,similarity
title,Unnamed: 1_level_1
"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",1.0
"Harry Potter Boxset (Harry Potter, #1-7)",0.624581
"Harry Potter and the Deathly Hallows (Harry Potter, #7)",0.571606
"Harry Potter and the Cursed Child - Parts One and Two (Harry Potter, #8)",0.56317
"Harry Potter and the Goblet of Fire (Harry Potter, #4)",0.552584
"Harry Potter and the Half-Blood Prince (Harry Potter, #6)",0.478154
"Harry Potter and the Order of the Phoenix (Harry Potter, #5)",0.332899
"Storm Front (The Dresden Files, #1)",0.320253


In [97]:
# Third demo seed title; the predict_book call in the next cell reuses it.
just_finished = 'The Lord of the Rings'
# Ranked by final_score; only the similarity column is displayed.
predict_movie(just_finished, similarity_weight=0.7, top_n=10)

Unnamed: 0_level_0,similarity
original_title,Unnamed: 1_level_1
The Lord of the Rings,1.0
The Lord of the Rings: The Fellowship of the Ring,0.388274
The Lord of the Rings: The Two Towers,0.37616
The Lord of the Rings: The Return of the King,0.280603
The Hobbit: An Unexpected Journey,0.29347
The Hobbit: The Battle of the Five Armies,0.249623
Minions,0.007537
The Hobbit,0.265034
The Hobbit: The Desolation of Smaug,0.211754
Big Hero 6,0.002285


In [98]:
# Books related to the book that best matches the seed movie (similarity >= 0.3).
predict_book(just_finished)

Unnamed: 0_level_0,similarity
title,Unnamed: 1_level_1
"The Two Towers (The Lord of the Rings, #2)",1.0
"The Return of the King (The Lord of the Rings, #3)",0.349337
