### Voronyi-Stepan-Camp-2025

In [45]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [48]:
# Load the three raw tables once each. The metadata file was previously read
# twice in this cell; the second read only repeated the same parse.
metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

# Preview the first rows of the metadata table.
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [49]:
# m: vote-count cutoff at the 90th percentile; C: mean rating over all films.
m = metadata['vote_count'].quantile(0.90)
C = metadata['vote_average'].mean()

# Keep only the films with at least m votes (independent copy, safe to extend).
qualified = metadata.loc[metadata['vote_count'].ge(m)].copy()
print(f"Films considered for ranking: {qualified.shape[0]}")

Films considered for ranking: 4555


In [50]:
def imdb_weighted(x, m=m, C=C):
    """Blend a film's own rating with the global mean rating.

    Args:
        x: row-like object exposing 'vote_count' and 'vote_average'.
        m: vote-count weight given to C (default: the value of the notebook's
            ``m`` at the time this function is defined).
        C: rating the score is pulled towards (default: the value of the
            notebook's ``C`` at the time this function is defined).

    Returns:
        (v / (v + m)) * R + (m / (v + m)) * C, where v is the row's vote
        count and R its vote average.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_weight = votes / (votes + m)
    prior_weight = m / (votes + m)
    return own_weight * rating + prior_weight * C

In [51]:
# Score every qualified film row by row, then keep the ten highest scores.
qualified['weighted_score'] = qualified.apply(imdb_weighted, axis=1)
top10_simple = qualified.sort_values(by='weighted_score', ascending=False).iloc[:10]

print("Top 10 movies by IMDb weighted score:")
print(top10_simple.loc[:, ['title', 'vote_count', 'vote_average', 'weighted_score']])

Top 10 movies by IMDb weighted score:
                             title  vote_count  vote_average  weighted_score
314       The Shawshank Redemption      8358.0           8.5        8.445869
834                  The Godfather      6024.0           8.5        8.425439
10309  Dilwale Dulhania Le Jayenge       661.0           9.1        8.421453
12481              The Dark Knight     12269.0           8.3        8.265477
2843                    Fight Club      9678.0           8.3        8.256385
292                   Pulp Fiction      8670.0           8.3        8.251406
522               Schindler's List      4436.0           8.3        8.206639
23673                     Whiplash      4376.0           8.3        8.205404
5481                 Spirited Away      3968.0           8.3        8.196055
2211             Life Is Beautiful      3643.0           8.3        8.187171


In [52]:
# Work on the first 8000 rows only; the similarity matrix built from this
# sample is square in the number of rows.
subset = metadata.head(8000).copy()
# Missing overviews become empty strings so the vectorizer receives text only.
subset['overview'] = subset['overview'].fillna('')

tfidf = TfidfVectorizer(stop_words='english')
tfidf_overview = tfidf.fit_transform(subset['overview'])
print(f"TF-IDF matrix (overview) shape: {tfidf_overview.shape}")

TF-IDF matrix (overview) shape: (8000, 28557)


In [54]:
# Pairwise similarity between every pair of overview vectors.
sim_overview = linear_kernel(tfidf_overview, tfidf_overview)

# Title -> row lookup. Series.drop_duplicates() removes duplicate *values*
# (the row labels, which are already unique), so it never removed repeated
# titles. De-duplicate on the index instead, keeping the first film per title,
# so that a lookup by title always yields a single row.
overview_indices = pd.Series(subset.index, index=subset['title'])
overview_indices = overview_indices[~overview_indices.index.duplicated(keep='first')]

In [55]:
def recommend_by_overview(title, topn=10):
    """Return the titles most similar to ``title`` by overview similarity.

    Args:
        title: film title to look up in ``overview_indices``.
        topn: maximum number of recommendations to return.

    Returns:
        A list of up to ``topn`` titles from ``subset`` ordered by descending
        similarity, never containing the queried row itself. An empty list is
        returned when the title is unknown.
    """
    idx = overview_indices.get(title)
    if idx is None:
        return []
    # A title shared by several films yields a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(sim_overview[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the queried film by position instead of assuming it sorts first:
    # an all-zero similarity row or a tie at the top would otherwise let the
    # film recommend itself.
    movie_ids = [pos for pos, _ in ranked if pos != idx][:topn]
    return subset['title'].iloc[movie_ids].tolist()

# Example query: print the overview-based recommendations for one title.
print(recommend_by_overview('The Godfather'))

['The Godfather: Part II', 'The Godfather: Part III', 'Mobsters', 'Queen of Hearts', 'American Movie', 'Made', 'Soft Fruit', 'The Young Americans', 'Family Business', 'The Valachi Papers']


In [56]:
# Give the join key a plain integer dtype in the two auxiliary tables.
for df in (credits, keywords):
    df['id'] = df['id'].astype(int)

# Ids that cannot be parsed as numbers become <NA>; those rows are dropped
# before the key is converted to a plain integer as well.
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce').astype('Int64')
metadata = metadata.dropna(subset=['id'])
metadata = metadata.astype({'id': 'int'})

# Join credits first, then keywords, both on the shared id.
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# These columns hold Python-literal text; parse each cell into real objects.
for feature in ('cast', 'crew', 'keywords', 'genres'):
    metadata[feature] = metadata[feature].apply(literal_eval)

In [57]:
def extract_director(crew_list):
    """Return the 'name' of the first crew entry whose 'job' is 'Director'.

    Args:
        crew_list: iterable of dict-like crew entries.

    Returns:
        The matching entry's 'name' value (None if that key is absent), or an
        empty string when no entry has the job 'Director'.
    """
    directors = (member.get('name') for member in crew_list
                 if member.get('job') == 'Director')
    return next(directors, '')

def extract_names(lst):
    """Collect the 'name' of every entry and return the first three.

    Args:
        lst: iterable of dict-like entries.

    Returns:
        A list of at most three names; an entry without a 'name' key
        contributes an empty string.
    """
    collected = []
    for entry in lst:
        collected.append(entry.get('name', ''))
    return collected[:3]

# Reduce cast, keywords and genres to (at most) their first three names each.
for feat in ('cast', 'keywords', 'genres'):
    metadata[feat] = metadata[feat].apply(extract_names)

# The director is taken from the parsed crew list.
metadata['director'] = metadata['crew'].apply(extract_director)

In [58]:
def clean(lst):
    """Strip spaces from, and lower-case, every string in ``lst``.

    Args:
        lst: iterable of strings.

    Returns:
        A new list with the transformed strings in the same order.
    """
    cleaned = []
    for item in lst:
        cleaned.append(item.replace(' ', '').lower())
    return cleaned
# Collapse each name into one lower-case token with no spaces.
for feat in ('cast', 'keywords', 'genres'):
    metadata[feat] = metadata[feat].apply(clean)
metadata['director'] = metadata['director'].str.replace(' ','').str.lower()

# One space-separated string per film: keywords, cast, director, genres.
metadata['soup'] = metadata.apply(
    lambda x: ' '.join([
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]),
    axis=1)

In [59]:
# Same 8000-row sample size as the overview model, taken from the merged table.
small = metadata.head(8000).copy()

# Token counts over the soup strings, with English stop words removed.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(small['soup'])

In [60]:
# Pairwise cosine similarity between the soup count vectors.
sim_soup = cosine_similarity(count_matrix, count_matrix)

# Title -> row lookup. Series.drop_duplicates() removes duplicate *values*
# (the row labels, which are already unique), so it never removed repeated
# titles. De-duplicate on the index instead, keeping the first film per title,
# so that a lookup by title always yields a single row.
indices_soup = pd.Series(small.index, index=small['title'])
indices_soup = indices_soup[~indices_soup.index.duplicated(keep='first')]

In [61]:
def recommend_by_soup(title, topn=10):
    """Return the titles most similar to ``title`` by soup similarity.

    Args:
        title: film title to look up in ``indices_soup``.
        topn: maximum number of recommendations to return.

    Returns:
        A list of up to ``topn`` titles from ``small`` ordered by descending
        similarity, never containing the queried row itself. An empty list is
        returned when the title is unknown.
    """
    idx = indices_soup.get(title)
    if idx is None:
        return []
    # A title shared by several films yields a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(sim_soup[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the queried film by position instead of assuming it sorts first:
    # an all-zero similarity row or a tie at the top would otherwise let the
    # film recommend itself.
    ids = [pos for pos, _ in ranked if pos != idx][:topn]
    return small['title'].iloc[ids].tolist()

In [62]:
# Example query: print the soup-based recommendations for one title.
print(recommend_by_soup('Fight Club'))

['Ill Gotten Gains', 'Jails, Hospitals & Hip-Hop', 'Lotto Land', 'The People vs. Larry Flynt', 'Other Voices Other Rooms', 'A River Runs Through It', 'Went to Coney Island on a Mission from God... Be Back by Five', 'Little Boy Blue', '25th Hour', 'Se7en']
