In [91]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [92]:
movies_df = pd.read_csv('data/movies_metadata.csv')
keywords_df = pd.read_csv('data/keywords.csv')
links_df = pd.read_csv('data/links.csv')

  movies_df = pd.read_csv('data/movies_metadata.csv')


In [93]:
movies = movies_df[['id', 'title', 'adult', 'genres', 'overview', 'popularity', 'vote_average', 'vote_count']].copy()
movies.head()

Unnamed: 0,id,title,adult,genres,overview,popularity,vote_average,vote_count
0,862,Toy Story,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",21.946943,7.7,5415.0
1,8844,Jumanji,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,17.015539,6.9,2413.0
2,15602,Grumpier Old Men,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,11.7129,6.5,92.0
3,31357,Waiting to Exhale,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",3.859495,6.1,34.0
4,11862,Father of the Bride Part II,False,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,8.387519,5.7,173.0


In [94]:

links = links_df[['movieId', 'tmdbId']].copy()
links['movieId'] = links['movieId'].astype(np.int64)
links['tmdbId'] = pd.to_numeric(links['tmdbId'], errors='coerce').astype('Int64')

links.head()

Unnamed: 0,movieId,tmdbId
0,1,862
1,2,8844
2,3,15602
3,4,31357
4,5,11862


In [95]:
movies['id'] = pd.to_numeric(movies['id'], errors='coerce').astype('Int64')
movies = movies.merge(links, left_on='id', right_on='tmdbId', how='left')
movies.head()

Unnamed: 0,id,title,adult,genres,overview,popularity,vote_average,vote_count,movieId,tmdbId
0,862,Toy Story,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",21.946943,7.7,5415.0,1,862
1,8844,Jumanji,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,17.015539,6.9,2413.0,2,8844
2,15602,Grumpier Old Men,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,11.7129,6.5,92.0,3,15602
3,31357,Waiting to Exhale,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",3.859495,6.1,34.0,4,31357
4,11862,Father of the Bride Part II,False,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,8.387519,5.7,173.0,5,11862


In [96]:
movies.drop(columns=['tmdbId'], inplace=True)
movies.head()

Unnamed: 0,id,title,adult,genres,overview,popularity,vote_average,vote_count,movieId
0,862,Toy Story,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",21.946943,7.7,5415.0,1
1,8844,Jumanji,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,17.015539,6.9,2413.0,2
2,15602,Grumpier Old Men,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,11.7129,6.5,92.0,3
3,31357,Waiting to Exhale,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",3.859495,6.1,34.0,4
4,11862,Father of the Bride Part II,False,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,8.387519,5.7,173.0,5


In [97]:
movies['genres'] = movies['genres'].fillna('[]').apply(eval).apply(lambda x: [y['name'] for y in x] if isinstance(x, list) else [])
movies.head()

Unnamed: 0,id,title,adult,genres,overview,popularity,vote_average,vote_count,movieId
0,862,Toy Story,False,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",21.946943,7.7,5415.0,1
1,8844,Jumanji,False,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,17.015539,6.9,2413.0,2
2,15602,Grumpier Old Men,False,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,11.7129,6.5,92.0,3
3,31357,Waiting to Exhale,False,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",3.859495,6.1,34.0,4
4,11862,Father of the Bride Part II,False,[Comedy],Just when George Banks has recovered from his ...,8.387519,5.7,173.0,5


In [98]:
keywords_df['keywords'] = keywords_df['keywords'].fillna('[]').apply(eval)
movies = movies.merge(keywords_df[['id', 'keywords']], left_on='id', right_on='id', how='left')
movies.head()

Unnamed: 0,id,title,adult,genres,overview,popularity,vote_average,vote_count,movieId,keywords
0,862,Toy Story,False,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",21.946943,7.7,5415.0,1,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,False,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,17.015539,6.9,2413.0,2,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,False,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,11.7129,6.5,92.0,3,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,False,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",3.859495,6.1,34.0,4,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,False,[Comedy],Just when George Banks has recovered from his ...,8.387519,5.7,173.0,5,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [99]:
movies['keywords'] = movies['keywords'].apply(lambda x: [y['name'] for y in x] if isinstance(x, list) else [])
movies.head()

Unnamed: 0,id,title,adult,genres,overview,popularity,vote_average,vote_count,movieId,keywords
0,862,Toy Story,False,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",21.946943,7.7,5415.0,1,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,Jumanji,False,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,17.015539,6.9,2413.0,2,"[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,False,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,11.7129,6.5,92.0,3,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,Waiting to Exhale,False,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",3.859495,6.1,34.0,4,"[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,False,[Comedy],Just when George Banks has recovered from his ...,8.387519,5.7,173.0,5,"[baby, midlife crisis, confidence, aging, daug..."


In [100]:
movies.isna().sum()

id              657
title           661
adult             0
genres            0
overview        995
popularity      442
vote_average    661
vote_count      661
movieId           0
keywords          0
dtype: int64

In [101]:
# delete rows with missing 'id' or 'title'
movies.dropna(subset=['id', 'title'], inplace=True)
movies.isna().sum()

id                0
title             0
adult             0
genres            0
overview        995
popularity        0
vote_average      0
vote_count        0
movieId           0
keywords          0
dtype: int64

In [102]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
movies['overview'] = movies['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies['overview'])

In [103]:
mlb_genres = MultiLabelBinarizer()
genres_m = mlb_genres.fit_transform(movies['genres'])

In [104]:
mlb_keywords = MultiLabelBinarizer()
keywords_m = mlb_keywords.fit_transform(movies['keywords'])

In [105]:
adult_mask = movies['adult'].fillna('False').map({'True': 1, 'False': 0}).values.reshape(-1, 1)

In [106]:
numeric_features = movies[['popularity', 'vote_average', 'vote_count']].fillna(0)
numeric_features['vote_count'] = np.log1p(numeric_features['vote_count'])

In [107]:
numeric_features['popularity'] = pd.to_numeric(numeric_features['popularity'], errors='coerce').fillna(0)

In [108]:
scaler = StandardScaler()
numeric_m = scaler.fit_transform(numeric_features)

In [109]:
from scipy.sparse import hstack
X = hstack([tfidf_matrix, genres_m, keywords_m, adult_mask, numeric_m])

In [110]:
k = 20
nn = NearestNeighbors(n_neighbors=k, metric='cosine').fit(X)
distances, neighbors = nn.kneighbors(X)

In [None]:
edge_list = []

for i in range(len(movies)):
    movie_i_id = movies.iloc[i]['id']

    for idx, j in enumerate(neighbors[i]):
        if i == j:
            continue
        
        movie_j_id = movies.iloc[j]['id']
        dist = distances[i][idx]
        weight = 1 - dist 
        
        edge_list.append((movie_i_id, movie_j_id, weight))

edges_df = pd.DataFrame(edge_list, columns=['source', 'target', 'weight'])
edges_df.to_csv('data/movie_similarity_graph.csv', index=False)

In [115]:
n = X.shape[0]
rhos = np.zeros(n)
sigmas = np.zeros(n)

for i in range(n):
    d = distances[i]
    d_nonzero = d[d > 0]
    rhos[i] = np.min(d_nonzero) if len(d_nonzero) else 0
    target = np.log2(k)

    low, high = 1e-3, 10

    for _ in range(50):
        mid = (low + high) / 2
        sum_w = np.sum(np.exp(-(d - rhos[i]) / mid))
        if abs(sum_w - target) < 1e-3:
            break
        if sum_w > target:
            high = mid
        else:
            low = mid
    
    sigmas[i] = mid

rows = []
for i in range(n):
    movie_i = movies.iloc[i]['id']
    for idx, j in enumerate(neighbors[i]):
        if i == j: continue
        movie_j = movies.iloc[j]['id']
        d = distances[i][idx]
        w = np.exp(-(d - rhos[i]) / sigmas[i])
        rows.append((movie_i, movie_j, w))

df = pd.DataFrame(rows, columns=['source', 'target', 'weight'])


df_rev = df.rename(columns={'source':'target', 'target':'source', 'weight':'w_rev'})
dfm = df.merge(df_rev, on=['source','target'], how='outer').fillna(0)
dfm['weight'] = 1 - (1 - dfm['weight']) * (1 - dfm['w_rev'])

edges_final = dfm[['source', 'target', 'weight']]
edges_final = edges_final[edges_final['source'] != edges_final['target']]

edges_final.to_csv("data/umap_movie_graph.csv", index=False)

  sum_w = np.sum(np.exp(-(d - rhos[i]) / mid))


In [116]:
edges_final = edges_final[edges_final['weight'] > 0]
edges_final.to_csv("processed_data/umap_movie_graph_truncated.csv", index=False)

In [114]:
# Statistics about the constructed graph
num_nodes = len(movies)
num_edges = len(edge_list)
print(f'Number of nodes: {num_nodes}')
print(f'Number of edges: {num_edges}')

avg_degree = (2 * num_edges) / num_nodes
print(f'Average degree: {avg_degree}')
degrees = {}
for edge in edge_list:
    source, target, weight = edge
    degrees[source] = degrees.get(source, 0) + 1
    degrees[target] = degrees.get(target, 0) + 1

max_degree = max(degrees.values())
print(f'Maximum degree: {max_degree}')

Number of nodes: 46611
Number of edges: 885616
Average degree: 38.00030035828453
Maximum degree: 2903


id              0
title           0
adult           0
genres          0
overview        0
popularity      0
vote_average    0
vote_count      0
movieId         0
keywords        0
dtype: int64

In [87]:
movies.to_csv('data/movies_processed.csv', index=False)

In [88]:
# Neighbors of movie with id 862
movie_id = 862

neighbors_list = []
for edge in edges:
    source, target = edge
    if source == movie_id:
        neighbors_list.append(target)
    elif target == movie_id:
        neighbors_list.append(source)

for neighbor_id in neighbors_list:
    title = movies[movies['id'] == neighbor_id]['title'].values[0]
    print(f'Neighbor Movie ID: {neighbor_id}, Title: {title}')

Neighbor Movie ID: 260514, Title: Cars 3
Neighbor Movie ID: 1267, Title: Meet the Robinsons
Neighbor Movie ID: 10193, Title: Toy Story 3
Neighbor Movie ID: 10957, Title: The Black Cauldron
Neighbor Movie ID: 11356, Title: The Odd Couple
Neighbor Movie ID: 35554, Title: Cado dalle nubi
Neighbor Movie ID: 10681, Title: WALL·E
Neighbor Movie ID: 77950, Title: Turbo
Neighbor Movie ID: 136799, Title: Trolls
Neighbor Movie ID: 21705, Title: Barbie Diaries
Neighbor Movie ID: 888, Title: The Flintstones
Neighbor Movie ID: 109439, Title: The Hangover Part III
Neighbor Movie ID: 863, Title: Toy Story 2
Neighbor Movie ID: 228161, Title: Home
Neighbor Movie ID: 9297, Title: Monster House
Neighbor Movie ID: 11381, Title: Tommy Boy
Neighbor Movie ID: 256835, Title: Toy Story That Time Forgot
Neighbor Movie ID: 10681, Title: WALL·E
Neighbor Movie ID: 271718, Title: Trainwreck
Neighbor Movie ID: 324852, Title: Despicable Me 3
Neighbor Movie ID: 72105, Title: Ted
Neighbor Movie ID: 10527, Title: Madaga