In [None]:
# Install/upgrade the embedding libraries imported below (versions are not pinned here).
!pip install -U transformers sentence-transformers
# TMDb API client imported below as `tmdbv3api`.
!pip install tmdbv3api


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import numpy as np
import pandas as pd
import ast
from tmdbv3api import TMDb, Movie
from sentence_transformers import SentenceTransformer
from collections import Counter
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
# --- Load and preprocess dataset ---

def scrape_and_extend_dataset(years=None):
    """Scrape American film lists from Wikipedia and enrich them with TMDb metadata.

    For every year in ``years`` the Wikipedia page
    ``List_of_American_films_of_<year>`` is parsed with ``pd.read_html``; tables
    that expose both a ``Title`` and a ``Cast and crew`` column are kept. Each
    title is then looked up on TMDb to obtain an overview and genre names, and
    a stemmed, lower-cased ``tags`` string is built from overview, genres,
    the first three cast names and the director.

    Parameters
    ----------
    years : iterable of int, optional
        Years to scrape. Defaults to ``range(2018, 2026)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title`` and ``tags``. ``id`` is ``100000 + row label``
        of the scraped frame. An empty frame with those columns is returned
        when no table could be scraped.
    """
    import os
    from tmdbv3api import TMDb, Movie
    import requests
    from tqdm import tqdm
    from nltk.stem.porter import PorterStemmer
    tqdm.pandas()

    if years is None:
        years = range(2018, 2026)

    tmdb = TMDb()
    # Prefer a key supplied through the environment.
    # NOTE(review): the literal fallback keeps existing runs working, but a
    # credential committed to a notebook should be rotated and removed.
    tmdb.api_key = os.environ.get('TMDB_API_KEY', 'b43b8e8ad53a7df2ab92c6bb80de9eec')
    movie_api = Movie()

    all_years = []
    for year in years:
        url = f"https://en.wikipedia.org/wiki/List_of_American_films_of_{year}"
        try:
            tables = pd.read_html(url, header=0)
            for tbl in tables:
                if 'Title' in tbl and 'Cast and crew' in tbl:
                    all_years.append(tbl[['Title', 'Cast and crew']])
        except Exception as exc:
            # Best effort: a missing or unparsable page must not abort the whole
            # scrape, but the reader should see that a year was skipped.
            print(f"Skipping {year}: could not read {url} ({exc})")

    if not all_years:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=['id', 'title', 'tags'])

    wiki_df = pd.concat(all_years, ignore_index=True).dropna()

    # Fetch TMDb metadata
    def fetch_tmdb_details(title):
        """Return ``{'overview', 'genres'}`` for the first TMDb search hit, or None."""
        try:
            search_results = movie_api.search(title)
            if not search_results:
                return None
            m = search_results[0]
            res = requests.get(
                f"https://api.themoviedb.org/3/movie/{m.id}",
                params={'api_key': tmdb.api_key},
                timeout=10,  # do not hang the whole scrape on one stalled request
            ).json()
            overview = res.get('overview', '')
            genres = [g['name'].replace(" ", "") for g in res.get('genres', [])]
            return {
                'overview': overview,
                'genres': genres
            }
        except Exception:
            # Any lookup failure just means "no metadata for this title".
            return None

    metadata = wiki_df['Title'].progress_map(fetch_tmdb_details)
    wiki_df['overview'] = metadata.map(lambda x: x['overview'].split() if x and x['overview'] else [])
    wiki_df['genres'] = metadata.map(lambda x: x['genres'] if x and x['genres'] else [])

    def get_director(x):
        """Return the text preceding the first recognised director marker, else ''."""
        for marker in (" (director)", " (directors)", " (director/screenplay)"):
            if marker in x:
                return x.split(marker)[0]
        return ""

    def get_actor(x, n):
        """Return the n-th comma-separated name after 'screenplay); ', or 'unknown'."""
        try:
            parts = x.split("screenplay); ")[-1].split(", ")
            return parts[n].replace(" ", "")
        except (IndexError, AttributeError):
            return "unknown"

    wiki_df['director'] = wiki_df['Cast and crew'].map(get_director).str.replace(" ", "")
    wiki_df['actor_1'] = wiki_df['Cast and crew'].map(lambda x: get_actor(x, 0))
    wiki_df['actor_2'] = wiki_df['Cast and crew'].map(lambda x: get_actor(x, 1))
    wiki_df['actor_3'] = wiki_df['Cast and crew'].map(lambda x: get_actor(x, 2))

    # One independent empty list per row (``[[]] * n`` would alias a single list).
    wiki_df['keywords'] = [[] for _ in range(len(wiki_df))]
    wiki_df['cast'] = wiki_df[['actor_1', 'actor_2', 'actor_3']].values.tolist()
    wiki_df['crew'] = wiki_df['director'].apply(lambda x: [x] if x else [])

    ps = PorterStemmer()

    def stem(text):
        """Porter-stem every whitespace-separated token of ``text``."""
        return " ".join([ps.stem(word) for word in text.split()])

    # Element-wise list concatenation of the per-row token lists.
    wiki_df['tags'] = wiki_df['overview'] + wiki_df['genres'] + wiki_df['keywords'] + wiki_df['cast'] + wiki_df['crew']
    wiki_df['tags'] = wiki_df['tags'].apply(lambda x: " ".join(x).lower()).apply(stem)
    # NOTE(review): synthetic ids start at 100000; presumably these can collide
    # with real TMDb ids from the base dataset -- confirm before using id as a key.
    wiki_df['id'] = 100000 + wiki_df.index
    wiki_df = wiki_df[['id', 'Title', 'tags']].rename(columns={'Title': 'title'})

    return wiki_df


def load_and_preprocess(path_movies, path_credits):
    """Load the movies/credits CSVs and build one stemmed ``tags`` string per movie.

    The two files are merged on ``title``; rows with any missing value among
    ``id, title, overview, genres, keywords, cast, crew`` are dropped. The
    JSON-like list columns are parsed with ``ast.literal_eval`` and reduced to
    space-free names (all genres, all keywords, the first three cast members,
    the first crew entry whose job is ``'Director'``). ``tags`` is the
    lower-cased, Porter-stemmed concatenation of the overview words and those
    names.

    Parameters
    ----------
    path_movies : str
        Path of the movies CSV.
    path_credits : str
        Path of the credits CSV.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``tags`` with a fresh 0..n-1 index.
    """
    movies = pd.read_csv(path_movies)
    credits = pd.read_csv(path_credits)

    merged = movies.merge(credits, on='title')
    # A new frame (no inplace mutation) with a gap-free index, so that row
    # labels and row positions agree for downstream positional lookups.
    df = (
        merged[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
        .dropna()
        .reset_index(drop=True)
    )

    def names(obj, limit=None):
        """Parse a stringified list of dicts and return up to ``limit`` space-free names."""
        return [entry['name'].replace(" ", "") for entry in ast.literal_eval(obj)[:limit]]

    def fetch_director(obj):
        """Return ``[name]`` of the first crew entry whose job is 'Director', else ``[]``."""
        for entry in ast.literal_eval(obj):
            if entry['job'] == 'Director':
                return [entry['name'].replace(" ", "")]
        return []

    ps = PorterStemmer()

    def stem(text):
        """Porter-stem every whitespace-separated token of ``text``."""
        return " ".join([ps.stem(word) for word in text.split()])

    genres = df['genres'].apply(names)
    keywords = df['keywords'].apply(names)
    cast = df['cast'].apply(lambda obj: names(obj, limit=3))
    crew = df['crew'].apply(fetch_director)
    overview = df['overview'].apply(lambda text: text.split())

    # Element-wise list concatenation of the per-row token lists.
    tag_tokens = overview + genres + keywords + cast + crew

    # Explicit copy: assigning into a bare column slice is what raised
    # SettingWithCopyWarning in the original implementation.
    new_df = df[['id', 'title']].copy()
    new_df['tags'] = tag_tokens.apply(lambda tokens: " ".join(tokens).lower()).apply(stem)

    return new_df

# --- Build similarity matrix ---

# def build_similarity_matrix(new_df):
#     tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
#     vectors = tfidf.fit_transform(new_df['tags']).toarray()
#     similarity = cosine_similarity(vectors)
#     return similarity

def build_semantic_similarity_matrix(new_df):
    """Embed every row's ``tags`` text and return the pairwise cosine-similarity matrix.

    ``new_df['tags']`` is encoded with the ``all-MiniLM-L6-v2``
    SentenceTransformer model; the result of ``cosine_similarity`` over those
    encodings is returned, with rows/columns in the row order of ``new_df``.
    """
    # Model chosen by the original author as lightweight and accurate.
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    print("Encoding movie tags with SentenceTransformer...")
    tag_texts = new_df['tags'].tolist()
    tag_vectors = encoder.encode(tag_texts, show_progress_bar=True)
    return cosine_similarity(tag_vectors)

# --- Recommend function ---

def recommend(movie, new_df, similarity, top_k=5):
    """Print and return the titles most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up in ``new_df['title']`` (first match is used).
    new_df : pandas.DataFrame
        Frame with a ``title`` column; row *positions* must correspond to the
        rows/columns of ``similarity``.
    similarity : 2-D array-like
        Pairwise similarity scores, indexed by row position.
    top_k : int, default 5
        Number of titles to return.

    Returns
    -------
    list
        Up to ``top_k`` titles ordered by descending similarity, never
        including the queried row itself. ``[]`` if the title is unknown.
    """
    titles = new_df['title'].to_numpy()
    # Positional lookup: index labels need not equal row positions (e.g. after
    # dropna), whereas ``similarity`` is addressed by position.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        print("Movie not found.")
        return []
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the query by position rather than assuming it is ranked first.
    recommendations = [titles[pos] for pos, _ in ranked if pos != movie_pos][:top_k]
    for title in recommendations:
        print(title)
    return recommendations

# --- Multi-movie (per-user) recommend function ---

def recommend_for_user(user_liked_movies, new_df, similarity, top_k=5):
    """Recommend titles from the averaged similarity rows of a user's liked movies.

    Parameters
    ----------
    user_liked_movies : list
        Titles the user likes. Titles absent from ``new_df`` are ignored.
    new_df : pandas.DataFrame
        Frame with a ``title`` column; row *positions* must correspond to the
        rows/columns of ``similarity``.
    similarity : 2-D array-like
        Pairwise similarity scores, indexed by row position.
    top_k : int, default 5
        Number of titles to return.

    Returns
    -------
    list
        Up to ``top_k`` titles ordered by descending mean similarity, excluding
        every title in ``user_liked_movies``. ``[]`` when none of the liked
        titles exists in ``new_df``.
    """
    titles = new_df['title'].to_numpy()
    liked_titles_set = set(user_liked_movies)

    # Title -> first row position. Positions (not index labels) address
    # ``similarity``, and one dict avoids rescanning the frame per liked movie.
    first_pos = {}
    for pos, title in enumerate(titles):
        first_pos.setdefault(title, pos)

    combined_scores = np.zeros(len(new_df))
    matched = 0
    for movie in user_liked_movies:
        pos = first_pos.get(movie)
        if pos is not None:
            combined_scores += similarity[pos]
            matched += 1

    if matched == 0:
        # Nothing to average; avoids a 0/0 division and a meaningless ranking.
        print("\nFor You Recommendations:")
        print("No liked movies were found in the dataset.")
        return []

    # Average over the movies that actually contributed a row.
    combined_scores /= matched

    ranked = sorted(range(len(combined_scores)), key=lambda pos: combined_scores[pos], reverse=True)

    # Exclude liked movies from recommendations
    result = []
    for pos in ranked:
        if len(result) >= top_k:
            break
        if titles[pos] not in liked_titles_set:
            result.append(titles[pos])

    print("\nFor You Recommendations:")
    for title in result:
        print(title)
    return result


# --- Evaluation function ---

from sklearn.metrics.pairwise import cosine_similarity

def evaluate_semantic_hits(user_liked_movies, new_df, embeddings, top_k=5, similarity_threshold=0.6):
    """Leave-one-out evaluation of ``recommend_for_user``.

    Each liked movie is held out in turn, recommendations are generated from
    the remaining ones, and a recommendation counts as a *hit* when the cosine
    similarity between its row and the held-out movie's row in ``embeddings``
    is at least ``similarity_threshold``.

    Parameters
    ----------
    user_liked_movies : list
        Titles liked by the user. Titles absent from ``new_df`` are skipped.
    new_df : pandas.DataFrame
        Frame with a ``title`` column, row-aligned with ``embeddings``.
    embeddings : 2-D array-like
        One row per movie. NOTE(review): this same object is forwarded to
        ``recommend_for_user`` as its ``similarity`` argument, so in practice
        it must be the square similarity matrix, and the cosine below is then
        taken between rows of that matrix -- confirm this is intended.
    top_k : int, default 5
        Number of recommendations requested per held-out movie.
    similarity_threshold : float, default 0.6
        Minimum cosine similarity for a recommendation to count as a hit.

    Returns
    -------
    dict
        ``hit_rate``  - share of evaluated held-out movies with >= 1 hit;
        ``precision`` - share of all produced recommendations that are hits;
        ``recall``    - share of *all* liked movies (including ones missing
        from the dataset) with >= 1 hit. All values lie in [0, 1].
    """
    hit_count = 0        # recommendations that cleared the threshold
    total = 0            # recommendations actually produced
    evaluated = 0        # held-out movies that exist in the dataset
    tests_with_hit = 0   # held-out movies with at least one hit

    # Title -> first row position; positions (not index labels) address ``embeddings``.
    first_pos = {}
    for pos, title in enumerate(new_df['title'].to_numpy()):
        first_pos.setdefault(title, pos)

    for i, test_movie in enumerate(user_liked_movies):
        train_movies = user_liked_movies[:i] + user_liked_movies[i+1:]

        if test_movie not in first_pos:
            continue
        evaluated += 1

        test_emb = embeddings[first_pos[test_movie]]

        # Get recommendation list
        recommended_titles = recommend_for_user(train_movies, new_df, embeddings, top_k)

        hits_here = 0
        for title in recommended_titles:
            rec_emb = embeddings[first_pos[title]]
            sim = cosine_similarity([test_emb], [rec_emb])[0][0]
            if sim >= similarity_threshold:
                hits_here += 1

        hit_count += hits_here
        # Count what was really returned; fewer than top_k titles may come back.
        total += len(recommended_titles)
        if hits_here:
            tests_with_hit += 1

    precision = hit_count / total if total > 0 else 0
    hit_rate = tests_with_hit / evaluated if evaluated > 0 else 0
    recall = tests_with_hit / len(user_liked_movies) if len(user_liked_movies) > 0 else 0

    print("\n--- Semantic Evaluation Metrics (Improved) ---")
    print(f"Hit Rate@{top_k}: {hit_rate:.4f}")
    print(f"Precision@{top_k}: {precision:.4f}")
    print(f"Recall@{top_k}: {recall:.4f}")

    return {'hit_rate': hit_rate, 'precision': precision, 'recall': recall}




base_df = load_and_preprocess('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
extra_df = scrape_and_extend_dataset()
# reset_index after dropna: the similarity matrix is addressed by row position,
# so index labels must stay gap-free for label-based lookups to hit the right row.
new_df = pd.concat([base_df, extra_df], ignore_index=True).dropna().reset_index(drop=True)

# similarity = build_similarity_matrix(new_df)
similarity = build_semantic_similarity_matrix(new_df)


# Example simulated user
user_liked_movies = [
     'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice'
]

# The similarity matrix is passed as the `embeddings` argument here.
evaluate_semantic_hits(user_liked_movies, new_df, similarity, top_k=5, similarity_threshold=0.6)
#recommend('Harry Potter and the Chamber of Secrets', new_df, similarity)

recommend_for_user(user_liked_movies, new_df, similarity, top_k=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower()).apply(stem)
100%|██████████| 2615/2615 [07:22<00:00,  5.91it/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding movie tags with SentenceTransformer...


Batches:   0%|          | 0/232 [00:00<?, ?it/s]


For You Recommendations:
The Amazing Spider-Man 2
Mirror Mirror
Spider-Man 2
Mystery Men
Spider-Man

For You Recommendations:
Iron Man
Mirror Mirror
Mystery Men
Megamind
Hellboy II: The Golden Army

For You Recommendations:
Spider-Man 2
Superman Returns
The Amazing Spider-Man 2
Man of Steel
Fantastic 4: Rise of the Silver Surfer

For You Recommendations:
Mirror Mirror
Hellboy II: The Golden Army
Spider-Man 2
Batman Begins
Spider-Man

For You Recommendations:
Iron Man
Mystery Men
Spider-Man 2
Megamind
Ant-Man

For You Recommendations:
Mirror Mirror
Iron Man
Spider-Man 2
The Amazing Spider-Man 2
Spider-Man

--- Semantic Evaluation Metrics (Improved) ---
Hit Rate@5: 5.0000
Precision@5: 1.0000
Recall@5: 5.0000

For You Recommendations:
Spider-Man 2
Mirror Mirror
The Amazing Spider-Man 2
Iron Man
Mystery Men


['Spider-Man 2',
 'Mirror Mirror',
 'The Amazing Spider-Man 2',
 'Iron Man',
 'Mystery Men']

In [None]:
# Inspect the combined dataset (columns: id, title, tags).
new_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
7416,102616,Zero A. D.,"befor king and empir fear him, a tyrant sought..."
7417,102617,Anaconda,a 'nation geographic' film crew is taken hosta...
7418,102618,The Housemaid,milli is a struggl woman who is reliev to get ...
7419,102619,Marty Supreme,"marti mauser, a young man with a dream no one ..."


In [None]:
movie1 = 'Harry Potter and the Half-Blood Prince'
movie2 = 'Harry Potter and the Order of the Phoenix'

# `similarity` is addressed by row position, so look up positions rather than
# index labels (the two differ whenever the index has gaps).
i1 = np.flatnonzero((new_df['title'] == movie1).to_numpy())[0]
i2 = np.flatnonzero((new_df['title'] == movie2).to_numpy())[0]

print(f"Similarity between '{movie1}' and '{movie2}': {similarity[i1][i2]:.4f}")

Similarity between 'Harry Potter and the Half-Blood Prince' and 'Harry Potter and the Order of the Phoenix': 0.7791


In [None]:
movie1 = 'Harry Potter and the Half-Blood Prince'
movie2 = 'Harry Potter and the Order of the Phoenix'

# Tag string of the first row whose title matches each movie.
tags1 = new_df.loc[new_df['title'] == movie1, 'tags'].iloc[0]
tags2 = new_df.loc[new_df['title'] == movie2, 'tags'].iloc[0]

print(f"Tags for '{movie1}':\n{tags1}\n")
print(f"Tags for '{movie2}':\n{tags2}")


Tags for 'Harry Potter and the Half-Blood Prince':
as harri begin hi sixth year at hogwarts, he discov an old book mark as 'properti of the half-blood prince', and begin to learn more about lord voldemort' dark past. adventur fantasi famili witch magic broom schoolofwitchcraft wizardri apparit teenagecrush werewolf danielradcliff rupertgrint emmawatson davidy

Tags for 'Harry Potter and the Order of the Phoenix':
return for hi fifth year of studi at hogwarts, harri is stun to find that hi warn about the return of lord voldemort have been ignored. left with no choice, harri take matter into hi own hands, train a small group of student – dub 'dumbledore' army' – to defend themselv against the dark arts. adventur fantasi famili mysteri propheci witch lossoflov magic cuttingthecord childhero dyinganddeath broom sorcerer'sapprentic schoolofwitchcraft blackmag deathofafriend sorceri occult danielradcliff rupertgrint emmawatson davidy


In [None]:
from sklearn.neighbors import NearestNeighbors

movies_df = new_df.rename(columns={'id': 'movieId'}).copy()

user_ids = [1, 2, 3, 4, 5]
likes_data = []

# Everyone gets a common core set + a few unique.
# The common set uses seed 0, which is not one of the per-user seeds below;
# reusing seed 1 made user 1's "unique" sample identical to the common set.
common_movies = movies_df['movieId'].sample(5, random_state=0).tolist()

for user in user_ids:
    user_specific = movies_df['movieId'].sample(5, random_state=user).tolist()
    # sorted() gives a deterministic row order in likes_df.
    liked = sorted(set(common_movies + user_specific))
    for movie_id in liked:
        likes_data.append({'userId': user, 'movieId': movie_id})

likes_df = pd.DataFrame(likes_data)


# Step 2: Build user-movie matrix
def build_user_movie_matrix(likes_df):
    """Pivot (userId, movieId) like records into a user x movie matrix.

    Rows are ``userId`` values, columns are ``movieId`` values; a cell is 1
    where a like record exists and 0 where it does not.
    """
    flagged = likes_df.assign(like=1)
    matrix = pd.pivot_table(flagged, index='userId', columns='movieId', values='like')
    return matrix.fillna(0)

# Users as rows, movie ids as columns; 1 where the user liked the movie, 0 otherwise.
user_movie_matrix = build_user_movie_matrix(likes_df)

# Step 3: CF Recommendation function
def get_cf_recommendations(user_id, user_movie_matrix, movies_df, top_n=10, k_neighbors=5):
    """User-based collaborative filtering over a binary like matrix.

    The ``k_neighbors`` users closest to ``user_id`` by cosine distance are
    found; every movie they liked that the target user has not is scored by
    the summed similarity (``1 - cosine distance``) of the neighbours who
    liked it.

    Parameters
    ----------
    user_id : hashable
        Row label of the target user in ``user_movie_matrix``.
    user_movie_matrix : pandas.DataFrame
        Users as rows, movie ids as columns, values > 0 meaning "liked".
    movies_df : pandas.DataFrame
        Must contain ``movieId`` and ``title`` columns.
    top_n : int, default 10
        Maximum number of recommendations.
    k_neighbors : int, default 5
        Number of neighbouring users to consult.

    Returns
    -------
    pandas.DataFrame or list
        Frame with columns ``title`` and ``cf_score`` ordered by descending
        score; an empty list when the user is unknown or nothing can be
        recommended.
    """
    if user_id not in user_movie_matrix.index:
        print("User not found.")
        return []

    user_vector = user_movie_matrix.loc[user_id].to_numpy().reshape(1, -1)

    # Ask for one extra neighbour because the target user is in the fitted data.
    n_neighbors = min(k_neighbors + 1, len(user_movie_matrix))
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors)
    knn.fit(user_movie_matrix.to_numpy())

    distances, indices = knn.kneighbors(user_vector)
    neighbor_labels = user_movie_matrix.index[indices.flatten()]
    neighbor_distances = distances.flatten()

    # Remove the target user by label: with tied distances (identical users) it
    # is not guaranteed to be the first neighbour returned.
    not_self = neighbor_labels != user_id
    sim_users = neighbor_labels[not_self][:k_neighbors]
    sim_distances = neighbor_distances[not_self][:k_neighbors]

    target_likes = user_movie_matrix.loc[user_id]
    liked_by_user = set(target_likes[target_likes > 0].index)

    scores = {}
    for sim_user, distance in zip(sim_users, sim_distances):
        sim_score = 1 - distance
        sim_likes = user_movie_matrix.loc[sim_user]
        for movie_id in sim_likes[sim_likes > 0].index:
            if movie_id not in liked_by_user:
                scores[movie_id] = scores.get(movie_id, 0) + sim_score

    if not scores:
        print("No recommendations could be generated.")
        return []

    recs = pd.DataFrame(list(scores.items()), columns=['movieId', 'cf_score'])
    recs = recs.sort_values(by='cf_score', ascending=False).head(top_n)
    # One title per movieId, so the left merge cannot multiply rows when
    # movies_df contains repeated ids.
    title_lookup = movies_df[['movieId', 'title']].drop_duplicates(subset='movieId')
    recs = pd.merge(recs, title_lookup, on='movieId', how='left')

    print("\nPeople like you like:")
    for title in recs['title']:
        print(title)

    return recs[['title', 'cf_score']]


# Demo: top-5 collaborative-filtering recommendations for user 1.
get_cf_recommendations(user_id=1, user_movie_matrix=user_movie_matrix, movies_df=movies_df, top_n=5)



People like you like:
Boat Trip
Hot Tub Time Machine
Never Say Never Again
Let Him Go
Landscape with Invisible Hand


Unnamed: 0,title,cf_score
0,Boat Trip,0.707107
1,Hot Tub Time Machine,0.707107
2,Never Say Never Again,0.707107
3,Let Him Go,0.707107
4,Landscape with Invisible Hand,0.707107


In [None]:
import pickle

# Saving the content-based model
# `with` closes each file handle, so the pickles are fully flushed to disk.
with open("cbf_movie_data.pkl", "wb") as fh:
    pickle.dump(new_df, fh)
with open("cbf_similarity.pkl", "wb") as fh:
    pickle.dump(similarity, fh)

# Saving collaborative model dependencies
with open("cf_movies.pkl", "wb") as fh:
    pickle.dump(movies_df, fh)
with open("cf_likes.pkl", "wb") as fh:
    pickle.dump(likes_df, fh)


In [None]:
from google.colab import files

# Trigger downloads of the saved pickles via google.colab's `files` helper.
# NOTE(review): cf_likes.pkl is written by the save cell but is not downloaded here -- confirm whether that is intentional.
files.download("cbf_movie_data.pkl")
files.download("cbf_similarity.pkl")
files.download("cf_movies.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>