In [5]:
!pip install chromadb



In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import chromadb

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the raw ratings and the movie catalogue from CSV files in the working directory.
# ratings carries userId / movieId / rating / timestamp columns (see ratings.head() further down).
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [9]:
# Flag every rating row as an interaction; the pivot that follows uses this column as its cell value.
ratings['watched'] = 1

In [10]:
# Create the interaction matrix (users as rows, movies as columns).
# Each cell aggregates the 'watched' flag for one (userId, movieId) pair; pairs with no
# rating row are filled with 0.
interaction_matrix = ratings.pivot_table(
    index='userId',
    columns='movieId',
    values='watched',
    fill_value=0
)

# Display the first few rows
interaction_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Pairwise cosine similarity between the rows (users) of the interaction matrix.
user_similarity = cosine_similarity(interaction_matrix)
# Label both axes with userId so similarities can be looked up by user id instead of by position.
user_similarity_df = pd.DataFrame(user_similarity, index = interaction_matrix.index, columns = interaction_matrix.index)
user_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.024383,0.07359,0.201021,0.128669,0.122266,0.138455,0.143648,0.0484,0.033292,...,0.07186,0.152564,0.235176,0.078784,0.154571,0.165157,0.288062,0.309738,0.09714,0.125545
2,0.024383,1.0,0.0,0.012635,0.027995,0.020959,0.045186,0.027086,0.0,0.078471,...,0.203251,0.015982,0.012094,0.0,0.0,0.027806,0.013579,0.03865,0.030528,0.092633
3,0.07359,0.0,1.0,0.010895,0.02414,0.02711,0.0,0.023357,0.0,0.0,...,0.031867,0.027563,0.04693,0.0,0.010771,0.038364,0.046839,0.049993,0.0,0.031064
4,0.201021,0.012635,0.010895,1.0,0.123091,0.103675,0.121415,0.089324,0.010032,0.051755,...,0.101556,0.13469,0.307987,0.061237,0.073231,0.195617,0.144295,0.174664,0.033558,0.099941
5,0.128669,0.027995,0.02414,0.123091,1.0,0.306275,0.110051,0.41781,0.0,0.038224,...,0.060003,0.402224,0.122732,0.286436,0.141973,0.10384,0.165365,0.15689,0.247841,0.058492


Step 3: Recommendations for the target user

In [12]:
# Peek at the ratings frame; it now carries the added 'watched' flag column.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,watched
0,1,1,4.0,964982703,1
1,1,3,4.0,964981247,1
2,1,6,4.0,964982224,1
3,1,47,5.0,964983815,1
4,1,50,5.0,964982931,1


In [13]:
def recommend_movies_for_user(target_user, top_n=5, top_sim_users=10):
    """Recommend unwatched movies for ``target_user`` from its most similar users.

    Reads the notebook-level ``user_similarity_df``, ``interaction_matrix`` and
    ``movies`` frames.

    Parameters
    ----------
    target_user
        A label present in the index of ``user_similarity_df`` and
        ``interaction_matrix``.
    top_n : int
        Maximum number of movies to return.
    top_sim_users : int
        Number of most similar users whose interactions are summed.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title`` and ``score``, ordered by descending
        score. ``score`` is the number of selected neighbours that watched
        the movie; ``title`` is NaN when the id is missing from ``movies``.

    Raises
    ------
    KeyError
        If ``target_user`` is not a known user label.
    """
    # Remove the target by label instead of slicing off the first sorted entry:
    # when another user ties with the target's self-similarity, the target is
    # not guaranteed to sort first.
    similar_users = (
        user_similarity_df[target_user]
        .drop(target_user)
        .sort_values(ascending=False)
    )
    top_similar_users = similar_users.head(top_sim_users).index

    # Column-wise sum = how many of the selected neighbours watched each movie.
    watched_by_similar = interaction_matrix.loc[top_similar_users].sum()

    user_watched = interaction_matrix.loc[target_user]
    not_watched = user_watched[user_watched == 0]

    recommendations = (
        watched_by_similar[not_watched.index]
        .sort_values(ascending=False)
        .head(top_n)
    )

    # Use the `movies` frame loaded at the top of the notebook; `movies_df` is
    # only created in a later cell, so relying on it breaks Restart & Run All.
    rec_df = (
        recommendations.rename('score')
        .reset_index()
        .merge(movies[['movieId', 'title']], on='movieId', how='left')
        .sort_values('score', ascending=False)
        .reset_index(drop=True)
    )
    return rec_df[['movieId', 'title', 'score']]


In [15]:
def get_recommended_movies(
    target_user_id,
    interaction_matrix,
    similarity_matrix,
    top_n_similar_users=10,
    top_n_recommendations=5
):
    """Score unwatched movies for a user by counting watches among similar users.

    Parameters
    ----------
    target_user_id
        Label of the user in ``interaction_matrix.index``.
    interaction_matrix : pandas.DataFrame
        Users as rows, movies as columns; a cell equal to 0 means "not watched".
    similarity_matrix : numpy.ndarray or pandas.DataFrame
        Square user-by-user similarity matrix whose row/column order matches
        ``interaction_matrix.index``. It is always read by position.
    top_n_similar_users : int
        Number of most similar users to aggregate.
    top_n_recommendations : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.Series
        Scores indexed by movie id, highest first, restricted to movies the
        target user has not watched.

    Raises
    ------
    KeyError
        If ``target_user_id`` is not in ``interaction_matrix.index``.
    """
    import numpy as np

    user_index = interaction_matrix.index.get_loc(target_user_id)

    # np.asarray gives positional row access for both ndarray and DataFrame
    # input; plain `df[user_index]` would select a *column by label* instead.
    similarity_scores = np.asarray(similarity_matrix)[user_index]

    # Rank every user from most to least similar, then drop the target user
    # explicitly. Slicing off the first entry is only correct when the target
    # happens to sort first, which is not guaranteed when similarities tie.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    ranked_indices = ranked_indices[ranked_indices != user_index]
    similar_user_indices = ranked_indices[:top_n_similar_users]

    similar_user_ids = interaction_matrix.index[similar_user_indices]
    similar_users_data = interaction_matrix.loc[similar_user_ids]

    # Column-wise sum = number of selected neighbours that watched each movie.
    movie_scores = similar_users_data.sum(axis=0)

    watched_movies = interaction_matrix.loc[target_user_id]
    unwatched_movies = watched_movies[watched_movies == 0].index
    movie_scores = movie_scores[unwatched_movies]
    top_recommendations = movie_scores.sort_values(ascending=False).head(top_n_recommendations)

    return top_recommendations



In [16]:
def print_recommended_movies(recommendations, movies_df, target_user_id):
    """Print one line per recommended movie, preceded by a header.

    ``recommendations`` is a Series of scores indexed by movie id;
    ``movies_df`` supplies the ``movieId`` -> ``title`` lookup. Ids that are
    not in ``movies_df`` are printed as "Unknown Title".
    """
    titles_by_id = {
        mid: name for mid, name in zip(movies_df["movieId"], movies_df["title"])
    }

    header = f"\n Top {len(recommendations)} Recommendations for User {target_user_id}:\n"
    print(header)

    for movie_id, score in zip(recommendations.index, recommendations.tolist()):
        if movie_id in titles_by_id:
            title = titles_by_id[movie_id]
        else:
            title = "Unknown Title"
        print(f"{movie_id}: {title} (Score: {score:.1f})")



In [18]:
import pandas as pd


# Read the movie catalogue again under the name `movies_df` (the same file was
# already loaded as `movies` earlier in the notebook).
movies_df = pd.read_csv("movies.csv")


# User whose recommendations are generated and printed in this cell.
user_id = 3

# `user_similarity` is the raw array returned by cosine_similarity, so the
# function looks the target's row up by position.
recommendations = get_recommended_movies(
    target_user_id=user_id,
    interaction_matrix=interaction_matrix,
    similarity_matrix=user_similarity,
    top_n_similar_users=10,
    top_n_recommendations=5
)


print_recommended_movies(recommendations, movies_df, user_id)



 Top 5 Recommendations for User 3:

1214: Alien (1979) (Score: 8.0)
1200: Aliens (1986) (Score: 8.0)
1196: Star Wars: Episode V - The Empire Strikes Back (1980) (Score: 8.0)
1259: Stand by Me (1986) (Score: 8.0)
1: Toy Story (1995) (Score: 7.0)


Step 5: Nearest-neighbour recommendations with a ChromaDB vector index

In [19]:
# Every rating row counts as one "watched" interaction.
ratings["watched"] = 1

# Users x movies matrix of interaction flags (0 where the user has no rating
# row for the movie), stored as float32.
interaction_matrix = ratings.pivot_table(
    index="userId", columns="movieId", values="watched", fill_value=0
).astype(np.float32)


# L2-normalise each user row so every stored embedding has unit length.
user_matrix = normalize(interaction_matrix.values, norm="l2", axis=1)
# Ids are handed to the collection as strings; nearest_users converts them back to int.
user_ids = interaction_matrix.index.astype(str).tolist()

# Positional column index -> movieId, used to map score positions back to movies.
movie_ids = interaction_matrix.columns.tolist()
col_to_movie = dict(enumerate(movie_ids))


client = chromadb.PersistentClient(path="./chroma_user_vectors")


# Start from an empty collection on every run. Deleting fails when the
# collection does not exist yet (e.g. on the first run); that case is ignored
# on purpose. `Exception` is caught instead of a bare `except:` so that
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    client.delete_collection("user_vectors")
except Exception:
    pass

collection = client.create_collection(
    name="user_vectors",
    metadata={"hnsw:space": "cosine"}
)


# One embedding per user, keyed by the stringified userId.
collection.add(
    ids=user_ids,
    embeddings=user_matrix.tolist(),
    metadatas=[{"userId": uid} for uid in user_ids]
)


def nearest_users(target_user, k=10):
    """Return up to ``k`` nearest neighbours of ``target_user`` from the vector index.

    Reads the notebook-level ``interaction_matrix``, ``user_matrix`` and
    ``collection`` objects.

    Parameters
    ----------
    target_user
        Label of the user in ``interaction_matrix.index``.
    k : int
        Maximum number of neighbours to return.

    Returns
    -------
    tuple[list[int], list]
        Neighbour user ids and their distances, in the order returned by the
        query. The target user is never included.

    Raises
    ------
    KeyError
        If ``target_user`` is not in ``interaction_matrix.index``.
    """
    row_idx = interaction_matrix.index.get_loc(target_user)
    target_vector = user_matrix[row_idx].tolist()

    # Ask for one extra hit because the target's own vector is stored in the
    # collection and is expected to come back among the results.
    result = collection.query(
        query_embeddings=[target_vector],
        n_results=k + 1
    )

    ids = [int(u) for u in result["ids"][0]]
    distances = result["distances"][0]

    # Drop the target wherever it appears (not only when it is the first hit)
    # and cap the result at k, so callers never see the target as its own
    # neighbour and never receive k + 1 entries.
    kept = [
        (uid, dist) for uid, dist in zip(ids, distances) if uid != target_user
    ][:k]
    neighbor_ids = [uid for uid, _ in kept]
    neighbor_distances = [dist for _, dist in kept]

    return neighbor_ids, neighbor_distances


def recommend_for_user(target_user, top_n=10, k_neighbors=10):
    """Recommend unseen movies for ``target_user``, weighted by neighbour similarity.

    Reads the notebook-level ``interaction_matrix`` and ``movies`` frames and
    calls ``nearest_users`` for the neighbour lookup.

    Parameters
    ----------
    target_user
        Label of the user in ``interaction_matrix.index``.
    top_n : int
        Maximum number of movies to return.
    k_neighbors : int
        Number of neighbours requested from ``nearest_users``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title`` and ``score``, highest score first.
        Movies the target has already interacted with are never returned, so
        the frame can have fewer than ``top_n`` rows.
    """
    neighbors, distances = nearest_users(target_user, k=k_neighbors)
    # The collection is created with cosine space, so distances are turned
    # back into similarities with 1 - distance.
    sims = 1 - np.array(distances)

    # (movies x neighbours) @ (neighbours,) -> one similarity-weighted score per movie.
    neighbor_matrix = interaction_matrix.loc[neighbors].values
    weighted_scores = (neighbor_matrix.T @ sims)

    target_seen = interaction_matrix.loc[target_user].values.astype(bool)
    # Mask seen movies with -inf rather than 0: a 0 would tie with unseen
    # movies that no neighbour watched, letting already-seen titles leak into
    # the recommendations.
    weighted_scores[target_seen] = -np.inf

    top_idx = np.argsort(weighted_scores)[::-1][:top_n]
    # Discard masked (seen) positions that were only picked up because fewer
    # than top_n unseen movies exist.
    top_idx = top_idx[np.isfinite(weighted_scores[top_idx])]

    # Map positions back to movie ids through the matrix's own column index,
    # which also keeps an integer dtype when the selection is empty.
    top_movie_ids = interaction_matrix.columns[top_idx].to_numpy()
    top_scores = weighted_scores[top_idx]

    recs = pd.DataFrame({"movieId": top_movie_ids, "score": top_scores})
    recs = recs.merge(movies[["movieId", "title"]], on="movieId", how="left")
    return recs[["movieId", "title", "score"]]


# Demo run for a single user; the guard only matters if this code is ever
# imported as a module instead of being executed directly.
if __name__ == "__main__":
    user_id = 1
    recommendations = recommend_for_user(user_id, top_n=10, k_neighbors=10)
    print(f"Top recommendations for User {user_id}:")
    print(recommendations)

Top recommendations for User 1:
   movieId                              title     score
0     1968         Breakfast Club, The (1985)  3.360427
1      924       2001: A Space Odyssey (1968)  3.360427
2     1610   Hunt for Red October, The (1990)  3.039627
3      589  Terminator 2: Judgment Day (1991)  3.039627
4      858              Godfather, The (1972)  3.032563
5     1200                      Aliens (1986)  3.031750
6     1036                    Die Hard (1988)  3.022057
7     2762            Sixth Sense, The (1999)  2.711763
8     2791                   Airplane! (1980)  2.704796
9      541                Blade Runner (1982)  2.701217


In [20]:
import pandas as pd
import numpy as np

# ===== Load datasets =====
# Re-read both CSV files so this cell does not depend on earlier notebook state.
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# Step 1: Create interaction matrix
# Rows are users, columns are movies; missing (user, movie) pairs are filled with 0.
ratings['watched'] = 1
interaction_matrix = ratings.pivot_table(index='userId', columns='movieId', values='watched', fill_value=0)

# Step 2: Calculate similarity
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(interaction_matrix)
# NOTE: from here on `user_similarity` is a DataFrame labelled by userId on both axes,
# no longer the raw array it was bound to earlier in the notebook.
user_similarity = pd.DataFrame(user_similarity, index=interaction_matrix.index, columns=interaction_matrix.index)

# Step 3 & 4: Recommendation function
def get_recommended_movies(target_user_id, interaction_matrix, similarity_matrix, top_n_similar_users=10, top_n_recommendations=5):
    """Score unwatched movies for a user by counting watches among similar users.

    Parameters
    ----------
    target_user_id
        Label of the user in ``interaction_matrix.index``.
    interaction_matrix : pandas.DataFrame
        Users as rows, movies as columns; a cell equal to 0 means "not watched".
    similarity_matrix : pandas.DataFrame or numpy.ndarray
        Square user-by-user similarity matrix whose row/column order matches
        ``interaction_matrix.index``. It is always read by position.
    top_n_similar_users : int
        Number of most similar users to aggregate.
    top_n_recommendations : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.Series
        Scores indexed by movie id, highest first, restricted to movies the
        target user has not watched. An empty float Series is returned when
        ``target_user_id`` is unknown (cold start).
    """
    import numpy as np

    if target_user_id not in interaction_matrix.index:
        return pd.Series(dtype=float)  # No data for this user

    user_index = interaction_matrix.index.get_loc(target_user_id)

    # Work on a plain array: this accepts both a DataFrame and an ndarray and
    # avoids calling np.argsort on a pandas Series.
    similarity_scores = np.asarray(similarity_matrix)[user_index]

    # Rank every user from most to least similar, then drop the target user
    # explicitly. Slicing off the first entry is only correct when the target
    # happens to sort first, which is not guaranteed when similarities tie.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    ranked_indices = ranked_indices[ranked_indices != user_index]
    similar_user_indices = ranked_indices[:top_n_similar_users]

    similar_user_ids = interaction_matrix.index[similar_user_indices]
    similar_users_data = interaction_matrix.loc[similar_user_ids]

    # Column-wise sum = number of selected neighbours that watched each movie.
    movie_scores = similar_users_data.sum(axis=0)

    watched_movies = interaction_matrix.loc[target_user_id]
    unwatched_movies = watched_movies[watched_movies == 0].index
    movie_scores = movie_scores[unwatched_movies]
    top_recommendations = movie_scores.sort_values(ascending=False).head(top_n_recommendations)
    return top_recommendations

def print_recommended_movies(recommendations, movies_df, target_user_id):
    """Print the recommendations for one user, or a notice when there are none.

    ``recommendations`` is a Series of scores indexed by movie id;
    ``movies_df`` supplies the ``movieId`` -> ``title`` lookup. Ids that are
    not in ``movies_df`` are printed as "Unknown Title".
    """
    titles_by_id = {
        mid: name for mid, name in zip(movies_df["movieId"], movies_df["title"])
    }

    if not recommendations.empty:
        print(f"\n🎬 Top {len(recommendations)} Recommendations for User {target_user_id}:\n")
        for movie_id, score in zip(recommendations.index, recommendations.tolist()):
            if movie_id in titles_by_id:
                title = titles_by_id[movie_id]
            else:
                title = "Unknown Title"
            print(f"{movie_id}: {title} (Score: {score:.1f})")
    else:
        # Nothing to list (e.g. a cold-start user): emit a single notice line.
        print(f"⚠ No recommendations available for User {target_user_id}.")

# ===== Test Case 1: a mix of existing users and one cold-start id =====
# List of test users (some real, some cold start)
test_users = [1, 5, 10, 50, 200, 610, 9999]  # 9999 = cold start

for uid in test_users:
    print(f"\n===== Testing for User {uid} =====")
    # `user_similarity` is the userId-labelled DataFrame built earlier in this cell.
    recs = get_recommended_movies(uid, interaction_matrix, user_similarity)
    print_recommended_movies(recs, movies, uid)


# ===== Test Case 2: Cold start user =====
# Same id as the last entry of test_users; repeated here as a standalone check.
cold_user_id = 9999  # Doesn't exist in dataset
print("\n⚠ Cold Start User (No History):")
recs = get_recommended_movies(cold_user_id, interaction_matrix, user_similarity)
print_recommended_movies(recs, movies, cold_user_id)

# Count ratings per movie once; value_counts() returns the counts sorted from
# most to least rated, and both checks below reuse the same Series.
movie_rating_counts = ratings['movieId'].value_counts()

# ===== Test Case 3: Items with few interactions =====
print("\n🔍 Checking movies with few interactions (< 5 ratings):")
rare_movies = movie_rating_counts[movie_rating_counts < 5]
print(f"Found {len(rare_movies)} rare movies.")
print(rare_movies.head())

# ===== Test Case 4: Very popular items =====
print("\n🌟 Checking top 5 most popular movies:")
# head() keeps the five highest counts because the Series is already sorted descending.
popular_movies = movie_rating_counts.head()
# NOTE: isin() keeps the row order of `movies`, so the titles are not printed in popularity order.
popular_movie_titles = movies[movies['movieId'].isin(popular_movies.index)]
print(popular_movie_titles)



===== Testing for User 1 =====

🎬 Top 5 Recommendations for User 1:

924: 2001: A Space Odyssey (1968) (Score: 10.0)
1968: Breakfast Club, The (1985) (Score: 10.0)
1610: Hunt for Red October, The (1990) (Score: 9.0)
858: Godfather, The (1972) (Score: 9.0)
1200: Aliens (1986) (Score: 9.0)

===== Testing for User 5 =====

🎬 Top 5 Recommendations for User 5:

356: Forrest Gump (1994) (Score: 8.0)
434: Cliffhanger (1993) (Score: 8.0)
165: Die Hard: With a Vengeance (1995) (Score: 8.0)
208: Waterworld (1995) (Score: 7.0)
480: Jurassic Park (1993) (Score: 7.0)

===== Testing for User 10 =====

🎬 Top 5 Recommendations for User 10:

79132: Inception (2010) (Score: 8.0)
1265: Groundhog Day (1993) (Score: 7.0)
78499: Toy Story 3 (2010) (Score: 7.0)
69122: Hangover, The (2009) (Score: 7.0)
6539: Pirates of the Caribbean: The Curse of the Black Pearl (2003) (Score: 6.0)

===== Testing for User 50 =====

🎬 Top 5 Recommendations for User 50:

91529: Dark Knight Rises, The (2012) (Score: 10.0)
59315