In [7]:
import pandas as pd
import numpy as np
import os


# Data files are located relative to the directory the kernel is running in.
script_dir = os.getcwd()
print(f"Current working directory: {script_dir}")

Current working directory: c:\Users\willi\OneDrive\Documents\GitHub\Test\Movie-Recommendation


In [None]:
# Read the cleaned ratings table (columns: userId, imdbId, rating).
ratings_file = os.path.join(
    script_dir,
    "Cleaned Datasets",
    "ratings_imdb_matched.csv",
)
df_ratings = pd.read_csv(ratings_file)

# Uncomment to preview the first rows:
# print(df_ratings.head(10))

   userId  imdbId  rating
0       1  114709     0.8
1       1  113228     0.8
2       1  113277     0.8
3       1  114369     1.0
4       1  114814     1.0
5       1  116367     0.6
6       1  115734     1.0
7       1  112573     0.8
8       1  114287     1.0
9       1  109370     1.0


In [10]:
# Reshape the long ratings table into a wide one: one row per userId, one
# column per imdbId. Missing (user, movie) pairs are filled with 0.
user_item_matrix = (
    df_ratings
    .pivot(index="userId", columns="imdbId", values="rating")
    .fillna(0)
)

# Plain NumPy array holding the same values, for the numeric work that follows.
ratings_matrix = user_item_matrix.to_numpy()

print(user_item_matrix.head(10))


imdbId  417      439      516      4972     6333     6864     7264     \
userId                                                                  
1           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
2           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
3           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
4           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
5           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
6           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
7           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
8           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
9           0.0      0.0      0.0      0.0      0.0      0.0      0.0   
10          0.0      0.0      0.0      0.0      0.0      0.0      0.0   

imdbId  7361     8133     10040    ...  7539884  7544820  7620650  7681902  \
userId                             ...       

In [11]:
from numpy.linalg import norm

def cosine_similarity(movie1, movie2):
    """Return the cosine similarity of two rating vectors.

    The result is ``dot(movie1, movie2) / (norm(movie1) * norm(movie2))``.
    When the product of the norms is 0 the function returns 0 instead of
    dividing by zero.
    """
    numerator = np.dot(movie1, movie2)
    denominator = norm(movie1) * norm(movie2)
    if denominator == 0:
        return 0
    return numerator / denominator

# Create similarity matrix
# Item-item cosine similarity for every pair of movies, computed in one matrix
# product instead of calling cosine_similarity() once per (i, j) pair in a
# Python double loop (num_movies ** 2 calls).

# Euclidean norm of each movie's rating column.
column_norms = norm(ratings_matrix, axis=0)

# A column with norm 0 is divided by 1 instead, which leaves it all zeros, so
# every similarity involving it is 0 - the same guard cosine_similarity() applies.
safe_norms = np.where(column_norms == 0, 1.0, column_norms)

# Scale every column to unit length; the dot product of two unit columns is
# their cosine similarity.
unit_columns = ratings_matrix / safe_norms
similarity_matrix = unit_columns.T @ unit_columns

# Convert to DataFrame, labelled by imdbId on both axes
movie_similarity_df = pd.DataFrame(
    similarity_matrix,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns,
)

print(movie_similarity_df.head(10))


imdbId   417       439      516       4972     6333      6864     7264     \
imdbId                                                                      
417     1.000000  0.107161      0.0  0.000000      0.0  0.000000      0.0   
439     0.107161  1.000000      0.0  0.970143      0.0  0.970143      0.0   
516     0.000000  0.000000      1.0  0.000000      0.0  0.000000      1.0   
4972    0.000000  0.970143      0.0  1.000000      0.0  1.000000      0.0   
6333    0.000000  0.000000      0.0  0.000000      1.0  0.000000      0.0   
6864    0.000000  0.970143      0.0  1.000000      0.0  1.000000      0.0   
7264    0.000000  0.000000      1.0  0.000000      0.0  0.000000      1.0   
7361    0.378717  0.000000      0.0  0.000000      0.0  0.000000      0.0   
8133    0.568075  0.000000      0.0  0.000000      0.0  0.000000      0.0   
10040   0.000000  0.000000      0.0  0.000000      0.0  0.000000      0.0   

imdbId   7361      8133     10040    ...   7539884  7544820  7620650  76819

In [12]:
def recommend_movies(user_id, num_recommendations=5):
    """Recommend movies the user has not rated, using item-item similarity.

    Each candidate movie is scored as the sum, over the movies the user has
    rated, of ``similarity(candidate, rated_movie) * user's rating``. The
    candidates with the highest scores are returned.

    Reads the module-level ``user_item_matrix`` (one row per user, one column
    per movie, 0 meaning "not rated") and ``movie_similarity_df`` (movie x
    movie similarities).

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        num_recommendations: Maximum number of movie ids to return.

    Returns:
        A list of movie ids ordered from highest to lowest score. The list is
        empty when the user has no rating above 0.

    Raises:
        KeyError: If ``user_id`` is not a row label of ``user_item_matrix``.
    """
    user_ratings = user_item_matrix.loc[user_id]

    # Only ratings above 0 count as "rated"; 0 is the fill value for missing.
    rated_ratings = user_ratings[user_ratings > 0]
    if rated_ratings.empty:
        # Nothing to base a score on, so there is nothing to recommend.
        return []
    rated_movies = rated_ratings.index

    # Candidates are all movies in the similarity table the user has not rated.
    is_candidate = ~movie_similarity_df.index.isin(rated_movies)
    similarity_to_rated = movie_similarity_df.loc[is_candidate, rated_movies]

    # Weighted sum for every candidate at once: (candidates x rated) . (rated,)
    movie_scores = similarity_to_rated.dot(rated_ratings)

    # Stable sort so that equal scores come back in a deterministic order.
    ranked = movie_scores.sort_values(ascending=False, kind="mergesort")
    return ranked.index[:num_recommendations].tolist()

# Example: show the top five recommendations for user 1.
print(
    "Recommended movies:",
    recommend_movies(user_id=1, num_recommendations=5),
)


Recommended movies: [91042, 95016, 88847, 119116, 90605]
