# Movie Recommendation System

## Library Imports and Constants Initialization

In [43]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.neighbors import NearestNeighbors

## Data Loading

In [44]:
movies_df = pd.read_csv(os.path.join(os.getcwd(), 'data', "movies.csv"))
ratings_df = pd.read_csv(os.path.join(os.getcwd(), 'data', "ratings.csv")).iloc[:500000, :] # Using 500,000 for now due to sheer size of original
tags_df = pd.read_csv(os.path.join(os.getcwd(), 'data', "tags.csv"))
genome_scores_df = pd.read_csv(os.path.join(os.getcwd(), 'data', "genome-scores.csv"))
genome_tags_df = pd.read_csv(os.path.join(os.getcwd(), 'data', "genome-tags.csv"))

## Dataset Exploration

### Movies Info

In [45]:
print("\nGeneral information of the movies' dataset:")
print(movies_df.info())

print("------------------------------------------------------------------")

print("\nHead of the movies' dataset:")
print(movies_df.head())

print("------------------------------------------------------------------")

print("\nShape of the movies' dataset:", movies_df.shape)

print("------------------------------------------------------------------")

print("\nMissing values of the movies' dataset:")
print(movies_df.isnull().sum())


General information of the movies' dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
None
------------------------------------------------------------------

Head of the movies' dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3            

### Ratings Info

In [46]:
print("\nGeneral information of the ratings' dataset:")
print(ratings_df.info())
# Timestamp unnecessary

print("------------------------------------------------------------------")

print("\nHead of the ratings' dataset:")
print(ratings_df.head())

print("------------------------------------------------------------------")

print("\nShape of the ratings' dataset:", ratings_df.shape)

print("------------------------------------------------------------------")

print("\nMissing values of the ratings' dataset:")
print(ratings_df.isnull().sum())


General information of the ratings' dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     500000 non-null  int64  
 1   movieId    500000 non-null  int64  
 2   rating     500000 non-null  float64
 3   timestamp  500000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 15.3 MB
None
------------------------------------------------------------------

Head of the ratings' dataset:
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510
------------------------------------------------------------------

Shape of the ratings' dataset: (500000, 4)
------------------------------------------------------------------

Missing values of the ratings' 

### Tags Info

In [47]:
print("\nGeneral information of the tags' dataset:")
print(tags_df.info())
# Timestamp unnecessary
# Tags are user defined, so a lot of them (although the same) will be spelled differently. 
    # i.e. sci-fi vs scifi, 90's vs 90s, and Horror vs horror

print("------------------------------------------------------------------")

print("\nHead of the tags' dataset:")
print(tags_df.head())

print("------------------------------------------------------------------")

print("\nShape of the tags' dataset:", tags_df.shape)
print("------------------------------------------------------------------")

print("\nMissing values of the tags' dataset:")
print(tags_df.isnull().sum())
# 16 missing values

print("------------------------------------------------------------------")

print("\nUnique values tags' in dataset:")
print(tags_df.tag.nunique())


General information of the tags' dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093360 entries, 0 to 1093359
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1093360 non-null  int64 
 1   movieId    1093360 non-null  int64 
 2   tag        1093344 non-null  object
 3   timestamp  1093360 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.4+ MB
None
------------------------------------------------------------------

Head of the tags' dataset:
   userId  movieId               tag   timestamp
0       3      260           classic  1439472355
1       3      260            sci-fi  1439472256
2       4     1732       dark comedy  1573943598
3       4     1732    great dialogue  1573943604
4       4     7569  so bad it's good  1573943455
------------------------------------------------------------------

Shape of the tags' dataset: (1093360, 4)
---------------------------------------------

## Initial Data Preprocessing for Ratings, and Tags

### Ratings Preprocessing

In [48]:
print(ratings_df.head())
ratings_cleaned = ratings_df.copy()
ratings_cleaned = ratings_cleaned.drop(columns=['timestamp'])
ratings_cleaned = ratings_cleaned[ratings_cleaned['rating'] >= 4.0]


   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


### Tags Preprocessing

In [49]:
def preprocess_tags(tag):
    """Preprocess tags by lowercasing, removing parentheses, hyphens, and 'based'."""
    if pd.isna(tag):
        return tag  # Return NaN as is
    tag = tag.lower()
    tag = re.sub(r'\(.*?\)', '', tag)
    tag = tag.replace('-', '')
    return tag.strip()

In [50]:
tags_cleaned = tags_df.copy()
print(tags_df.head())

tags_cleaned = tags_cleaned.drop(columns=['timestamp'])
tags_cleaned = tags_cleaned.dropna()

tags_cleaned['tag'] = tags_cleaned['tag'].apply(preprocess_tags)
tags_cleaned = tags_cleaned[~tags_cleaned['tag'].str.contains('based', na=False)]
print(tags_cleaned.head())

   userId  movieId               tag   timestamp
0       3      260           classic  1439472355
1       3      260            sci-fi  1439472256
2       4     1732       dark comedy  1573943598
3       4     1732    great dialogue  1573943604
4       4     7569  so bad it's good  1573943455
   userId  movieId               tag
0       3      260           classic
1       3      260             scifi
2       4     1732       dark comedy
3       4     1732    great dialogue
4       4     7569  so bad it's good


### Movies Preprocessing

In [51]:
def preprocess_movies(genre):
    """Convert genre strings into lowercase."""
    return genre.lower()  # Make sure to return the processed genre

In [52]:
movies_cleaned = movies_df.copy()  # Create a copy to avoid modifying the original DataFrame
print(movies_df.head())

movies_cleaned['genres'] = movies_cleaned['genres'].apply(preprocess_movies)
print(movies_cleaned.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  adventure|animation|children|comedy|fantasy  
1     

## Merging datasets to create Movies-based dataset, movies_genres&tags

### Creating initial movies_genres&tags dataset

In [53]:
# Merge movies and tags data
merged_data = pd.merge(movies_cleaned, tags_cleaned, on='movieId', how='left')
def combine_and_deduplicate(group):
    # Combine all genres and tags for each group (movie)
    all_genres = group['genres'].iloc[0].split('|') if pd.notna(group['genres'].iloc[0]) else []
    all_tags = group['tag'].dropna().tolist()
    combined = list(set(all_genres + all_tags))  # Remove duplicates
    return '|'.join(combined)

# Apply the function to each group
movies_genres_tags = merged_data.groupby('movieId').apply(combine_and_deduplicate).reset_index(name='genres&tags')

# Merge with movies_cleaned to get titles
movies_genres_tags = pd.merge(movies_cleaned[['movieId', 'title']], movies_genres_tags, on='movieId')

print(movies_genres_tags.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                         genres&tags  
0  villian hurts toys|dvdvideo|boy|watched|rivalr...  
1  joe johnston|new home|time travel|animals|film...  
2  fishing|comedy|comedinha de velhinhos engraãƒâ...  
3  comedy|romance|divorce|interracial relationshi...  
4  comedy|aging|pregnancy|diane keaton|sequel fev...  


### Adding relevance scores for each genre/tag to the movies-based dataset

In [54]:
# Merge genome scores and tags
genome_combined = pd.merge(genome_scores_df, genome_tags_df, on='tagId')

# Creating a dictionary for quick relevance score lookup
genome_dict = genome_combined.set_index(['movieId', 'tag'])['relevance'].to_dict()

In [55]:
# Function to find relevance scores using the dictionary
def find_relevance_scores(row):
    genres_tags = row['genres&tags'].split('|')
    scores = [genome_dict.get((row['movieId'], tag), 0) for tag in genres_tags]
    # Filter out scores below 0.6 and corresponding tags/genres
    filtered_data = [(tag, score) for tag, score in zip(genres_tags, scores) if score >= 0.7]
    if filtered_data:
        filtered_tags, filtered_scores = zip(*filtered_data)
        return '|'.join(filtered_tags), ','.join(map(str, filtered_scores))
    else:
        return '', ''

# Apply the function to movies_genres_tags
movies_genres_tags[['genres&tags', 'relevance_scores']] = movies_genres_tags.apply(find_relevance_scores, axis=1, result_type='expand')

# Save the updated dataset in the data folder
movies_genres_tags.to_csv('data/movies_genres&tags.csv', index=False)

## Merging datasets to create users-based dataset, users_liked_genres&tags

In [56]:
# Merge ratings_cleaned with movies_cleaned
user_movies_genres = pd.merge(ratings_cleaned, movies_cleaned, on='movieId')

# Merge the above with tags_cleaned
user_movies_genres_tags = pd.merge(user_movies_genres, tags_cleaned, on=['userId', 'movieId'], how='left')

# Function to concatenate genres and user-specific tags
def concatenate_user_genres_tags(group):
    genres = '|'.join(group['genres'])
    tags = '|'.join(group['tag'].dropna().unique())  # Drop NA and get unique tags
    return genres + ('|' + tags if tags else '')  # Combine genres and tags, separated by '|'

# Aggregate genres and tags for each user
users_liked_genres_tags = user_movies_genres_tags.groupby('userId').apply(concatenate_user_genres_tags).reset_index(name='liked_genres&tags')

users_liked_genres_tags.to_csv('data/users_liked_genres&tags.csv', index=False)

## Finding Users with Bias(Used Later for Shallow Evaluations)

In [57]:
# Function to calculate the comedy bias ratio for a user
def calculate_comedy_bias_ratio(user_genres_tags):
    genres_tags_list = user_genres_tags.split('|')
    genre_tag_counts = pd.Series(genres_tags_list).value_counts()
    comedy_count = genre_tag_counts.get('comedy', 0)
    total_count = genre_tag_counts.sum()
    # Avoid division by zero in case there are no other genres/tags
    bias_ratio = comedy_count / (total_count - comedy_count) if total_count - comedy_count > 0 else 0
    return comedy_count, bias_ratio

# Limit to the first 10,000 users
subset_users = users_liked_genres_tags.head(10000)

# Apply the comedy bias ratio calculation to this subset
subset_comedy_bias = subset_users.set_index('userId')['liked_genres&tags'].apply(calculate_comedy_bias_ratio)

# Create a DataFrame to hold the results for the subset
subset_comedy_bias_df = pd.DataFrame(subset_comedy_bias.tolist(), index=subset_comedy_bias.index, columns=['Comedy Count', 'Comedy Bias Ratio'])

# Sort the users by their comedy bias ratio in descending order
subset_comedy_bias_df = subset_comedy_bias_df.sort_values(by='Comedy Bias Ratio', ascending=False)

# Get the top 5 users with the highest comedy bias
top_5_comedy_bias_users = subset_comedy_bias_df[subset_comedy_bias_df['Comedy Count'] > 0].head(5)
print(top_5_comedy_bias_users)

        Comedy Count  Comedy Bias Ratio
userId                                 
1267               1           1.000000
3195              20           1.000000
1368              21           0.954545
2133              14           0.777778
1369              10           0.769231


## Functions to Calculate the Average for a List Genres/Tags

### Loading glove Embeddings

In [58]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

glove_file_path = 'data/glove.6B.50d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path)

### users_liked_genres&tags embeddings

In [59]:
# Function to calculate the average embedding for genres/tags
def average_embedding_user(genres_tags, embeddings_index):
    embedding_dim = 50
    embeddings = []
    for phrase in genres_tags.split('|'):
        words = phrase.split()
        phrase_embeddings = [embeddings_index.get(word, np.zeros(embedding_dim)) for word in words]
        embeddings.append(np.mean(phrase_embeddings, axis=0) if phrase_embeddings else np.zeros(embedding_dim))
    avg_embedding = np.mean(embeddings, axis=0) if embeddings else np.zeros(embedding_dim)
    return avg_embedding

# Apply the function to each user
users_liked_genres_tags['user_embedding'] = users_liked_genres_tags['liked_genres&tags'].apply(lambda x: average_embedding_user(x, glove_embeddings))
users_liked_genres_tags.to_csv('data/users_liked_genres&tags.csv', index=False)

### movies_genres&tags embeddings

In [60]:
# Function to calculate the weighted average embedding for movies
def average_embedding_movie(genres_tags, relevance_scores, embeddings_index):
    embedding_dim = 50
    genres_tags_list = genres_tags.split('|')
    scores_list = relevance_scores.split(',') if relevance_scores else [1] * len(genres_tags_list)
    weighted_embeddings = []
    for tag, score in zip(genres_tags_list, scores_list):
        words = tag.split()
        phrase_embeddings = [embeddings_index.get(word, np.zeros(embedding_dim)) for word in words]
        avg_phrase_embedding = np.mean(phrase_embeddings, axis=0) if phrase_embeddings else np.zeros(embedding_dim)
        weighted_embeddings.append(avg_phrase_embedding * float(score))
    avg_embedding = np.mean(weighted_embeddings, axis=0) if weighted_embeddings else np.zeros(embedding_dim)
    return avg_embedding

# Apply the function to each movie
movies_genres_tags['movie_embedding'] = movies_genres_tags.apply(lambda x: average_embedding_movie(x['genres&tags'], x['relevance_scores'], glove_embeddings), axis=1)
movies_genres_tags.to_csv('data/movies_genres&tags.csv', index=False)

## Training our KNN Model and Testing with Specific User

In [64]:
# Prepare the data
movie_embeddings = np.stack(movies_genres_tags['movie_embedding'].values)
user_embedding_3195 = users_liked_genres_tags.loc[users_liked_genres_tags['userId'] == 3195, 'user_embedding'].values[0]

# Build the KNN model
# Set KNN to 10,000 neighbors
knn = NearestNeighbors(n_neighbors=10000, metric='cosine')
knn.fit(movie_embeddings)

# Query the model for user 3195
_, indices_3195 = knn.kneighbors([user_embedding_3195])

## Evaluation

In [66]:

# Get the recommended movie IDs for different values of k
recommended_ids_at_k = {
    '10': set(movies_genres_tags.iloc[indices_3195[0][:10]]['movieId'].values),
    '100': set(movies_genres_tags.iloc[indices_3195[0][:100]]['movieId'].values),
    '1000': set(movies_genres_tags.iloc[indices_3195[0][:1000]]['movieId'].values),
    '10000': set(movies_genres_tags.iloc[indices_3195[0]]['movieId'].values)
}

# User 3195's originally liked movies
user_3195_liked_movies = set(ratings_cleaned[ratings_cleaned['userId'] == 3195]['movieId'].values)

# Calculate the total number of movies user 3195 has watched and liked
total_watched_by_3195 = len(user_3195_liked_movies)
print(f"Total movies watched and liked by User 3195: {total_watched_by_3195}")

# Calculate hits@k
hits_at_k = {k: len(user_3195_liked_movies & recommended_ids) for k, recommended_ids in recommended_ids_at_k.items()}

# Print hits@k results
print("Hits@k for User 3195:")
for k, hits in hits_at_k.items():
    print(f"hits@{k}: {hits}")

# Print the top 10 recommended movies
top_10_recommended_titles = movies_genres_tags[movies_genres_tags['movieId'].isin(recommended_ids_at_k['10'])]['title'].values
print("\nTop 10 movie recommendations for User 3195:")
for i, title in enumerate(top_10_recommended_titles, start=1):
    print(f"{i}. {title}")

Total movies watched and liked by User 3195: 20
Hits@k for User 3195:
hits@10: 0
hits@100: 0
hits@1000: 3
hits@10000: 19

Top 10 movie recommendations for User 3195:
1. Blast from the Past (1999)
2. Mr. Saturday Night (1992)
3. Punchline (1988)
4. Head Over Heels (2001)
5. America's Sweethearts (2001)
6. Guarding Tess (1994)
7. Bachelor Mother (1939)
8. Roxanne (1987)
9. More the Merrier, The (1943)
10. $5 a Day (2008)


## Old Embedding and KNN implementations

In [61]:
'''
embedding_dim = 50

def average_embedding_user(words_to_embed, glove_embeddings, embedding_dim):
    return np.mean([glove_embeddings.get(word,np.zeros(embedding_dim)) for word in words_to_embed])

def average_embedding_movie(words_to_embed, glove_embeddings, embedding_dim, relevance_scores):

    embeddings = []

    for i, word in enumerate(words_to_embed):
        for j in range(len(relevance_scores[i])):
            embeddings.append(relevance_scores[i][j] * glove_embeddings.get(word, np.zeros(embedding_dim)))

    avg_embedding = np.mean(embeddings, axis=0)
    return avg_embedding


#creates 2d array for multiplying releavance scores with embedding
def unpacking_relevance_score(relevance_scores):

    res = []
  
    for total_score in relevance_scores:
        if total_score == '':
            res.append([])
            continue
        total_score =  total_score.split(',')
        total_score = np.array([float(single_score)for single_score in total_score])
        res.append(total_score)
    return res

relevance_scores = unpacking_relevance_score(movies_genres_tags['relevance_scores'].values)

# Apply the function to calculate average embeddings for each movie
movies_genres_tags['movie_embedding'] = movies_genres_tags['genres&tags'].apply(
    lambda x: average_embedding_movie(x.split('|'), glove_embeddings, embedding_dim, relevance_scores)
)

users_liked_genres_tags['user_embedding'] = users_liked_genres_tags['liked_genres&tags'].apply(
    lambda x: average_embedding_user(x.split('|'), glove_embeddings, embedding_dim)
)

users_liked_genres_tags.to_csv('data/users_liked_genres&tags.csv', index=False)
movies_genres_tags.to_csv('data/movies_genres&tags.csv', index=False)
ratings_over_4.to_csv('data/ratings_over_4.csv', index=False)
'''


"\nembedding_dim = 50\n\ndef average_embedding_user(words_to_embed, glove_embeddings, embedding_dim):\n    return np.mean([glove_embeddings.get(word,np.zeros(embedding_dim)) for word in words_to_embed])\n\ndef average_embedding_movie(words_to_embed, glove_embeddings, embedding_dim, relevance_scores):\n\n    embeddings = []\n\n    for i, word in enumerate(words_to_embed):\n        for j in range(len(relevance_scores[i])):\n            embeddings.append(relevance_scores[i][j] * glove_embeddings.get(word, np.zeros(embedding_dim)))\n\n    avg_embedding = np.mean(embeddings, axis=0)\n    return avg_embedding\n\n\n#creates 2d array for multiplying releavance scores with embedding\ndef unpacking_relevance_score(relevance_scores):\n\n    res = []\n  \n    for total_score in relevance_scores:\n        if total_score == '':\n            res.append([])\n            continue\n        total_score =  total_score.split(',')\n        total_score = np.array([float(single_score)for single_score in total

In [63]:
'''
sample_movies = np.array(movies_genres_tags['title'].values)

# Stack movie embeddings into a 2D array
X_sample = np.vstack(movies_genres_tags['movie_embedding'].values)

user_movie_ids = ratings_over_4.groupby('userId')['movieId'].unique()

#now we need to use the movieID to reference "movie_embedding" in my "movies_generes_tags",
user_embeddings = []
user_movies = []
for movie_ids in user_movie_ids:
    movie_embeddings = [movies_genres_tags.loc[movies_genres_tags['movieId'] == movie_id, 'movie_embedding'].values[0]
                        for movie_id in movie_ids]
    user_embedding = np.mean(movie_embeddings, axis=0)
    user_embeddings.append(user_embedding)

    movie_names = [movies_genres_tags.loc[movies_genres_tags['movieId'] == movie_id, 'title'].values[0]
                   for movie_id in movie_ids]
    user_movies.append(movie_names)

X_user = np.array(user_embeddings)



# Get user embeddings

# Create and fit the model with cosine similarity using 4 closest movies
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(X_sample)


 #Find 4 movies similar to the query movie
recommendations = []
for test in X_user:
    distances, indices = knn.kneighbors([test])

    # Filter out movies that are already liked by the user
    similar_movies = [sample_movies[i] for i in indices[0] if sample_movies[i] not in user_movies]
    recommendations.append(similar_movies)

# Print the similar movies
for i, user_movie in enumerate(user_movies):
    print(f"Based on {user_movie}, we would recommend:")
    for recommended_movie in recommendations[i]:
        print(recommended_movie)
    print()
'''

'\nsample_movies = np.array(movies_genres_tags[\'title\'].values)\n\n# Stack movie embeddings into a 2D array\nX_sample = np.vstack(movies_genres_tags[\'movie_embedding\'].values)\n\nuser_movie_ids = ratings_over_4.groupby(\'userId\')[\'movieId\'].unique()\n\n#now we need to use the movieID to reference "movie_embedding" in my "movies_generes_tags",\nuser_embeddings = []\nuser_movies = []\nfor movie_ids in user_movie_ids:\n    movie_embeddings = [movies_genres_tags.loc[movies_genres_tags[\'movieId\'] == movie_id, \'movie_embedding\'].values[0]\n                        for movie_id in movie_ids]\n    user_embedding = np.mean(movie_embeddings, axis=0)\n    user_embeddings.append(user_embedding)\n\n    movie_names = [movies_genres_tags.loc[movies_genres_tags[\'movieId\'] == movie_id, \'title\'].values[0]\n                   for movie_id in movie_ids]\n    user_movies.append(movie_names)\n\nX_user = np.array(user_embeddings)\n\n\n\n# Get user embeddings\n\n# Create and fit the model with c