In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

In [2]:
movies = pd.read_csv('/home/pokji/vscode-projects/uni/information_search/data/movieapp_movie.csv')
movies.head(2)

Unnamed: 0,id,movie_id,languages,releaseDate,directors,runtime,title,mpaa,actors,originalTitle,genres,plotSummary,avgRating,releaseYear,poster_path
0,45844,1,English,1995.0,John Lasseter,81,Toy Story,G,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Toy Story,"Animation, Comedy, Family","Led by Woody, Andy's toys live happily in his ...",3.9,1995.0,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,45845,2,"English, Français",1995.0,Joe Johnston,104,Jumanji,PG,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Jumanji,"Adventure, Fantasy, Family",When siblings Judy and Peter discover an encha...,3.2,1995.0,/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg


In [30]:
# Convert the string list columns from string to pure list
movies['actors'] = movies['actors'].apply(lambda x: x.split(','))

movies['directors'] = movies['directors'].apply(lambda x: x.split(','))

movies['languages'] = movies['languages'].apply(lambda x: x.split(','))

movies['genres'] = movies['genres'].fillna('')
movies['genres'] = movies['genres'].apply(lambda x: x.split(','))

### Handle NaN values

In [31]:
# Fill the NaN in releaseYear by taking the median
median_year = movies['releaseYear'].median()
movies['releaseYear'] = movies['releaseYear'].fillna(median_year)

In [32]:
# Map G to PG, Eveything else to R
mpaa_map = {
    'G': 'PG',
    'PG': 'PG'
}

movies['mpaa'] = movies['mpaa'].map(lambda x: mpaa_map.get(x, 'R'))

In [33]:
# Fill the NaN based on Genre
def fast_mpaa_impute(row):
    if pd.isna(row['mpaa']) or row['mpaa'] == '':
        genres = set([g.strip().lower() for g in row['genres']])
        if 'family' in genres or 'animation' in genres:
            return 'PG'
        elif 'adventure' in genres or 'comedy' in genres:
            return 'PG'
        else:
            return 'R'
    return row['mpaa']

movies['mpaa'] = movies.apply(fast_mpaa_impute, axis=1)

### Keep only the most popular actors

In [34]:
# Count each actor occurences
actors_occurences = []
for actor in movies['actors']:
    # actors_occurences.extend(actor)
    actors_occurences.extend(actor)

# Drop out every actor with less than 30 occurences
counts = Counter(actors_occurences)
counts = pd.DataFrame(counts.items(), columns=['actors', 'count'])
counts.sort_values(by='count', ascending=False, inplace=True)
counts.drop(counts[counts['count'] < 20].index, inplace=True)  

# Drop the [''] and Jr. columns
counts.drop(counts.index[[0, 1]], inplace=True)

# Actors to keep
popular_actors = counts['actors'].tolist()

# Keep only the most popular actors in the movies dataframe
movies['actors'] = movies['actors'].apply(lambda actor_list: [a for a in actor_list if a in popular_actors])

### Keep only the most popular directors

In [35]:
# Count director occurrences
director_counts = Counter([d for director_list in movies['directors'] for d in director_list])
popular_directors = [d for d, count in director_counts.items() if count > 1]

# Keep only popular directors in the dataframe
movies['directors'] = movies['directors'].apply(lambda dlist: [d for d in dlist if d in popular_directors])

### One-hot encode the list columns

In [37]:
from sklearn.preprocessing import MultiLabelBinarizer, normalize

# One-hot encode each list column
mlb_actors = MultiLabelBinarizer()
actors_ohe = mlb_actors.fit_transform(movies['actors'])

mlb_directors = MultiLabelBinarizer()
directors_ohe = mlb_directors.fit_transform(movies['directors'])

mlb_languages = MultiLabelBinarizer()
languages_ohe = mlb_languages.fit_transform(movies['languages'])

mlb_genres = MultiLabelBinarizer()
genres_ohe = mlb_genres.fit_transform(movies['genres'])

mlb_mpaa = MultiLabelBinarizer()
mpaa_ohe = mlb_mpaa.fit_transform(movies['mpaa'])

# L2 normalize each one-hot encoded matrix (row-wise)
actors_ohe_norm = normalize(actors_ohe, norm='l2', axis=1)
directors_ohe_norm = normalize(directors_ohe, norm='l2', axis=1)
languages_ohe_norm = normalize(languages_ohe, norm='l2', axis=1)
genres_ohe_norm = normalize(genres_ohe, norm='l2', axis=1)

### Scale the numerical columns

In [38]:
from sklearn.preprocessing import MinMaxScaler

numerical_columns = movies[['releaseYear', 'runtime']]
numerical_columns

# Scale to [0, 1]
scaler = MinMaxScaler()
num_features_scaled = scaler.fit_transform(numerical_columns)

### Concatenate all features

In [39]:
# Concatenate with your normalized one-hot features
all_features = np.hstack([
    actors_ohe_norm,
    directors_ohe_norm,
    languages_ohe_norm,
    genres_ohe_norm,
    mpaa_ohe,
    num_features_scaled
])

In [40]:
np.save('all_features.npy', all_features)

### Compute cosine similarity

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

movie_idx = 24
movie_name = movies.iloc[movie_idx]['title']

target_vector = all_features[movie_idx].reshape(1, -1)

similarities = cosine_similarity(target_vector, all_features).flatten()

# Get top 5 most similar movies
most_similar_indices = similarities.argsort()[::-1][1: 6] 

### Recommend movies

In [42]:
# Take into account the rating of the movie
ratings = movies.iloc[most_similar_indices]['avgRating']
ratings_scaled = MinMaxScaler().fit_transform(ratings.values.reshape(-1, 1)).flatten()

# Combine similarity and rating
sim_weight = 0.7
rating_weight = 0.3
combined_score = (similarities[most_similar_indices] * sim_weight) + (ratings_scaled * rating_weight)

sorted_idx = np.argsort(combined_score)[::-1]

recommended = movies.iloc[most_similar_indices].iloc[sorted_idx]
print(recommended[['title', 'releaseYear', 'avgRating']])

                 title  releaseYear  avgRating
2523          Get Real       1998.0        3.8
32696   Get Your Stuff       2000.0        3.8
3485   Defying Gravity       1999.0        3.4
494          Mr. Jones       1993.0        2.8
15743       Like It Is       1998.0        1.8
