In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import datetime

In [3]:
# Cell 2: Load data
movies = pd.read_csv('../backend/data/processed/filtered_movies.csv')
ratings = pd.read_csv('../backend/data/processed/filtered_ratings.csv')

In [4]:
# Cell 3: Create features
def create_features(movies, ratings):
    # Filtrera movies för att endast inkludera filmer med betyg
    rated_movie_ids = set(ratings['movieId'].unique())
    movies_filtered = movies[movies['movieId'].isin(rated_movie_ids)]

    movies_filtered['year'] = movies_filtered['title'].str.extract('(\d{4})').astype(float)
    
    movie_stats = ratings.groupby('movieId').agg({'rating': ['count', 'mean']})
    movie_stats.columns = ['rating_count', 'rating_mean']
    
    genre_dummies = movies_filtered['genres'].str.get_dummies(sep='|')
    
    features = pd.concat([movies_filtered[['movieId', 'year']], movie_stats, genre_dummies], axis=1)
    
    current_year = datetime.now().year
    features['movie_age'] = current_year - features['year']
    
    return features

# Använd funktionen för att skapa features
movie_features = create_features(movies, ratings)

# Denna funktion skapar features för varje film, inklusive år, betygsstatistik,
# genre-information och filmens ålder. Dessa features används senare i rekommendationssystemet.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_filtered['year'] = movies_filtered['title'].str.extract('(\d{4})').astype(float)


In [5]:
# Cell 4: Preprocess data
scaler = MinMaxScaler()
numeric_features = ['year', 'rating_count', 'rating_mean', 'movie_age']
movie_features[numeric_features] = scaler.fit_transform(movie_features[numeric_features])

movie_features_indexed = movie_features.set_index('movieId').fillna(0)


In [6]:
# Cell 5: Implement item-based collaborative filtering
def item_similarity(movie_features):
    return cosine_similarity(movie_features)

item_sim = item_similarity(movie_features_indexed.drop(['year'], axis=1))


In [7]:
# Cell 6: Implement content-based filtering
def content_similarity(movie_features):
    return cosine_similarity(movie_features)

content_sim = content_similarity(movie_features_indexed.drop(['rating_count', 'rating_mean'], axis=1))


In [8]:
# Cell 7: Create user-item matrix
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print("user_item_matrix shape:", user_item_matrix.shape)

user_item_matrix shape: (610, 2269)


In [9]:
# Cell 8: Implement hybrid similarity
def hybrid_similarity(item_sim, content_sim, alpha=0.5):
    hybrid = alpha * item_sim + (1 - alpha) * content_sim
    return pd.DataFrame(hybrid, index=movie_features_indexed.index.astype('int64'), columns=movie_features_indexed.index.astype('int64'))

movie_features_indexed = movie_features_indexed.loc[np.isfinite(movie_features_indexed.index)]
movie_features_indexed.index = movie_features_indexed.index.astype('int64')

item_sim = item_similarity(movie_features_indexed.drop(['year'], axis=1))
content_sim = content_similarity(movie_features_indexed.drop(['rating_count', 'rating_mean'], axis=1))

hybrid_sim = hybrid_similarity(item_sim, content_sim)

common_movies = list(set(user_item_matrix.columns) & set(hybrid_sim.index))
hybrid_sim = hybrid_sim.loc[common_movies, common_movies]
user_item_matrix = user_item_matrix[common_movies]

print("hybrid_sim shape after alignment:", hybrid_sim.shape)
print("user_item_matrix shape after alignment:", user_item_matrix.shape)

print("\nuser_item_matrix index type:", user_item_matrix.columns.dtype)
print("hybrid_sim index type:", hybrid_sim.index.dtype)

hybrid_sim shape after alignment: (2235, 2235)
user_item_matrix shape after alignment: (610, 2235)

user_item_matrix index type: int64
hybrid_sim index type: int64


In [10]:
def predict_rating(user_id, movie_id, user_item_matrix, hybrid_sim):
    if movie_id not in user_item_matrix.columns:
        return None
    
    user_ratings = user_item_matrix.loc[user_id]
    sim_scores = hybrid_sim.loc[movie_id]
    
    relevant_sims = sim_scores[sim_scores > 0]
    relevant_ratings = user_ratings[user_ratings > 0]
    
    common_movies = list(set(relevant_sims.index) & set(relevant_ratings.index))
    
    if len(common_movies) == 0:
        return None
    
    weighted_sum = np.sum(relevant_ratings.loc[common_movies] * relevant_sims.loc[common_movies])
    sim_sum = np.sum(relevant_sims.loc[common_movies])
    
    if sim_sum == 0:
        return None
    
    predicted_rating = weighted_sum / sim_sum
    
    user_mean = np.mean(relevant_ratings)
    user_std = np.std(relevant_ratings)
    
    adjusted_prediction = user_mean + (predicted_rating - user_mean) * 1.5
    
    random_factor = np.random.normal(0, user_std * 0.2)
    final_prediction = adjusted_prediction + random_factor
    
    final_prediction = max(0.5, min(5, final_prediction))
    
    return final_prediction

In [11]:
# Kontrollera item_sim och content_sim
print("Item-based similarity matrix stats:")
print(f"Mean: {np.mean(item_sim):.4f}, Min: {np.min(item_sim):.4f}, Max: {np.max(item_sim):.4f}")

print("\nContent-based similarity matrix stats:")
print(f"Mean: {np.mean(content_sim):.4f}, Min: {np.min(content_sim):.4f}, Max: {np.max(content_sim):.4f}")

# Kontrollera hybrid_sim
print("\nHybrid similarity matrix stats:")
print(f"Mean: {np.mean(hybrid_sim.values):.4f}, Min: {np.min(hybrid_sim.values):.4f}, Max: {np.max(hybrid_sim.values):.4f}")

Item-based similarity matrix stats:
Mean: 0.3591, Min: 0.0000, Max: 1.0000

Content-based similarity matrix stats:
Mean: 0.3941, Min: 0.0000, Max: 1.0000

Hybrid similarity matrix stats:
Mean: 0.3766, Min: 0.0000, Max: 1.0000


In [12]:
def get_top_recommendations(user_id, user_item_matrix, sim_matrix, n=10):
    user_ratings = user_item_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index
    
    recommendations = []
    for movie_id in unrated_movies:
        if movie_id in sim_matrix.index:
            predicted_rating = predict_rating(user_id, movie_id, user_item_matrix, sim_matrix)
            if predicted_rating is not None:
                recommendations.append((movie_id, predicted_rating))
    
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations[:n]

In [13]:
# Cell 11: Test the recommendation system
test_user = user_item_matrix.index[0]
recommendations = get_top_recommendations(test_user, user_item_matrix, hybrid_sim)

print(f"Top 10 movie recommendations for user {test_user}:")
for movie_id, predicted_rating in recommendations:
    movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
    print(f"{movie_title} (Predicted Rating: {predicted_rating:.2f})")

Top 10 movie recommendations for user 1:
20,000 Leagues Under the Sea (1954) (Predicted Rating: 5.00)
Coco (2017) (Predicted Rating: 4.97)
Hero (Ying xiong) (2002) (Predicted Rating: 4.96)
Simple Plan, A (1998) (Predicted Rating: 4.92)
When a Man Loves a Woman (1994) (Predicted Rating: 4.90)
Aristocats, The (1970) (Predicted Rating: 4.88)
Before Sunset (2004) (Predicted Rating: 4.88)
Whiplash (2014) (Predicted Rating: 4.87)
Forgetting Sarah Marshall (2008) (Predicted Rating: 4.84)
Star Trek Beyond (2016) (Predicted Rating: 4.84)


In [14]:
# Cell 12: Final diagnostics
print("Shapes:")
print("movie_features_indexed:", movie_features_indexed.shape)
print("user_item_matrix:", user_item_matrix.shape)
print("hybrid_sim:", hybrid_sim.shape)

print("\nIndex types:")
print("movie_features_indexed:", movie_features_indexed.index.dtype)
print("user_item_matrix columns:", user_item_matrix.columns.dtype)
print("hybrid_sim:", hybrid_sim.index.dtype)

print("\nCommon movies:")
print("Between user_item_matrix and hybrid_sim:", 
      len(set(user_item_matrix.columns) & set(hybrid_sim.index)))

print("\nSample recommendations:")
test_user = user_item_matrix.index[0]
recommendations = get_top_recommendations(test_user, user_item_matrix, hybrid_sim, n=5)
for movie_id, predicted_rating in recommendations:
    movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
    print(f"{movie_title} (Predicted Rating: {predicted_rating:.2f})")

Shapes:
movie_features_indexed: (2235, 23)
user_item_matrix: (610, 2235)
hybrid_sim: (2235, 2235)

Index types:
movie_features_indexed: int64
user_item_matrix columns: int64
hybrid_sim: int64

Common movies:
Between user_item_matrix and hybrid_sim: 2235

Sample recommendations:
Clockers (1995) (Predicted Rating: 4.90)
Star Trek: Generations (1994) (Predicted Rating: 4.89)
Kiki's Delivery Service (Majo no takkyûbin) (1989) (Predicted Rating: 4.88)
Cinderella (1950) (Predicted Rating: 4.88)
Bronx Tale, A (1993) (Predicted Rating: 4.87)


In [15]:
# Cell 12: Verify shapes and number of unique movies
print("movie_features_indexed shape:", movie_features_indexed.shape)
print("hybrid_sim shape:", hybrid_sim.shape)
print("user_item_matrix shape:", user_item_matrix.shape)
print("Number of unique movieIds in ratings:", ratings['movieId'].nunique())
print("Number of unique movieIds in movies:", movies['movieId'].nunique())


movie_features_indexed shape: (2235, 23)
hybrid_sim shape: (2235, 2235)
user_item_matrix shape: (610, 2235)
Number of unique movieIds in ratings: 2269
Number of unique movieIds in movies: 9611


In [16]:
# Cell 13: Check for missing movies
ratings_movies = set(ratings['movieId'].unique())
movies_movies = set(movies['movieId'].unique())
missing_movies = ratings_movies - movies_movies
if missing_movies:
    print(f"\nAntal filmer i ratings som inte finns i movies: {len(missing_movies)}")
    print("Exempel på saknade film-IDs:", list(missing_movies)[:5])
else:
    print("\nAlla filmer i ratings finns i movies datasetet.")

print(f"Antal unika filmer i ratings: {len(ratings_movies)}")
print(f"Antal unika filmer i movies: {len(movies_movies)}")



Antal filmer i ratings som inte finns i movies: 34
Exempel på saknade film-IDs: [108932, 60684, 69644, 83349, 91542]
Antal unika filmer i ratings: 2269
Antal unika filmer i movies: 9611


In [17]:
# Cell 14: Identify movies without ratings
movies_without_ratings = set(movies['movieId']) - set(ratings['movieId'])
movies_without_ratings_df = movies[movies['movieId'].isin(movies_without_ratings)]

print("\nFilmer utan betyg:")
print(movies_without_ratings_df[['movieId', 'title', 'genres']])
print(f"\nAntal filmer utan betyg: {len(movies_without_ratings)}")



Filmer utan betyg:
      movieId                                              title   
3           4                           Waiting to Exhale (1995)  \
7           8                                Tom and Huck (1995)   
12         13                                       Balto (1995)   
26         27                                Now and Then (1995)   
29         30  Shanghai Triad (Yao a yao yao dao waipo qiao) ...   
...       ...                                                ...   
9606   193581          Black Butler: Book of the Atlantic (2017)   
9607   193583                       No Game No Life: Zero (2017)   
9608   193585                                       Flint (2017)   
9609   193587                Bungo Stray Dogs: Dead Apple (2018)   
9610   193609                Andrew Dice Clay: Dice Rules (1991)   

                               genres  
3                Comedy|Drama|Romance  
7                  Adventure|Children  
12       Adventure|Animation|Children  
26 

In [18]:
# Cell 15: "Implement strategy for new users
def handle_new_user(movies_df, ratings_df, n=10):
    popular_movies = ratings_df.groupby('movieId').agg({'rating': ['count', 'mean']})
    popular_movies.columns = ['rating_count', 'rating_mean']
    popular_movies = popular_movies.sort_values('rating_count', ascending=False)
    
    top_movies = popular_movies.head(n).index
    recommendations = []
    for movie_id in top_movies:
        title = movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
        recommendations.append((movie_id, title, popular_movies.loc[movie_id, 'rating_mean']))
    
    return recommendations

# Testa funktionen för nya användare
new_user_recommendations = handle_new_user(movies, ratings)
print("Recommendations for new user:")
for movie_id, title, avg_rating in new_user_recommendations:
    print(f"{title} (Average Rating: {avg_rating:.2f})")

Recommendations for new user:
Forrest Gump (1994) (Average Rating: 4.16)
Shawshank Redemption, The (1994) (Average Rating: 4.43)
Pulp Fiction (1994) (Average Rating: 4.20)
Silence of the Lambs, The (1991) (Average Rating: 4.16)
Matrix, The (1999) (Average Rating: 4.19)
Star Wars: Episode IV - A New Hope (1977) (Average Rating: 4.23)
Jurassic Park (1993) (Average Rating: 3.75)
Braveheart (1995) (Average Rating: 4.03)
Terminator 2: Judgment Day (1991) (Average Rating: 3.97)
Schindler's List (1993) (Average Rating: 4.22)
