In [1]:
"""
Movie recommendation system based on genres.
"""

'\nMovie recommendation system based on genres.\n'

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Load the movie and rating tables.
# NOTE: the paths point at the author's local copy of the data; adjust them to
# wherever the CSV files live on your machine.
# Both paths are raw strings so the Windows backslashes are kept literally and
# are never interpreted as escape sequences.
movies = pd.read_csv(r"C:\Bakalaurs_praktiskais\Bakalaura-darbs\movies.csv")
ratings = pd.read_csv(r"C:\Bakalaurs_praktiskais\Bakalaura-darbs\ratings.csv")

In [4]:
# Data preprocessing: lower-case the genre labels and replace the '|' separators
# with spaces so every genre becomes a whitespace-delimited token.
# fillna('') turns a missing genre entry into an empty document instead of
# raising, and regex=False makes '|' a literal character rather than a regex
# alternation.
movies['genres'] = (
    movies['genres']
    .fillna('')
    .str.lower()
    .str.replace('|', ' ', regex=False)
)

In [5]:
# Learn the genre vocabulary and IDF weights from the preprocessed genre strings,
# then encode every movie as one row of the TF-IDF matrix.
vectorizer = TfidfVectorizer()
vectorizer.fit(movies['genres'])
tfidf_matrix = vectorizer.transform(movies['genres'])

In [6]:
# Pairwise cosine similarity between all movie vectors; with no second argument
# the matrix is compared against itself, giving one row and one column per movie.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

In [7]:
def find_movie_index(movie_title, year):
    """Return the index label of the first movie in ``movies`` matching the query.

    Parameters
    ----------
    movie_title : str
        Movie title without the trailing "(YYYY)" suffix.
    year : int or None
        Release year. When given, the lookup is an exact match against
        ``"<movie_title> (<year>)"``. When ``None``, the first title that
        contains ``movie_title`` as a literal, case-sensitive substring is used.

    Returns
    -------
    The index label (from ``movies.index``) of the first matching row, or
    ``None`` when nothing matches.
    """
    titles = movies['title']
    if year is not None:
        matches = titles == f"{movie_title} ({year})"
    else:
        # regex=False: titles often contain regex metacharacters such as
        # parentheses or '?', which must be matched literally.
        # na=False: a missing title simply counts as "no match".
        matches = titles.str.contains(movie_title, regex=False, na=False)

    hits = movies.index[matches.to_numpy()]
    return hits[0] if len(hits) else None

In [8]:
def recommend_movies(movie_title, year=None, n_recommendations=5):
    """Recommend the movies whose genre vectors are most similar to a seed movie.

    Parameters
    ----------
    movie_title : str
        Title of the seed movie, passed on to ``find_movie_index``.
    year : int or None, optional
        Release year of the seed movie, passed on to ``find_movie_index``.
    n_recommendations : int, optional
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame or None
        Rows of ``movies`` ordered by decreasing cosine similarity to the seed
        (ties keep their original row order); the seed itself is never
        included. ``None`` (after printing a message) if the seed movie cannot
        be found.

    Notes
    -----
    The value returned by ``find_movie_index`` is used as a row position in
    both ``cosine_sim_matrix`` and ``movies``, so ``movies`` is assumed to
    carry its default 0..n-1 index.
    """
    movie_index = find_movie_index(movie_title, year)
    if movie_index is None:
        movie_not_found_message = f"Movie  '{movie_title}' not found in dataset."
        print(movie_not_found_message)
        return None

    scores = np.asarray(cosine_sim_matrix[movie_index]).ravel()
    # A stable sort on the negated scores ranks by decreasing similarity while
    # keeping ties in row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the seed explicitly: other movies with identical genres also score
    # 1.0, so the seed is not guaranteed to be the first entry of the ranking.
    ranked = ranked[ranked != movie_index][:max(n_recommendations, 0)]
    return movies.iloc[ranked]

In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score
import random

def _split_title_year(full_title):
    """Split ``"Title (YYYY)"`` into ``("Title", YYYY)``.

    Returns ``(full_title, None)`` unchanged when the title does not end with a
    four-digit year in parentheses.
    """
    stripped = full_title.strip()
    year_digits = stripped[-5:-1]
    has_year_suffix = (
        len(stripped) > 6
        and stripped.endswith(')')
        and stripped[-6] == '('
        and year_digits.isdecimal()
    )
    if not has_year_suffix:
        return full_title, None
    return stripped[:-6].rstrip(), int(year_digits)


def evaluate_genre_recommender(movies, ratings, recommend_function, n_recommendations=5, top_n_users=20,
                               like_threshold=3.5, random_state=None):
    """Evaluate a recommender against the liked movies of the most active users.

    For each of the ``top_n_users`` users with the most ratings, one liked
    movie is drawn at random as the seed, recommendations are requested for
    it, and they are compared with the user's other liked movies.

    Parameters
    ----------
    movies : pandas.DataFrame
        Needs at least the columns ``movieId`` and ``title``.
    ratings : pandas.DataFrame
        Needs at least the columns ``userId``, ``movieId`` and ``rating``.
    recommend_function : callable
        Called as ``recommend_function(title, year, n_recommendations=...)``;
        must return a DataFrame with a ``title`` column, or ``None`` when the
        seed movie cannot be resolved.
    n_recommendations : int, optional
        Number of recommendations requested per user.
    top_n_users : int, optional
        Number of most active users to evaluate.
    like_threshold : float, optional
        Minimum rating for a movie to count as liked.
    random_state : int or None, optional
        Seed for choosing the seed movies; pass an int for reproducible runs.

    Returns
    -------
    dict
        ``{"n_users", "precision", "recall", "f1"}`` holding the number of
        evaluated users and the metric means (NaN when no user was evaluated).
        The same figures are printed.
    """
    # Join the ratings with the movie metadata.
    data = pd.merge(ratings, movies, on='movieId')

    # Keep only the most active users.
    top_users = data['userId'].value_counts().head(top_n_users).index.tolist()

    # One shared generator so a fixed random_state reproduces the whole run.
    rng = np.random.RandomState(random_state)

    all_precisions, all_recalls, all_f1s = [], [], []

    for user_id in top_users:
        user_ratings = data[data['userId'] == user_id]

        # Movies rated at or above the threshold count as liked.
        liked_movies = user_ratings[user_ratings['rating'] >= like_threshold]

        # Need the seed plus at least one other liked movie to compare against.
        if len(liked_movies) < 2:
            continue

        # Randomly pick one liked movie as the seed.
        seed_title = liked_movies.sample(1, random_state=rng).iloc[0]['title']

        # The recommender expects the title and the year separately.
        title, year = _split_title_year(seed_title)

        recs = recommend_function(title, year, n_recommendations=n_recommendations)
        if recs is None:
            continue

        recommended_titles = set(recs['title'].tolist())
        actual_liked_titles = set(liked_movies['title'].tolist()) - {seed_title}

        # Nothing recommended or nothing left to compare with: metrics undefined.
        if not recommended_titles or not actual_liked_titles:
            continue

        hits = len(recommended_titles & actual_liked_titles)

        # Precision = hits / number of recommended movies.
        precision = hits / len(recommended_titles)

        # Recall = hits / number of movies the user liked (excluding the seed).
        recall = hits / len(actual_liked_titles)

        # F1 = 2 * (Precision * Recall) / (Precision + Recall), 0 when there are no hits.
        f1 = 2 * precision * recall / (precision + recall) if hits else 0.0

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

    def _mean(values):
        # Avoid numpy's empty-slice warning when no user could be evaluated.
        return float(np.mean(values)) if values else float('nan')

    mean_precision = _mean(all_precisions)
    mean_recall = _mean(all_recalls)
    mean_f1 = _mean(all_f1s)

    print("\n--- Novērtējums ---")
    print(f"Novērtēto lietotāju skaits: {len(all_precisions)}")
    print(f"Vidējā Precizitāte@{n_recommendations}: {mean_precision:.4f}")
    print(f"Videājais Atsaukums@{n_recommendations}: {mean_recall:.4f}")
    print(f"Vidējais F1@{n_recommendations}: {mean_f1:.4f}")

    return {
        "n_users": len(all_precisions),
        "precision": mean_precision,
        "recall": mean_recall,
        "f1": mean_f1,
    }

In [None]:
if __name__ == "__main__":
    # Demo query: the seed movie and how many neighbours to show.
    movie_title = "Toy Story"
    year = 1995
    n_recommendations = 10

    print(f"Similar recommendations for movie '{movie_title}':")
    recommendations = recommend_movies(
        movie_title,
        year=year,
        n_recommendations=n_recommendations,
    )
    # recommend_movies returns None when the seed movie is unknown.
    if recommendations is not None:
        print(recommendations)

    # Run the evaluation over the ten most active users.
    evaluate_genre_recommender(
        movies=movies,
        ratings=ratings,
        recommend_function=recommend_movies,
        n_recommendations=10,
        top_n_users=10,
    )

Similar recommendations for movie 'Toy Story':
      movieId                                              title  \
1706     2294                                        Antz (1998)   
2355     3114                                 Toy Story 2 (1999)   
2809     3754     Adventures of Rocky and Bullwinkle, The (2000)   
3000     4016                   Emperor's New Groove, The (2000)   
3568     4886                              Monsters, Inc. (2001)   
6194    45074                                   Wild, The (2006)   
6486    53121                             Shrek the Third (2007)   
6948    65577                     Tale of Despereaux, The (2008)   
7760    91355  Asterix and the Vikings (Astérix et les Viking...   
8219   103755                                       Turbo (2013)   

                                           genres  
1706  Adventure|Animation|Children|Comedy|Fantasy  
2355  Adventure|Animation|Children|Comedy|Fantasy  
2809  Adventure|Animation|Children|Comedy|Fantas