In [23]:
"""
Movie recommendation system based on genres.
"""

'\nMovie recommendation system based on genres.\n'

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Load the movie and rating tables from CSV.
# (The paths were changed because the author keeps the data locally.)
# Both paths are raw strings so the Windows backslashes stay literal; in a
# plain string "\B" and "\m" are invalid escape sequences and trigger a
# DeprecationWarning / SyntaxWarning on recent Python versions.
movies = pd.read_csv(r"C:\Bakalaurs_praktiskais\Bakalaura-darbs\movies.csv")
ratings = pd.read_csv(r"C:\Bakalaurs_praktiskais\Bakalaura-darbs\ratings.csv")

In [26]:
# Data preprocessing: turn the '|' separators of each genre string into
# spaces and lower-case the text so it can be tokenised as plain words.
movies['genres'] = movies['genres'].map(
    lambda raw_genres: raw_genres.replace('|', ' ').lower()
)

In [27]:
# Create a TF-IDF vectorizer with default settings and fit it on the
# preprocessed genre strings; `tfidf_matrix` holds one row per movie.
# NOTE(review): with the default token pattern, hyphenated genres such as
# "sci-fi" or "film-noir" are presumably split into two separate tokens —
# confirm whether that is intended.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(movies['genres'])

In [28]:
# Pairwise cosine similarity between the genre vectors of all movies.
# With a single argument the rows of the matrix are compared with each other,
# which is the same computation as passing the matrix twice.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

In [29]:
def find_movie_index(movie_title, year):
    """Return the index label of the first row of ``movies`` matching a title.

    Args:
        movie_title: Title text. When ``year`` is given this is the title
            without the trailing ``" (YYYY)"``; otherwise it is searched for
            as a literal, case-sensitive substring of the ``title`` column.
        year: Release year used to build the exact ``"Title (year)"`` string,
            or ``None`` to fall back to the substring search.

    Returns:
        The index label of the first matching row of the module-level
        ``movies`` frame, or ``None`` when no row matches.
    """
    titles = movies['title']

    if year is not None:
        # Exact comparison against the "Title (Year)" form stored in the data.
        matches = titles == f"{movie_title} ({year})"
    else:
        # regex=False: titles routinely contain regex metacharacters
        # ("(", ")", "?", "*", "+"), which as a pattern would either match the
        # wrong rows or raise. na=False makes missing titles count as
        # non-matching instead of propagating NaN into the mask.
        matches = titles.str.contains(movie_title, regex=False, na=False)

    if not matches.any():
        return None
    return movies.index[matches.values][0]

In [30]:
def recommend_movies(movie_title, year=None, n_recommendations=5):
    """Recommend the movies whose genre vectors are most similar to a movie.

    Args:
        movie_title: Title forwarded to ``find_movie_index``.
        year: Optional release year forwarded to ``find_movie_index``.
        n_recommendations: Maximum number of movies to return.

    Returns:
        Rows of the module-level ``movies`` frame ordered by decreasing
        cosine similarity to the query movie (the query movie itself is never
        included), or ``None`` after printing a message when the movie cannot
        be found.
    """
    movie_index = find_movie_index(movie_title, year)
    if movie_index is None:
        print(f"Movie  '{movie_title}' not found in dataset.")
        return None

    # NOTE(review): movie_index is used as a row *position* here and in
    # movies.iloc below, which assumes `movies` keeps its default 0..n-1 index.
    scores = np.asarray(cosine_sim_matrix[movie_index]).ravel()

    # Stable sort on the negated scores: descending similarity, with ties kept
    # in their original row order.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query movie explicitly. Many movies share an identical genre
    # vector (similarity 1.0), so the query movie is not guaranteed to sort
    # first and simply skipping position 0 can recommend the movie to itself.
    ranked = ranked[ranked != movie_index]

    top_indices = ranked[:max(n_recommendations, 0)]
    return movies.iloc[top_indices]

In [31]:
import numpy as np
import pandas as pd

def _split_title_year(full_title):
    """Split ``"Name (YYYY)"`` into ``("Name", YYYY)``.

    Returns ``(full_title, None)`` when the text does not end with a space
    followed by a four-digit year in parentheses, so that whenever a year is
    returned ``f"{name} ({year})"`` reproduces the input exactly.
    """
    if isinstance(full_title, str) and full_title.endswith(')'):
        name, sep, tail = full_title.rpartition(' (')
        year_text = tail[:-1]
        if (sep and len(year_text) == 4 and year_text.isdecimal()
                and str(int(year_text)) == year_text):
            return name, int(year_text)
    return full_title, None


def _mean_or_nan(values):
    """Return ``np.mean(values)``, or NaN (without a warning) for an empty list."""
    return np.mean(values) if values else np.nan


def evaluate(movies, ratings, recommend_function, n_recommendations=5, top_n_users=20,
             like_threshold=3.5, min_liked=5, random_state=None):
    """Evaluate a recommender with precision, recall and F1 on the most active users.

    For each of the ``top_n_users`` users with the most ratings, one liked
    movie is drawn at random as the seed, recommendations are requested for
    it, and the recommended titles are compared with the user's other liked
    titles. A summary and a per-user table are printed.

    Args:
        movies: Frame with at least ``movieId`` and ``title`` columns.
        ratings: Frame with at least ``userId``, ``movieId`` and ``rating``.
        recommend_function: Callable ``(title, year, n_recommendations=...)``
            returning a frame with a ``title`` column, or ``None`` when the
            seed movie cannot be found.
        n_recommendations: Number of recommendations requested per user.
        top_n_users: Number of most active users to evaluate.
        like_threshold: Minimum rating for a movie to count as liked.
        min_liked: Users with fewer liked movies than this are skipped.
        random_state: ``None`` (use NumPy's global random state), an integer
            seed, or a NumPy random state object; makes the seed-movie draw
            reproducible.

    Returns:
        Dict with the mean ``precision``, ``recall`` and ``f1`` (NaN when no
        user could be evaluated) and the per-user records under
        ``'lietotāja_metrikas'``.
    """
    # An integer seed is turned into a single generator so that successive
    # users get different, yet reproducible, draws.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Join the ratings with the movie metadata.
    data = pd.merge(ratings, movies, on='movieId')
    # Keep only the most active users.
    top_users = data['userId'].value_counts().head(top_n_users).index.tolist()
    # Accumulators for the metrics.
    all_precisions, all_recalls, all_f1s = [], [], []
    user_metrics = []

    for user_id in top_users:
        user_ratings = data[data['userId'] == user_id]
        # Ratings at or above the threshold count as positively rated movies.
        liked_movies = user_ratings[user_ratings['rating'] >= like_threshold]
        # Skip users who liked too few movies.
        if len(liked_movies) < min_liked:
            continue

        # Pick one positively rated movie at random as the seed.
        seed_row = liked_movies.sample(1, random_state=random_state).iloc[0]
        seed_title = seed_row['title']
        # Separate the "Title (Year)" form into title and year; titles that do
        # not follow that form are passed on unchanged with year=None.
        title, year = _split_title_year(seed_title)

        # Generate recommendations based on the seed movie.
        recs = recommend_function(title, year, n_recommendations=n_recommendations)
        if recs is None:
            continue

        # Titles returned by the recommender versus the user's other liked titles.
        recommended_titles = set(recs['title'].tolist())
        actual_liked_titles = set(liked_movies['title'].tolist()) - {seed_title}
        # Recommended movies that the user actually liked.
        hits = len(recommended_titles & actual_liked_titles)
        # Precision, recall and F1 for this user.
        precision = hits / len(recommended_titles) if recommended_titles else 0
        recall = hits / len(actual_liked_titles) if actual_liked_titles else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

        user_metrics.append({
            'userId': user_id,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'relevant_items': len(actual_liked_titles)
        })

    mean_precision = _mean_or_nan(all_precisions)
    mean_recall = _mean_or_nan(all_recalls)
    mean_f1 = _mean_or_nan(all_f1s)

    print("\n--- Novērtējums ---")
    print(f"Novērtēto lietotāju skaits: {len(all_precisions)}")
    print(f"Vidējā Precizitāte@{n_recommendations}: {mean_precision:.4f}")
    print(f"Vidējais Atsaukums@{n_recommendations}: {mean_recall:.4f}")
    print(f"Vidējais F1@{n_recommendations}: {mean_f1:.4f}")

    # Print a table with the per-user metrics.
    print("\n--- Detalizēta lietotāju statistika ---")
    print(f"{'LietotājaId':<10} {'Precizitāte':>10} {'Atsaukums':>10} {'F1':>10} {'Rel. vienumu skaits':>20}")
    print(f"{'-'*70}")

    user_metrics.sort(key=lambda x: x['userId'])
    for m in user_metrics:
        print(f"{m['userId']:<10} {m['precision']:>10.4f} {m['recall']:>10.4f} {m['f1']:>10.4f} {m['relevant_items']:>20}")

    # Return all metrics as a dictionary.
    return {
        'precision': mean_precision,
        'recall': mean_recall,
        'f1': mean_f1,
        'lietotāja_metrikas': user_metrics
    }


In [None]:
if __name__ == "__main__":
    import numpy as np

    # Experiment grid: number of most active users x number of recommendations.
    user_counts = [10, 20, 50]
    recommendation_counts = [5, 20, 50]
    # Each configuration is repeated because the seed movie is drawn at random.
    num_iterations = 5

    metric_names = ('precision', 'recall', 'f1')

    for top_n_users in user_counts:
        for n_recommendations in recommendation_counts:
            print(f"\n===> Evaluating with top_n_users={top_n_users}, n_recommendations={n_recommendations}")

            # One list of per-run values for every metric.
            per_run = {name: [] for name in metric_names}

            for run_no in range(1, num_iterations + 1):
                results = evaluate(
                    movies,
                    ratings,
                    recommend_movies,
                    n_recommendations=n_recommendations,
                    top_n_users=top_n_users,
                )
                for name in metric_names:
                    per_run[name].append(results[name])

                print(f"[Iter {run_no}] Precision={results['precision']:.4f}, Recall={results['recall']:.4f}, F1={results['f1']:.4f}")

            # Average of the repeated runs for this configuration.
            print(f"\n>>> Average results after {num_iterations} runs:")
            print(f"Precision: {np.mean(per_run['precision']):.4f}")
            print(f"Recall:    {np.mean(per_run['recall']):.4f}")
            print(f"F1 Score:  {np.mean(per_run['f1']):.4f}")



===> Evaluating with top_n_users=10, n_recommendations=5

--- Novērtējums ---
Novērtēto lietotāju skaits: 10
Vidējā Precizitāte@5: 0.2000
Vidējais Atsaukums@5: 0.0014
Vidējais F1@5: 0.0028

--- Detalizēta lietotāju statistika ---
LietotājaId Precizitāte  Atsaukums         F1  Rel. vienumu skaits
----------------------------------------------------------------------
68             0.6000     0.0048     0.0095                  629
274            0.2000     0.0013     0.0025                  788
288            0.0000     0.0000     0.0000                  424
380            0.0000     0.0000     0.0000                  693
414            0.2000     0.0007     0.0014                 1458
448            0.2000     0.0017     0.0034                  592
474            0.4000     0.0015     0.0029                 1366
599            0.4000     0.0042     0.0083                  479
606            0.0000     0.0000     0.0000                  853
610            0.0000     0.0000     0.0000   