In [None]:
import numpy as np
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Loads the ratings and movie-metadata CSV files
def load_data(ratings_path=r"C:\Bakalaurs_praktiskais\Bakalaura-darbs\ratings.csv",
              movies_path=r"C:\Bakalaurs_praktiskais\Bakalaura-darbs\movies.csv"):
    """Read the ratings and movies CSV files and normalise their column names.

    Parameters
    ----------
    ratings_path : str or path-like
        Location of the ratings CSV. Defaults to the original hard-coded path so
        existing ``load_data()`` calls keep working; pass a path explicitly to
        run the notebook on another machine.
    movies_path : str or path-like
        Location of the movies CSV (same remark as above).

    Returns
    -------
    (ratings_df, movies_df) : tuple of pandas.DataFrame
        ``ratings_df`` has ``userId``/``movieId`` renamed to ``user_id``/``item_id``
        when those columns are present; ``movies_df`` has ``movieId`` renamed to
        ``movie_id`` and its ``genres`` column converted from ``a|b|c`` to
        ``a b c`` so it can be tokenised by a text vectoriser.
    """
    # rename() silently ignores keys that are not present, so no column check is needed
    ratings_df = pd.read_csv(ratings_path).rename(
        columns={'userId': 'user_id', 'movieId': 'item_id'}
    )
    movies_df = pd.read_csv(movies_path).rename(columns={'movieId': 'movie_id'})

    # Genre preprocessing for TF-IDF. Missing genres become an empty string
    # instead of crashing on NaN; regex=False keeps '|' a literal character.
    movies_df['genres'] = (
        movies_df['genres']
        .fillna('')
        .astype(str)
        .str.replace('|', ' ', regex=False)
    )

    print(f"Loaded data: {len(ratings_df)} ratings and {len(movies_df)} movies")
    return ratings_df, movies_df

In [None]:
# Matrix-factorisation model built on the Surprise library
class SVDModel:
    """Thin wrapper around ``surprise.SVD`` that also keeps the train/test split."""

    def __init__(self, n_factors=200, n_epochs=50, lr_all=0.001, reg_all=0.01):
        """Create the underlying SVD estimator with the given hyper-parameters."""
        self.model = SVD(n_factors=n_factors, n_epochs=n_epochs,
                         lr_all=lr_all, reg_all=reg_all)
        self.trainset = None
        self.testset = None

    def prepare_data(self, ratings_df, test_size=0.2, random_state=42, rating_scale=(1, 5)):
        """Split the ratings into a training set and a test set.

        ``ratings_df`` must contain the columns ``user_id``, ``item_id`` and
        ``rating``. The split is stored on the instance and also returned as
        ``(trainset, testset)``.

        ``test_size``, ``random_state`` and ``rating_scale`` default to the
        values that used to be hard-coded.
        NOTE(review): if the ratings file contains half-star ratings below 1,
        pass ``rating_scale=(0.5, 5)`` -- confirm against the data.
        """
        reader = Reader(rating_scale=rating_scale)
        data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)
        self.trainset, self.testset = train_test_split(
            data, test_size=test_size, random_state=random_state
        )
        return self.trainset, self.testset

    def train(self, trainset=None):
        """Fit the model on ``trainset`` (or on the stored training set).

        Raises
        ------
        ValueError
            If no training set was passed and ``prepare_data`` has not been called.
        """
        if trainset is not None:
            self.trainset = trainset
        if self.trainset is None:
            raise ValueError("No training set available: call prepare_data() or pass trainset")
        self.model.fit(self.trainset)

    def evaluate(self, testset=None):
        """Return the model's predictions for ``testset`` (or the stored test set).

        Raises
        ------
        ValueError
            If no test set was passed and ``prepare_data`` has not been called.
        """
        if testset is not None:
            self.testset = testset
        if self.testset is None:
            raise ValueError("No test set available: call prepare_data() or pass testset")
        return self.model.test(self.testset)

    def cross_validate(self, data, cv=5):
        """Run ``cv``-fold cross-validation on ``data`` and return RMSE and MAE results."""
        # Inside a method the bare name resolves to the module-level Surprise
        # function, not to this method.
        return cross_validate(self.model, data, measures=['RMSE', 'MAE'], cv=cv, verbose=True)

    def predict_rating(self, user_id, item_id):
        """Return the estimated rating of ``item_id`` by ``user_id``."""
        return self.model.predict(user_id, item_id).est

In [None]:
# TF-IDF model
class TFIDFModel:
    """Content-based model: TF-IDF vectors of the genre strings plus cosine similarity."""

    def __init__(self):
        """Create an unfitted vectoriser; the matrices are filled in by ``fit``."""
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = None
        self.movies_df = None
        self.cosine_sim = None

    def fit(self, movies_df):
        """Build the TF-IDF matrix from ``movies_df['genres']`` and the movie-to-movie
        cosine-similarity matrix.

        Row/column ``i`` of the similarity matrix corresponds to the ``i``-th row
        (by position) of ``movies_df``. Returns the similarity matrix.
        """
        self.movies_df = movies_df
        # Missing genres are treated as an empty document instead of raising
        genre_docs = movies_df['genres'].fillna('')
        self.tfidf_matrix = self.vectorizer.fit_transform(genre_docs)
        self.cosine_sim = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix)
        return self.cosine_sim

    def get_recommendations(self, movie_id, top_n=10):
        """Return the ``top_n`` rows of ``movies_df`` most similar to ``movie_id``.

        The query movie itself is never part of the result. Ties keep the order
        of ``movies_df``.

        Raises
        ------
        IndexError
            If ``movie_id`` does not occur in ``movies_df['movie_id']``.
        """
        # Use the row POSITION, not the index label: the similarity matrix is
        # positional, and the two only coincide for a default RangeIndex.
        matches = np.flatnonzero(self.movies_df['movie_id'].values == movie_id)
        if matches.size == 0:
            raise IndexError(f"movie_id {movie_id!r} not found in movies_df")
        movie_pos = int(matches[0])

        similarities = np.asarray(self.cosine_sim[movie_pos])
        # Stable descending sort, so equal scores keep catalogue order
        ranked = np.argsort(-similarities, kind='stable')
        # Drop the query movie explicitly. Many movies share an identical genre
        # string (similarity 1.0), so it is not guaranteed to be ranked first.
        ranked = ranked[ranked != movie_pos][:top_n]
        return self.movies_df.iloc[ranked]

**Jaunais eval**


In [None]:
# Weighted hybrid model that combines TF-IDF content similarity with matrix factorisation
class HybridRecommender:
    """Weighted blend of an SVD rating prediction and a genre-similarity score.

    ``hybrid = svd_weight * svd_prediction + (1 - svd_weight) * cb_scale * mean_similarity``
    where ``mean_similarity`` is the average cosine similarity between the
    candidate movie and the movies the user rated highly.
    """

    def __init__(self, svd_weight=0.5, cb_scale=10):
        """
        Parameters
        ----------
        svd_weight : float
            Weight of the SVD prediction; the content score gets ``1 - svd_weight``.
        cb_scale : float
            Factor applied to the mean cosine similarity before blending. The
            default of 10 is the value that used to be hard-coded.
            NOTE(review): cosine similarity of TF-IDF vectors is at most 1, so the
            scaled content score can reach 10 while ratings are read with a 1-5
            scale -- confirm this is the intended range.
        """
        self.svd_model = SVDModel()
        self.tfidf_model = TFIDFModel()
        self.svd_weight = svd_weight
        self.cb_scale = cb_scale
        self.ratings_df = None
        self.movies_df = None
        # Lazily built cache: movie_id -> row position in movies_df
        self._movie_pos = None

    def train(self, ratings_df, movies_df):
        """Fit both sub-models and return ``self``.

        ``ratings_df`` needs ``user_id``, ``item_id`` and ``rating`` columns;
        ``movies_df`` needs ``movie_id``, ``title`` and ``genres``.
        """
        self.ratings_df = ratings_df
        self.movies_df = movies_df
        self._movie_pos = None  # invalidate the cache for the new catalogue

        # Train the matrix-factorisation model (prepare_data builds the dataset itself)
        self.svd_model.prepare_data(ratings_df)
        self.svd_model.train()

        # Train the TF-IDF model
        self.tfidf_model.fit(movies_df)

        return self

    def _movie_positions(self):
        """Return (and cache) a dict mapping movie_id to its row position in movies_df."""
        if self._movie_pos is None:
            positions = {}
            for pos, movie_id in enumerate(self.movies_df['movie_id'].tolist()):
                # First occurrence wins if an id is duplicated
                positions.setdefault(movie_id, pos)
            self._movie_pos = positions
        return self._movie_pos

    def _movie_title(self, movie_id):
        """Return the title of a movie known to be in the catalogue."""
        return self.movies_df['title'].iloc[self._movie_positions()[movie_id]]

    # Builds the hybrid method's predicted rating
    def get_movie_prediction(self, user_id, movie_id, user_high_rated=None, min_rating=3.5):
        """Return ``(hybrid_pred, svd_pred, cb_score_scaled)`` for one user/movie pair.

        ``user_high_rated`` is the frame of the user's highly rated movies; when
        omitted it is derived from ``ratings_df`` using ``min_rating``. If the
        movie is not in the catalogue or the user has no highly rated movies, the
        SVD prediction is returned unchanged with a content score of 0.
        """
        svd_pred = self.svd_model.predict_rating(user_id, movie_id)

        positions = self._movie_positions()
        movie_pos = positions.get(movie_id)
        if movie_pos is None:
            return svd_pred, svd_pred, 0

        if user_high_rated is None:
            user_high_rated = self.ratings_df[
                (self.ratings_df['user_id'] == user_id) &
                (self.ratings_df['rating'] >= min_rating)
            ]
        if user_high_rated.empty:
            return svd_pred, svd_pred, 0

        # Row positions (not index labels) address the positional similarity matrix
        cb_scores = self.tfidf_model.cosine_sim[movie_pos]
        rated_positions = [
            positions[rated_id]
            for rated_id in user_high_rated['item_id']
            if rated_id in positions
        ]
        cb_score = cb_scores[rated_positions].mean() if rated_positions else 0

        # Weighted blend of the two methods' scores
        cb_score_scaled = cb_score * self.cb_scale
        hybrid_pred = (
            self.svd_weight * svd_pred +
            (1 - self.svd_weight) * cb_score_scaled
        )
        return hybrid_pred, svd_pred, cb_score_scaled

    # Returns a top-n list of movies the user has not rated, ranked by hybrid prediction
    def recommend(self, user_id, top_n=10, min_rating=3.0, cf_candidates=5000, user_rating_threshold=3.0):
        """Recommend up to ``top_n`` unrated movies for ``user_id``.

        The ``cf_candidates`` best movies by SVD prediction are re-ranked with the
        hybrid score. ``user_rating_threshold`` selects the ratings that count as
        "highly rated" for the content score. ``min_rating`` is only forwarded to
        ``get_movie_prediction``, which ignores it because the highly rated frame
        is passed explicitly. If the user has no highly rated movies (cold start),
        only the SVD prediction is used.

        Returns a DataFrame with columns ``movie_id``, ``title``, ``hybrid_score``,
        ``cf_score`` and ``cb_score`` (an empty DataFrame if nothing is unrated).
        """
        user_ratings = self.ratings_df[self.ratings_df['user_id'] == user_id]
        user_rated_movies = set(user_ratings['item_id'])
        unrated_movies = [
            movie_id for movie_id in self._movie_positions()
            if movie_id not in user_rated_movies
        ]
        if not unrated_movies:
            return pd.DataFrame()

        user_high_rated = user_ratings[user_ratings['rating'] >= user_rating_threshold]

        svd_ranked = sorted(
            ((movie_id, self.svd_model.predict_rating(user_id, movie_id))
             for movie_id in unrated_movies),
            key=lambda pair: pair[1],
            reverse=True,
        )

        if user_high_rated.empty:
            # Cold start: fall back to matrix factorisation only
            scored = [(movie_id, pred, pred, 0) for movie_id, pred in svd_ranked[:top_n]]
        else:
            scored = []
            for movie_id, _ in svd_ranked[:cf_candidates]:
                hybrid_pred, svd_pred, cb_score = self.get_movie_prediction(
                    user_id, movie_id, user_high_rated, min_rating
                )
                if hybrid_pred > 0:
                    scored.append((movie_id, hybrid_pred, svd_pred, cb_score))
            scored.sort(key=lambda entry: entry[1], reverse=True)
            scored = scored[:top_n]

        return pd.DataFrame([
            {
                'movie_id': movie_id,
                'title': self._movie_title(movie_id),
                'hybrid_score': hybrid_score,
                'cf_score': cf_score,
                'cb_score': cb_score,
            }
            for movie_id, hybrid_score, cf_score, cb_score in scored
        ])

    # Measures the hybrid's evaluation metrics (precision, recall, F1, hit rate, MAE, RMSE)
    def evaluate_hybrid(self, test_users=20, top_n=100, train_ratio=0.8, liked_threshold=3.0,
                        min_user_ratings=300, random_state=None):
        """Evaluate the hybrid on a sample of heavy users and return per-user metrics.

        For each sampled user the ratings are split ``train_ratio`` / rest, a fresh
        recommender is trained without the user's held-out ratings, and the metrics
        are computed against the held-out ratings.

        Parameters
        ----------
        test_users : int
            Maximum number of users to sample.
        top_n : int
            Length of the recommendation list that is scored.
        train_ratio : float
            Fraction of each user's ratings kept for training.
        liked_threshold : float
            Ratings at or above this value count as "liked".
        min_user_ratings : int
            Minimum number of ratings a user needs to qualify (previously fixed at 300).
        random_state : int or None
            Seed for the user sample. ``None`` keeps the previous behaviour of
            drawing from NumPy's global random state.

        Returns
        -------
        pandas.DataFrame
            One row per evaluated user, or a single-column ``error`` frame when no
            user qualifies. ``mae``/``rmse`` are NaN (not ``inf``) when no prediction
            could be scored, so they do not distort aggregated means.
        """
        rating_counts = self.ratings_df['user_id'].value_counts()
        qualified_users = rating_counts[rating_counts >= min_user_ratings].index.tolist()

        if len(qualified_users) == 0:
            return pd.DataFrame({"error": ["Nav atrasts neviens lietotājs ar pietiekamu skaitu vērtējumu"]})

        sample_size = min(test_users, len(qualified_users))
        if random_state is None:
            sampled_users = np.random.choice(qualified_users, sample_size, replace=False)
        else:
            sampled_users = np.random.RandomState(random_state).choice(
                qualified_users, sample_size, replace=False
            )

        columns = ['user_id', 'num_ratings', 'num_liked_test', 'precision', 'recall',
                   'f1_score', 'hit_rate', 'mae', 'rmse']
        records = []

        for user_id in sampled_users:
            user_data = self.ratings_df[self.ratings_df['user_id'] == user_id]
            num_ratings = len(user_data)

            # Split this user's ratings into training and test parts
            train_data = user_data.sample(frac=train_ratio, random_state=42)
            test_data = user_data.drop(train_data.index)

            actual_liked = set(test_data.loc[test_data['rating'] >= liked_threshold, 'item_id'])
            num_liked_test = len(actual_liked)
            if num_liked_test == 0:
                continue

            temp_ratings = pd.concat([
                self.ratings_df[self.ratings_df['user_id'] != user_id],
                train_data
            ])

            temp_hybrid = HybridRecommender(svd_weight=self.svd_weight, cb_scale=self.cb_scale)
            temp_hybrid.train(temp_ratings, self.movies_df)

            # Build recommendations with recommend(); the liked threshold is also
            # used to pick the user's highly rated movies so both stay consistent
            hybrid_recs = temp_hybrid.recommend(
                user_id,
                top_n=top_n,
                min_rating=liked_threshold,
                cf_candidates=max(0, min(60, len(self.movies_df) - len(user_data))),
                user_rating_threshold=liked_threshold
            )
            if hybrid_recs.empty:
                continue

            # Precision, recall, F1 and hit rate
            recommended = set(hybrid_recs['movie_id'])
            true_positives = len(recommended & actual_liked)
            precision = true_positives / len(recommended) if recommended else 0
            recall = true_positives / num_liked_test
            f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            hit_rate = 1 if true_positives > 0 else 0

            # MAE and RMSE on the held-out ratings
            user_high_rated = train_data[train_data['rating'] >= liked_threshold]
            abs_errors = []
            for movie_id, actual_rating in zip(test_data['item_id'], test_data['rating']):
                hybrid_pred, _, _ = temp_hybrid.get_movie_prediction(
                    user_id, movie_id, user_high_rated, min_rating=liked_threshold
                )
                if hybrid_pred > 0:
                    abs_errors.append(abs(actual_rating - hybrid_pred))

            if abs_errors:
                errors = np.asarray(abs_errors, dtype=float)
                mae = errors.mean()
                rmse = np.sqrt((errors ** 2).mean())
            else:
                mae = rmse = np.nan

            records.append({
                'user_id': user_id,
                'num_ratings': num_ratings,
                'num_liked_test': num_liked_test,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'hit_rate': hit_rate,
                'mae': mae,
                'rmse': rmse,
            })

        return pd.DataFrame(records, columns=columns)

    # Predicts ratings for a user's held-out test ratings, split the same way as in evaluate_hybrid
    def recommend_on_test_set(self, user_id, train_ratio=0.8, min_rating=0, cf_candidates=100, top_n=11):
        """Predict the user's held-out ratings and return the ``top_n`` highest predictions.

        A fresh recommender is trained without the held-out ratings. The result has
        the columns ``movie_id``, ``title``, ``actual_rating``, ``predicted_rating``,
        ``cf_score`` and ``cb_score``; a single-column ``error`` frame is returned
        when the user has fewer than 5 ratings or the test part is empty.
        ``cf_candidates`` is accepted for backward compatibility and not used.
        ``top_n`` defaults to 11, the value that used to be hard-coded.
        """
        user_data = self.ratings_df[self.ratings_df['user_id'] == user_id]

        if len(user_data) < 5:
            return pd.DataFrame({"error": ["Nav atrasti pietiekami daudz lietotāji ar pietiekamu skaitu vērtējumiem"]})

        train_data = user_data.sample(frac=train_ratio, random_state=42)
        test_data = user_data.drop(train_data.index)

        if test_data.empty:
            return pd.DataFrame({"error": ["Testa kopa ir tukša"]})

        temp_ratings = pd.concat([
            self.ratings_df[self.ratings_df['user_id'] != user_id],
            train_data
        ])

        temp_hybrid = HybridRecommender(svd_weight=self.svd_weight, cb_scale=self.cb_scale)
        temp_hybrid.train(temp_ratings, self.movies_df)

        user_high_rated = train_data[train_data['rating'] >= min_rating]
        known_movies = self._movie_positions()

        predictions = []
        for movie_id, actual_rating in zip(test_data['item_id'], test_data['rating']):
            if movie_id not in known_movies:
                continue

            hybrid_pred, svd_pred, cb_score = temp_hybrid.get_movie_prediction(
                user_id, movie_id, user_high_rated, min_rating
            )
            predictions.append({
                'movie_id': movie_id,
                'title': self._movie_title(movie_id),
                'actual_rating': actual_rating,
                'predicted_rating': hybrid_pred,
                'cf_score': svd_pred,
                'cb_score': cb_score
            })

        columns = ['movie_id', 'title', 'actual_rating', 'predicted_rating', 'cf_score', 'cb_score']
        if not predictions:
            # sort_values would raise KeyError on a frame without columns
            return pd.DataFrame(columns=columns)

        predictions_df = pd.DataFrame(predictions, columns=columns)
        return predictions_df.sort_values(by='predicted_rating', ascending=False).head(top_n)
   

**Jaunais main**


In [None]:
def main():
    """Run the whole experiment: cross-validate SVD, train and evaluate the hybrid
    model, then print example recommendations and test-set predictions."""
    # Seed NumPy's global random state so the evaluation's user sample is repeatable
    np.random.seed(42)

    # Load the ratings.csv and movies.csv datasets
    ratings_df, movies_df = load_data()

    # Cross-validate the matrix-factorisation model
    svd_model = SVDModel()
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)
    cv_results = svd_model.cross_validate(data)
    print(f"SVD Cross-validation results: {cv_results}")

    # Train the hybrid model
    hybrid_model = HybridRecommender(svd_weight=0.7)
    hybrid_model.train(ratings_df, movies_df)

    # Evaluate the hybrid model
    eval_results = hybrid_model.evaluate_hybrid(test_users=20, top_n=20)

    if 'error' in eval_results.columns:
        # evaluate_hybrid signals "no qualifying users" with a one-column frame;
        # reading the metric columns from it would raise KeyError
        print(eval_results['error'].iloc[0])
    else:
        # Print the evaluation metrics
        print("\nHybrīdā modeļa metrikas")
        print(eval_results.describe())

        # Print the mean evaluation metrics of the hybrid model
        print("\nVidējās hibrīdā modeļa metrikas:")
        print(f"Precizitāte: {eval_results['precision'].mean():.4f}")
        print(f"Atsaukums: {eval_results['recall'].mean():.4f}")
        print(f"F1: {eval_results['f1_score'].mean():.4f}")
        print(f"Hit rate: {eval_results['hit_rate'].mean():.4f}")
        print(f"MAE: {eval_results['mae'].mean():.4f}")
        print(f"RMSE: {eval_results['rmse'].mean():.4f}")

    test_user_id = 4
    print(f"\nIeteikumu saraksts lietotājam {test_user_id}:")

    recommendations = hybrid_model.recommend(test_user_id, top_n=5)

    print("\nTop 5 ieteikumi:")
    for i, (_, row) in enumerate(recommendations.iterrows(), 1):
        print(f"{i}. {row['title']} (Hybrid Score: {row['hybrid_score']:.4f})")

    # The header now names the user the predictions are actually computed for
    user_id = 5
    print(f"\nPrognozētie vērtējumi lietotāja {user_id} jau novērtētajām filmām:")
    results = hybrid_model.recommend_on_test_set(user_id=user_id, train_ratio=0.8, min_rating=3.0)

    # Show the predictions for the held-out ratings
    print("Ieteikumi lietotājam ar ID:", user_id)
    print(results)


if __name__ == "__main__":
    main()