In [19]:
# Step 1: Imports and setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Step 2: Load the data
# Read the ratings and users tables from CSV files in the working directory.
ratings_df, users_df = pd.read_csv('Ratings.csv'), pd.read_csv('Users.csv')

# Report the (rows, columns) shape of each table.
print("Dataset shapes:", f"Ratings: {ratings_df.shape}", f"Users: {users_df.shape}", sep="\n")

# Preview the ratings table, then summarize the rating scale and the number
# of distinct users and books it contains.
print("\nRatings Dataset Info:", ratings_df.head(), sep="\n")
print(
    f"\nRating range: {ratings_df['Book-Rating'].min()} - {ratings_df['Book-Rating'].max()}",
    f"Unique users: {ratings_df['User-ID'].nunique()}",
    f"Unique books: {ratings_df['ISBN'].nunique()}",
    sep="\n",
)

Dataset shapes:
Ratings: (1149780, 3)
Users: (278858, 3)

Ratings Dataset Info:
   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6

Rating range: 0 - 10
Unique users: 105283
Unique books: 340556


In [21]:
# Step 3: Preprocessing
def preprocess_data(ratings_df, min_ratings_per_user=6, min_ratings_per_book=10):
    """
    Filter explicit ratings and add a 1-5 normalized rating column.

    Steps, applied in this order:
      1. Drop rows whose 'Book-Rating' is 0 (implicit feedback, not a score).
      2. Keep users that still have at least ``min_ratings_per_user`` ratings.
      3. Keep books that still have at least ``min_ratings_per_book`` ratings.
      4. Map 'Book-Rating' linearly from the 1-10 scale onto 1-5, round to the
         nearest integer and clip to [1, 5].

    Each filter runs exactly once, so removing sparse books in step 3 can leave
    some users with fewer than ``min_ratings_per_user`` ratings in the result.

    Args:
        ratings_df: DataFrame with 'User-ID', 'ISBN' and 'Book-Rating' columns.
            It is never modified.
        min_ratings_per_user: Minimum number of explicit ratings a user needs.
        min_ratings_per_book: Minimum number of explicit ratings a book needs.

    Returns:
        A new DataFrame holding the surviving rows (original index labels are
        kept) plus an integer 'Book-Rating-Normalized' column.
    """
    print("Starting data preprocessing...")

    # Remove ratings with score 0 (implicit feedback, not actual ratings)
    explicit = ratings_df[ratings_df['Book-Rating'] > 0]
    print(f"After removing 0 ratings: {explicit.shape[0]} ratings")

    # Filter users with minimum ratings
    user_counts = explicit['User-ID'].value_counts()
    active_users = user_counts[user_counts >= min_ratings_per_user].index
    by_user = explicit[explicit['User-ID'].isin(active_users)]
    print(f"After user filtering (min {min_ratings_per_user} ratings): {by_user.shape[0]} ratings")
    print(f"Active users: {len(active_users)}")

    # Filter books with minimum ratings (counted after the user filter)
    book_counts = by_user['ISBN'].value_counts()
    popular_books = book_counts[book_counts >= min_ratings_per_book].index
    by_book = by_user[by_user['ISBN'].isin(popular_books)]
    print(f"After book filtering (min {min_ratings_per_book} ratings): {by_book.shape[0]} ratings")
    print(f"Popular books: {len(popular_books)}")

    # Normalize ratings from 1-10 to 1-5 scale; the clip keeps any value that
    # falls outside the expected input scale inside the 1-5 range.
    rescaled = np.round((by_book['Book-Rating'] - 1) * 4 / 9 + 1).astype(int)
    normalized = np.clip(rescaled, 1, 5)

    # .assign returns a fresh frame, so the new column is never written onto a
    # filtered slice of another frame (which pandas flags as chained assignment).
    ratings_filtered = by_book.assign(**{'Book-Rating-Normalized': normalized})

    print(f"Original rating range: {ratings_filtered['Book-Rating'].min()} - {ratings_filtered['Book-Rating'].max()}")
    print(f"Normalized rating range: {ratings_filtered['Book-Rating-Normalized'].min()} - {ratings_filtered['Book-Rating-Normalized'].max()}")

    return ratings_filtered

# Preprocess the raw ratings. The thresholds are passed explicitly: users need
# at least 6 explicit ratings and books at least 10. The book filter runs after
# the user filter, so some retained users can end up with fewer than 6 ratings.
processed_ratings = preprocess_data(ratings_df, min_ratings_per_user=6, min_ratings_per_book=10)

Starting data preprocessing...
After removing 0 ratings: 433671 ratings
After user filtering (min 6 ratings): 329336 ratings
Active users: 12019
After book filtering (min 10 ratings): 90929 ratings
Popular books: 3966
Original rating range: 1 - 10
Normalized rating range: 1 - 5


In [22]:
# Step 4: Build the user-item matrix
def create_user_item_matrix(ratings_df):
    """
    Create a dense user-item rating matrix for collaborative filtering.

    Rows and columns are numbered in order of first appearance of each
    'User-ID' / 'ISBN' in ``ratings_df``. Cells without a rating stay 0. If the
    same (user, book) pair occurs more than once, the last occurrence wins.

    Args:
        ratings_df: DataFrame with 'User-ID', 'ISBN' and
            'Book-Rating-Normalized' columns.

    Returns:
        Tuple ``(matrix, user_to_idx, book_to_idx, idx_to_user, idx_to_book)``
        where ``matrix`` is a float array of shape (n_users, n_books) and the
        four dicts translate between raw IDs and row/column indices.

    Raises:
        KeyError: If 'User-ID' or 'ISBN' contains missing values.
    """
    # factorize yields, in one vectorised pass, the unique IDs in order of
    # first appearance together with each row's integer code.
    user_codes, unique_users = pd.factorize(ratings_df['User-ID'].to_numpy())
    book_codes, unique_books = pd.factorize(ratings_df['ISBN'].to_numpy())

    # factorize marks missing IDs with -1, which would silently index the last
    # row/column of the matrix; refuse them instead.
    if (user_codes < 0).any() or (book_codes < 0).any():
        raise KeyError("ratings_df contains missing 'User-ID' or 'ISBN' values")

    # Create user and item mappings
    user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
    book_to_idx = {book: idx for idx, book in enumerate(unique_books)}
    idx_to_user = {idx: user for user, idx in user_to_idx.items()}
    idx_to_book = {idx: book for book, idx in book_to_idx.items()}

    print(f"Matrix dimensions: {len(unique_users)} users x {len(unique_books)} books")

    # NumPy does not define which value survives when an indexed assignment
    # hits the same cell twice, so keep only the last rating per pair first.
    cells = pd.DataFrame({
        'user': user_codes,
        'book': book_codes,
        'rating': ratings_df['Book-Rating-Normalized'].to_numpy(),
    }).drop_duplicates(subset=['user', 'book'], keep='last')

    # Create the matrix
    matrix = np.zeros((len(unique_users), len(unique_books)))
    matrix[cells['user'].to_numpy(), cells['book'].to_numpy()] = cells['rating'].to_numpy()

    # Calculate sparsity (undefined for an empty matrix, reported as nan)
    if matrix.size:
        sparsity = (matrix == 0).sum() / matrix.size * 100
    else:
        sparsity = float('nan')
    print(f"Matrix sparsity: {sparsity:.2f}%")

    return matrix, user_to_idx, book_to_idx, idx_to_user, idx_to_book

# Build the dense user-item matrix and the ID <-> index lookup tables from the
# full preprocessed ratings (i.e. before any train/test split).
print("Creating rating matrix...")
rating_matrix, user_to_idx, book_to_idx, idx_to_user, idx_to_book = create_user_item_matrix(processed_ratings)

Creating rating matrix...
Matrix dimensions: 10309 users x 3966 books
Matrix sparsity: 99.78%


In [23]:
# Step 5: Split the data
def create_train_test_split(ratings_df, test_size=0.2, min_ratings_for_split=6, random_state=42):
    """
    Split ratings into train and test sets, one split per user.

    Users are processed in order of first appearance. A user with at least
    ``min_ratings_for_split`` ratings has them divided by ``train_test_split``;
    every other user's ratings go entirely into the train set, so such users
    never appear in the test set.

    Args:
        ratings_df: DataFrame with a 'User-ID' column, one row per rating.
        test_size: Passed through to ``train_test_split`` for each user.
        min_ratings_for_split: Minimum ratings a user needs to be split.
        random_state: Seed passed to ``train_test_split`` for each user.

    Returns:
        Tuple ``(train_df, test_df)`` with fresh 0..n-1 indexes. ``test_df`` is
        an empty DataFrame when no user qualified for a split; ``train_df`` is
        an empty frame with the input's columns when ``ratings_df`` has no rows.
    """
    train_parts = []
    test_parts = []

    # A single groupby pass replaces re-filtering the whole frame once per
    # user; sort=False keeps users in order of first appearance and each group
    # keeps its rows in their original order.
    for _, user_ratings in ratings_df.groupby('User-ID', sort=False):
        if len(user_ratings) >= min_ratings_for_split:  # Ensure enough ratings for split
            train_user, test_user = train_test_split(
                user_ratings, test_size=test_size, random_state=random_state
            )
            train_parts.append(train_user)
            test_parts.append(test_user)
        else:
            train_parts.append(user_ratings)

    # pd.concat raises on an empty list, so fall back to empty frames.
    train_df = pd.concat(train_parts, ignore_index=True) if train_parts else ratings_df.iloc[0:0].copy()
    test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

    print(f"Train set: {len(train_df)} ratings")
    print(f"Test set: {len(test_df)} ratings")

    return train_df, test_df

# Split the preprocessed ratings per user, using the function's default
# test fraction.
train_ratings, test_ratings = create_train_test_split(processed_ratings)

Train set: 73962 ratings
Test set: 16967 ratings


In [24]:
# Step 6: SVD Recommender
class SVDRecommender:
    """
    Collaborative-filtering recommender built on a truncated SVD.

    The user-item matrix is mean-centered per user (rated cells only),
    factorized with ``TruncatedSVD``, and reconstructed into a dense matrix of
    predicted ratings that ``predict`` and ``recommend_items`` read from.
    """

    def __init__(self, n_components=50, random_state=42):
        """
        Args:
            n_components: Number of latent components kept by the SVD.
            random_state: Seed forwarded to ``TruncatedSVD``.
        """
        self.n_components = n_components
        self.random_state = random_state
        self.svd = TruncatedSVD(n_components=n_components, random_state=random_state)
        # Populated by fit(); declared here so the attributes always exist.
        self.user_mean = None
        self.user_factors = None
        self.item_factors = None
        self.reconstructed = None
        self.is_fitted = False

    def fit(self, user_item_matrix):
        """
        Train the SVD model.

        Args:
            user_item_matrix: 2-D array-like of shape (n_users, n_items) where
                values > 0 are ratings and every other cell counts as unrated.
                It is not modified.
        """
        print("Training SVD model...")

        # Work in float so that integer rating matrices can be mean-centered.
        ratings = np.asarray(user_item_matrix, dtype=float)
        rated = ratings > 0

        # Calculate user means for centering (0 for users without any rating)
        n_rated = rated.sum(axis=1)
        rating_sums = np.where(rated, ratings, 0.0).sum(axis=1)
        self.user_mean = np.divide(
            rating_sums, n_rated, out=np.zeros(ratings.shape[0]), where=n_rated > 0
        )

        # Mean-center the matrix: only rated cells are shifted, unrated cells
        # keep their value because the mask contributes 0 there.
        centered_matrix = ratings - rated * self.user_mean[:, np.newaxis]

        # Apply SVD
        self.user_factors = self.svd.fit_transform(centered_matrix)
        self.item_factors = self.svd.components_.T

        # Reconstruct the matrix and add back user means
        self.reconstructed = (
            np.dot(self.user_factors, self.item_factors.T) + self.user_mean[:, np.newaxis]
        )

        self.is_fitted = True
        print(f"Model trained with {self.n_components} components")
        print(f"Explained variance ratio: {self.svd.explained_variance_ratio_.sum():.4f}")

    def predict(self, user_idx, item_idx):
        """
        Predict rating for a user-item pair

        Args:
            user_idx: Row index of the user in the fitted matrix.
            item_idx: Column index of the item in the fitted matrix.

        Returns:
            The reconstructed rating clipped to the 1-5 range.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        prediction = self.reconstructed[user_idx, item_idx]
        return np.clip(prediction, 1, 5)  # Ensure prediction is within valid range

    def recommend_items(self, user_idx, user_item_matrix, n_recommendations=10):
        """
        Recommend items for a user

        Args:
            user_idx: Row index of the user.
            user_item_matrix: Matrix whose row ``user_idx`` marks items the
                user has not rated with 0; only those items are candidates.
            n_recommendations: Maximum number of items to return.

        Returns:
            List of ``(item_idx, predicted_rating)`` tuples, best first, with
            ratings clipped to 1-5.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making recommendations")

        # Get items user hasn't rated
        user_ratings = np.asarray(user_item_matrix[user_idx])
        unrated_items = np.flatnonzero(user_ratings == 0)

        # Rank on the raw reconstructed scores. Ranking on clipped values makes
        # every score above 5 tie, which collapses the order to column index.
        raw_scores = self.reconstructed[user_idx, unrated_items]
        ranking = np.argsort(-raw_scores, kind='stable')[:n_recommendations]

        top_items = unrated_items[ranking]
        top_ratings = np.clip(raw_scores[ranking], 1, 5)  # reported on the 1-5 scale
        return list(zip(top_items, top_ratings))

# Create the training matrix from the train split only. The train_* lookup
# tables returned here are the ones built together with train_matrix, so they
# are the mappings to use with the fitted model.
train_matrix, train_user_to_idx, train_book_to_idx, train_idx_to_user, train_idx_to_book = create_user_item_matrix(train_ratings)

# Initialize the recommender with 50 components and train it on the training matrix
recommender = SVDRecommender(n_components=50)
recommender.fit(train_matrix)

Matrix dimensions: 10309 users x 3966 books
Matrix sparsity: 99.82%
Training SVD model...
Model trained with 50 components
Explained variance ratio: 0.1786


In [25]:
# Step 7: Model evaluation
def evaluate_model(recommender, test_ratings, train_user_to_idx, train_book_to_idx):
    """
    Score the recommender on held-out ratings with RMSE and MAE.

    Only test rows whose user and book both appear in the training lookup
    tables are scored; all other rows are skipped.

    Args:
        recommender: Object exposing ``predict(user_idx, item_idx)``.
        test_ratings: DataFrame with 'User-ID', 'ISBN' and
            'Book-Rating-Normalized' columns.
        train_user_to_idx: Mapping from user ID to training-matrix row index.
        train_book_to_idx: Mapping from ISBN to training-matrix column index.

    Returns:
        ``(rmse, mae)``, or ``(None, None)`` when no test row could be scored.
    """
    predicted = []
    observed = []

    for _, rating_row in test_ratings.iterrows():
        uid = rating_row['User-ID']
        isbn = rating_row['ISBN']
        true_rating = rating_row['Book-Rating-Normalized']

        # Pairs unknown to the training data cannot be looked up in the model.
        if uid not in train_user_to_idx or isbn not in train_book_to_idx:
            continue

        row_idx = train_user_to_idx[uid]
        col_idx = train_book_to_idx[isbn]
        predicted.append(recommender.predict(row_idx, col_idx))
        observed.append(true_rating)

    # Nothing scorable: report it and return the sentinel pair.
    if not predicted:
        print("No valid predictions could be made")
        return None, None

    rmse = np.sqrt(mean_squared_error(observed, predicted))
    mae = mean_absolute_error(observed, predicted)

    print(f"Evaluation Results:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Number of predictions: {len(predicted)}")

    return rmse, mae

# Evaluate the model on the held-out ratings. The call is skipped when the
# test split is empty, in which case rmse and mae are not assigned here.
if not test_ratings.empty:
    rmse, mae = evaluate_model(recommender, test_ratings, train_user_to_idx, train_book_to_idx)

Evaluation Results:
RMSE: 0.7720
MAE: 0.5838
Number of predictions: 16967


In [26]:
# Step 8: SAVE THE MODEL FOR DJANGO
# Bundle the fitted recommender with the lookup tables and data that were
# built from the training split, then write everything to a single file.
# NOTE(review): the bundle pickles the SVDRecommender instance itself. Pickle
# stores classes by module path and name, so whatever loads this file must be
# able to import SVDRecommender under the same path - confirm the consumer does.
# NOTE(review): the dense training matrix and the full processed ratings frame
# are stored as well, which presumably dominates the file size - verify that
# the consumer needs both.
model_data = dict(
    recommender=recommender,
    user_to_idx=train_user_to_idx,
    book_to_idx=train_book_to_idx,
    idx_to_user=train_idx_to_user,
    idx_to_book=train_idx_to_book,
    processed_ratings=processed_ratings,
    user_item_matrix=train_matrix,
)

joblib.dump(model_data, 'svd_recommender_clean.pkl')
print("Model saved successfully!")

Model saved successfully!
