In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error, mean_absolute_error

from math import sqrt


In [None]:
# Load data
# Reads the prepared book dataset from a CSV file resolved relative to the
# current working directory. No index_col is given, so pandas assigns its
# default 0..n-1 integer row index.
book_data = pd.read_csv('prepared_bookdata.csv')

## Data Preprocessing

In [None]:
# Rename the raw lower-case / camelCase columns to the display names used below
book_data.rename(
    columns={
        'authors': 'Authors',
        'description': 'Description',
        'publisher': 'Publisher',
        'publishedDate': 'Published Date',
        'rating': 'Rating',
        'categories': 'Categories',
        'ratingsCount': 'Ratings Count',
    },
    inplace=True,
)


In [None]:
# Combine relevant text fields into a single text input.
# Missing values are blanked out first: a plain str() turns NaN/None into the
# literal tokens "nan"/"None", which TF-IDF would then treat as a word shared
# by every book that happens to have a missing field.
text_columns = ['Title', 'Description', 'Authors', 'Publisher', 'Published Date', 'Categories']
text_fields = book_data[text_columns]
book_data['combined_text'] = (
    text_fields.astype(object)              # allow '' to sit in numeric columns
    .where(text_fields.notna(), '')         # missing -> empty string
    .astype(str)
    .agg(' '.join, axis=1)                  # one space-separated document per row
)

In [None]:
# Create a mapping from book titles to their indices.
# Series.drop_duplicates() compares the *values* (the row labels), so it does not
# remove repeated titles and indices[title] could return a whole Series.
# De-duplicate on the title index instead, keeping the first row of each title,
# so every lookup yields a single row label.
indices = pd.Series(book_data.index, index=book_data['Title'])
indices = indices[~indices.index.duplicated(keep='first')]


## Modelling

In [None]:
# Create TF-IDF Vectorizer
# stop_words='english' makes the vectorizer drop scikit-learn's built-in list of
# English stop words; every other setting is left at its default.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Transform documents into TF-IDF matrix
# fit_transform learns the vocabulary/IDF weights from combined_text and returns
# the document-term matrix in one step; row i corresponds to the i-th row of
# book_data (positional order).
tfidf_matrix = tfidf.fit_transform(book_data['combined_text'])


In [None]:
# Cosine similarity of one document against the whole corpus, computed on request
def calculate_similarity_on_demand(idx, tfidf_matrix):
    """Return the cosine similarity between row ``idx`` and every row of ``tfidf_matrix``.

    Args:
        idx: Row selector applied to ``tfidf_matrix`` (``tfidf_matrix[idx]``).
        tfidf_matrix: Document-term matrix to compare against.

    Returns:
        The result of ``cosine_similarity`` flattened to one dimension.
    """
    query_rows = tfidf_matrix[idx]
    pairwise = cosine_similarity(query_rows, tfidf_matrix)
    return pairwise.flatten()

In [None]:
# Define recommendation function to find similar books
def get_recommendations_on_demand(query, search_by='title', n_recommendations=10):
    """Recommend the books most similar to the books matched by ``query``.

    The similarity of every book to each matched book is computed with
    ``calculate_similarity_on_demand`` on the module-level ``tfidf_matrix`` and
    averaged over the matched books; the highest-scoring rows of the
    module-level ``book_data`` are returned. The matched books themselves are
    not filtered out of the result.

    Args:
        query: Text to search for. With ``search_by='title'`` a book matches
            when its title starts with ``query`` (case-sensitive); with
            ``search_by='Authors'`` it matches when its author string contains
            ``query`` (case-insensitive, literal match).
        search_by: ``'title'`` or ``'Authors'``. Compared case-insensitively,
            so ``'Title'`` and ``'authors'`` are accepted as well.
        n_recommendations: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns Title, Authors, Categories, Rating,
        Ratings Count, Published Date and Similarity Score, ordered by
        descending score. If no book matches ``query`` the frame has these
        columns and no rows. If ``search_by`` is not recognised a completely
        empty DataFrame is returned.
    """
    mode = str(search_by).strip().lower()
    if mode == 'title':
        matches = book_data['Title'].str.startswith(query, na=False)
    elif mode == 'authors':
        # na=False: rows with a missing author never match (and cannot put NaN in the mask)
        matches = book_data['Authors'].str.contains(query, case=False, regex=False, na=False)
    else:
        return pd.DataFrame()  # Return an empty DataFrame if no valid search_by option

    output_columns = ['Title', 'Authors', 'Categories', 'Rating', 'Ratings Count', 'Published Date']

    # tfidf_matrix is addressed by row position, so work with positions rather
    # than index labels (the two only coincide for a default 0..n-1 index).
    matched_rows = np.flatnonzero(matches.to_numpy(dtype=bool))
    if matched_rows.size == 0:
        # Nothing to average over: dividing by zero would give NaN scores and
        # an arbitrary set of "recommendations".
        return pd.DataFrame(columns=output_columns + ['Similarity Score'])

    # Accumulate one similarity vector at a time to keep memory at O(n_books)
    sim_scores = np.zeros(len(book_data))
    for row in matched_rows:
        sim_scores += calculate_similarity_on_demand(row, tfidf_matrix)
    sim_scores /= matched_rows.size  # Average similarity scores across matched books

    # Stable sort on the negated scores = descending order, ties kept in row order
    top_rows = np.argsort(-sim_scores, kind='stable')[:n_recommendations]

    # .copy() so that adding a column does not write into a slice of book_data
    recommendations = book_data[output_columns].iloc[top_rows].copy()
    recommendations['Similarity Score'] = sim_scores[top_rows]
    return recommendations


In [None]:
# Clean author names by removing the surrounding square brackets and the quote characters.
# The vectorised .str methods pass missing values through as NaN, whereas the
# per-element lambda raised AttributeError as soon as it met a NaN (a float).
book_data['Authors'] = (
    book_data['Authors']
    .str.strip("[]")
    .str.replace("'", "", regex=False)
)


In [None]:
# Clean category names by removing the surrounding square brackets and the quote characters.
# The vectorised .str methods pass missing values through as NaN, whereas the
# per-element lambda raised AttributeError as soon as it met a NaN (a float).
book_data['Categories'] = (
    book_data['Categories']
    .str.strip("[]")
    .str.replace("'", "", regex=False)
)

In [None]:
# Preview the first rows after renaming and cleaning
book_data.head()

In [None]:
# Example query: the title stored in the first row of the dataset
example_title = book_data['Title'].iat[0]

In [None]:
# Title-prefix search for the example book, asking for the ten closest matches
recommendations = get_recommendations_on_demand(
    example_title,
    search_by='title',
    n_recommendations=10,
)

In [None]:
# Display the recommendation table produced for the example title
recommendations

### The Evaluation Part ###

### Predict ratings for each user-book pair based on similarities ###

In [None]:
# Create a mapping from book titles to indices.
# Series.drop_duplicates() compares the *values* (the row labels), so it does not
# remove repeated titles and indices[title] could return a whole Series, which
# would then be used as a row selector into the similarity matrix.
# De-duplicate on the title index instead, keeping the first row of each title.
indices = pd.Series(book_data.index, index=book_data['Title'])
indices = indices[~indices.index.duplicated(keep='first')]


In [None]:
# Pairwise cosine similarity between all items; with a single argument
# cosine_similarity compares the matrix against itself.
# The result has one row and one column per document.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Function to get the predicted rating of a book for a user
def predict_rating(user_id, book_index, user_ratings, sim_matrix, exclude_target=False):
    """Predict a rating as the similarity-weighted mean of the user's own ratings.

    Args:
        user_id: Value looked up in ``user_ratings['User_id']``.
        book_index: Row selector into ``sim_matrix`` for the book to predict.
        user_ratings: DataFrame with ``User_id`` and ``Rating`` columns. Its
            index labels are used directly as positions into the similarity
            row -- assumes a default 0..n-1 integer index aligned with
            ``sim_matrix``; confirm against the caller.
        sim_matrix: Item-item similarity matrix; ``sim_matrix[book_index]``
            must yield a 1-D array of similarities.
        exclude_target: When True, the row labelled ``book_index`` is left out
            of the user's ratings, so the prediction cannot be built from the
            rating of the target row itself. Defaults to False, which keeps
            the previous behaviour.

    Returns:
        The weighted average ``sum(sim * rating) / sum(sim)`` over the user's
        usable ratings. Falls back to the mean of ``user_ratings['Rating']``
        when the user has no usable rating or the similarities sum to zero.
    """
    sim_scores = sim_matrix[book_index]

    rated = user_ratings.loc[user_ratings['User_id'] == user_id, 'Rating']
    # Labels beyond the similarity row cannot be looked up; drop them
    rated = rated[rated.index < len(sim_scores)]
    if exclude_target:
        rated = rated[rated.index != book_index]
    # A single missing rating would otherwise turn the whole weighted sum into NaN
    rated = rated.dropna()

    if rated.empty:
        return user_ratings['Rating'].mean()  # Return the global average if no ratings

    relevant_sim_scores = sim_scores[rated.index.to_numpy()]
    sum_of_sim_scores = np.sum(relevant_sim_scores)
    if sum_of_sim_scores == 0:
        return user_ratings['Rating'].mean()  # Return the global average if no similarities

    weighted_sum = np.dot(relevant_sim_scores, rated.to_numpy(dtype=float))
    return weighted_sum / sum_of_sim_scores

In [None]:
# Generate predictions for all user-item pairs.
# groupby hands over each user's rows in one pass (sort=False keeps users in
# first-appearance order, the same order unique() gives), instead of
# re-filtering the whole frame with a boolean mask for every user.
# NOTE(review): predict_rating is given the full frame, so the rating being
# predicted may itself be among the ratings used for the prediction -- confirm
# that this is intended for the evaluation.
user_ids = book_data['User_id'].unique()  # not used by the loop; kept because the original cell defined it
predictions = []
for user_id, user_ratings in book_data.groupby('User_id', sort=False):
    for title, actual_rating in zip(user_ratings['Title'], user_ratings['Rating']):
        if title not in indices:
            continue
        book_index = indices[title]
        predicted_rating = predict_rating(user_id, book_index, book_data, cosine_sim)
        predictions.append((user_id, title, predicted_rating, actual_rating))


In [None]:
# Turn the (user, title, predicted, actual) tuples into a table and preview ten rows
pred_df = pd.DataFrame(
    data=predictions,
    columns=['User_id', 'Title', 'Predicted_Rating', 'Actual_Rating'],
)
pred_top10 = pred_df.iloc[:10]
pred_top10

In [None]:
# Error metrics comparing the observed ratings with the predicted ones
actual_ratings = pred_df['Actual_Rating']
predicted_ratings = pred_df['Predicted_Rating']

# Root mean squared error
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
# Mean absolute error
mae = mean_absolute_error(actual_ratings, predicted_ratings)

print(f"RMSE, {rmse}")
print(f"MAE, {mae}")
