In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the books dataset; the path is relative to the notebook's working directory.
books = pd.read_csv('data2/archive/data.csv')

In [3]:
# Preview the first two rows of the loaded frame.
books.head(2)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0


data preprocessing

In [5]:
# Discard rows missing any of the three fields the recommender is built from.
books = books.dropna(subset=['title', 'categories', 'average_rating'])

In [6]:

import sqlite3

# Load user ratings from SQLite database
def load_user_ratings() -> pd.DataFrame:
    """Read every row of the ``ratings`` table from ``user_ratings.db``.

    Returns:
        A DataFrame with the columns ``user_id``, ``book_title`` and ``rating``.
        When the query cannot be executed (for example, the ``ratings`` table
        does not exist yet) an empty DataFrame with those columns is returned
        instead, so callers can simply test ``.empty``.
    """
    conn = sqlite3.connect('user_ratings.db')
    query = "SELECT user_id, book_title, rating FROM ratings"

    try:
        return pd.read_sql(query, conn)
    except (sqlite3.Error, pd.errors.DatabaseError):
        # Only database failures fall back to "no ratings yet". A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit and
        # hide genuine programming errors.
        return pd.DataFrame(columns=['user_id', 'book_title', 'rating'])
    finally:
        # Runs on every exit path, including unexpected exceptions.
        conn.close()

# Merge user ratings with book ratings
user_ratings = load_user_ratings()

# Drop the column left behind by a previous run of this cell; otherwise the
# merge below would produce user_avg_rating_x / user_avg_rating_y and the
# row-wise mean would fail with a KeyError.
books = books.drop(columns=['user_avg_rating'], errors='ignore')

if not user_ratings.empty:
    # One row per title: the mean of all user-submitted ratings for that title.
    merged_ratings = (
        user_ratings
        .groupby('book_title')['rating']
        .mean()
        .reset_index()
        .rename(columns={'book_title': 'title', 'rating': 'user_avg_rating'})
    )

    books = books.merge(merged_ratings, on='title', how='left')
    # mean(axis=1) skips NaN, so a book without user ratings keeps its
    # dataset average_rating as final_rating.
    books['final_rating'] = books[['average_rating', 'user_avg_rating']].mean(axis=1)
else:
    books['final_rating'] = books['average_rating']


In [7]:

# Recompute the pivot table with user-adjusted ratings
# Rows are titles, columns are categories, cells hold the mean final_rating
# (pivot_table's default aggregation); missing combinations become 0.
book_pivot = books.pivot_table(index='title', columns='categories', values='final_rating').fillna(0)
# Pairwise cosine similarity between the title rows of book_pivot.
cosine_sim = cosine_similarity(book_pivot)

# Save the updated model
# `with` closes (and therefore flushes) each file even if pickling fails;
# the previous `pickle.dump(obj, open(...))` form left the handles open.
# NOTE: the same two paths are written again by the final save cell of this
# notebook, from the pivot built on the ratings_count-filtered frame.
with open('Recmodel/cosine_sim.pkl', 'wb') as model_file:
    pickle.dump(cosine_sim, model_file)
with open('Recmodel/book_pivot.pkl', 'wb') as model_file:
    pickle.dump(book_pivot, model_file)


In [8]:
# Keep only books with at least 20 ratings and renumber the rows from 0.
books = books.loc[books['ratings_count'] >= 20].reset_index(drop=True)

creating a pivot table

In [10]:
# Title-by-category matrix: one row per book title, one column per category,
# each cell the mean average_rating for that pair, and 0 where a title does
# not appear in a category.
table_pivot = pd.pivot_table(
    books,
    values='average_rating',
    index='title',
    columns='categories',
    aggfunc='mean',
    fill_value=0,
)

cosine similarity

In [12]:
# Pairwise cosine similarity between the rows (titles) of table_pivot;
# entry [i, j] compares the i-th and j-th title rows.
cosine_sim = cosine_similarity(table_pivot)

In [13]:

# Load user ratings from the SQLite database
import sqlite3

# Path of the SQLite database holding user-submitted ratings, relative to the
# notebook's working directory.
USER_RATINGS_FILE = 'user_ratings.db'

def load_user_ratings() -> pd.DataFrame:
    """Read the ``book_title`` and ``rating`` columns of the ``ratings`` table.

    Note: this redefinition replaces the earlier ``load_user_ratings`` in this
    notebook, which also selected ``user_id``.

    Returns:
        A DataFrame with the columns ``book_title`` and ``rating``. When the
        query cannot be executed (for example, the ``ratings`` table does not
        exist yet) an empty DataFrame with those columns is returned instead.
    """
    conn = sqlite3.connect(USER_RATINGS_FILE)
    query = "SELECT book_title, rating FROM ratings"

    try:
        return pd.read_sql(query, conn)
    except (sqlite3.Error, pd.errors.DatabaseError):
        # Only database failures fall back to "no ratings yet". A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit and
        # hide genuine programming errors.
        return pd.DataFrame(columns=['book_title', 'rating'])
    finally:
        # Runs on every exit path, including unexpected exceptions.
        conn.close()

# Rebind user_ratings using the loader defined just above, which selects only
# book_title and rating, then preview the first rows.
user_ratings = load_user_ratings()
user_ratings.head()


Unnamed: 0,book_title,rating
0,Practical Demonkeeping,3


In [14]:

# Merge user ratings with the main dataset
# Collapse user ratings to one mean value per title first. Merging the raw
# rows would duplicate a book once for every extra rating of the same title,
# and those duplicates would end up in the pickled `books` frame.
user_rating_per_title = (
    user_ratings
    # An empty ratings frame has object dtype; force a numeric column so the
    # mean below and the later pivot always operate on floats.
    .assign(rating=pd.to_numeric(user_ratings['rating'], errors='coerce').astype('float64'))
    .groupby('book_title', as_index=False)['rating']
    .mean()
)

books = (
    books
    # Drop the column produced by a previous run of this cell; otherwise the
    # merge would create rating_x / rating_y and the fillna below would fail.
    .drop(columns=['rating'], errors='ignore')
    .merge(user_rating_per_title, left_on='title', right_on='book_title', how='left')
)
books['rating'] = books['rating'].fillna(books['average_rating'])  # Use existing ratings if no user rating
books = books.drop(columns=['book_title'])  # Remove duplicate column

books.head()


Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,user_avg_rating,final_rating,rating
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,,3.85,3.85
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,,3.83,3.83
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,,3.97,3.97
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,,3.93,3.93
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,,4.15,4.15


In [15]:

# Create pivot table with updated ratings
# Rows are titles, columns are categories, cells hold the mean `rating`
# (pivot_table's default aggregation); missing combinations are filled with 0.
table_pivot = books.pivot_table(index='title', columns='categories', values='rating', fill_value=0)

# Recalculate cosine similarity
cosine_sim = cosine_similarity(table_pivot)

# Save updated models
# `with` closes (and therefore flushes) each file even if pickling fails;
# the previous `pickle.dump(obj, open(...))` form left the handles open.
with open('Recmodel/book_pivot.pkl', 'wb') as model_file:
    pickle.dump(table_pivot, model_file)
with open('Recmodel/cosine_sim.pkl', 'wb') as model_file:
    pickle.dump(cosine_sim, model_file)
with open('Recmodel/books.pkl', 'wb') as model_file:
    pickle.dump(books, model_file)
