In [13]:
import pandas as pd
import numpy as np

In [15]:
df = pd.read_csv('cleaned_book_store_data.csv', delimiter=',', encoding='ISO-8859-1')


In [16]:
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M,User-ID,Book-Rating,Country
0,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5,canada
1,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,8,canada
2,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,9,usa
3,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,9,canada
4,2005018,Clara Callan,richard bruce wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,200273,8,canada


In [11]:
df.shape

(376904, 9)

# Rating Base Filtering

In [42]:
#Grouping the dataset by the columns Book-Title, Book-Author,Year-Of-Publication,Publisher and Image-URL-M.
average_ratings= df.groupby(['Book-Title', 'Book-Author','Year-Of-Publication','Publisher','Image-URL-M'])['Book-Rating'].mean().reset_index()

In [43]:
#sorts in descending order of ratings
top_rated_books = average_ratings.sort_values(by='Book-Rating' , ascending=False)

In [46]:
#convert book-rating column to an integer format
top_rated_books['Book-Rating'] = top_rated_books['Book-Rating'].astype(int)

In [48]:
top_rated_books = top_rated_books.head(10)

In [50]:
print("Rating base Recommendation System: (Trending books)")
top_rated_books

Rating base Recommendation System: (Trending books)


Unnamed: 0,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M,Book-Rating
74189,"Mister Parker Pyne, Detective",agatha christie,1981,Bantam Books (Mm),http://images.amazon.com/images/P/0440158885.0...,10
84697,Pei pan,ramlee awang murshid,2002,Utusan Publications & Distributors,http://images.amazon.com/images/P/9676112984.0...,10
20460,Challenging Word Games,mayme allen,1997,Sterling Publishing,http://images.amazon.com/images/P/0806998547.0...,10
84642,Pearl and Bead Stringing With Henrietta,henrietta virchick,1990,"Linecraft, Incorporated",http://images.amazon.com/images/P/0962713708.0...,10
84645,Pearl's Secret: A Black Man's Search for His W...,neil henry,2001,University of California Press,http://images.amazon.com/images/P/0520222571.0...,10
20452,Challenge of the Clans,kenneth c. flint,1986,Spectra Books,http://images.amazon.com/images/P/0553255533.0...,10
84665,Peck's Bad Boy and His Pa,george w. peck,1990,Buccaneer Books Inc,http://images.amazon.com/images/P/0899667503.0...,10
84667,Peculiar Chris,johann s lee,1992,Cannon International,http://images.amazon.com/images/P/9810035578.0...,10
20444,Chalkdust: Prayer Meditations for a Teacher,elspeth c. murphy,1978,Baker Book House,http://images.amazon.com/images/P/0801060656.0...,10
20441,Chakras For Beginners,naomi ozaniec,1999,Headway Books,http://images.amazon.com/images/P/0340742445.0...,10


# Content Base Filtering

### suggest books to users based on the features and the user's preferences

#### Tf-IDF Vectorizer -> transform text data into numerical data
#### Tf -> how frequently a word appears in a doc
#### IDF -> meatures the importance of a word

#### cosine similarity -> meature how similar to vectors are regardless of its magnitude

In [52]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

In [53]:

# Function to recommend books by the same author, processing in batches
def recommend_books_by_author_batch(df, book_name, top_n=5, batch_size=10000):
    if book_name not in df['Book-Title'].values:
        print(f"Item '{book_name}' not found in the system.")
        return pd.DataFrame()
    
    # Get the author of the input book
    target_author = df[df['Book-Title'] == book_name]['Book-Author'].values[0]
    
    # Split the dataframe into batches
    num_batches = (len(df) // batch_size) + 1
    all_similar_books = []

    for batch_num in range(num_batches):
        print(f"Processing batch {batch_num + 1} of {num_batches}...")

        # Slice the dataframe for the current batch
        batch_df = df.iloc[batch_num * batch_size:(batch_num + 1) * batch_size]

        # Apply TF-IDF vectorizer on the 'Book-Author' column in the current batch
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix_content = tfidf_vectorizer.fit_transform(batch_df['Book-Author'])

        # Calculate cosine similarity matrix for the current batch
        cosine_similarities_content = cosine_similarity(tfidf_matrix_content, tfidf_matrix_content)

        # If the book is in the current batch, calculate similarity
        if book_name in batch_df['Book-Title'].values:
            book_index = batch_df[batch_df['Book-Title'] == book_name].index[0] - (batch_num * batch_size)

            # Get similarities for the book within the current batch
            similar_books = list(enumerate(cosine_similarities_content[book_index]))
            similar_books = sorted(similar_books, key=lambda x: x[1], reverse=True)
            top_similar_books = similar_books[1:top_n+1]

            # Convert the batch indices back to global indices
            top_similar_books = [(i + batch_num * batch_size, score) for i, score in top_similar_books]

            # Append similar books to the final list
            all_similar_books.extend(top_similar_books)

    # Sort all similar books across all batches
    all_similar_books = sorted(all_similar_books, key=lambda x: x[1], reverse=True)

    # Filter out books that are not written by the same author
    recommend_items_indices = [x[0] for x in all_similar_books if df.iloc[x[0]]['Book-Author'] == target_author]

    # If there are no other books by the same author, return a message
    if len(recommend_items_indices) == 0:
        print(f"No other books found by author {target_author}.")
        return pd.DataFrame()

    # Get the recommended book details
    recommend_book_details = df.iloc[recommend_items_indices][['Book-Title', 'Book-Author', 'Image-URL-M']].head(top_n)

    return recommend_book_details

In [56]:

book_name = 'Open Heart (Harvest Book)'
content_based_rec = recommend_books_by_author_batch(df, book_name, top_n=5, batch_size=10000)
content_based_rec


Processing batch 1 of 38...
Processing batch 2 of 38...
Processing batch 3 of 38...
Processing batch 4 of 38...
Processing batch 5 of 38...
Processing batch 6 of 38...
Processing batch 7 of 38...
Processing batch 8 of 38...
Processing batch 9 of 38...
Processing batch 10 of 38...
Processing batch 11 of 38...
Processing batch 12 of 38...
Processing batch 13 of 38...
Processing batch 14 of 38...
Processing batch 15 of 38...
Processing batch 16 of 38...
Processing batch 17 of 38...
Processing batch 18 of 38...
Processing batch 19 of 38...
Processing batch 20 of 38...
Processing batch 21 of 38...
Processing batch 22 of 38...
Processing batch 23 of 38...
Processing batch 24 of 38...
Processing batch 25 of 38...
Processing batch 26 of 38...
Processing batch 27 of 38...
Processing batch 28 of 38...
Processing batch 29 of 38...
Processing batch 30 of 38...
Processing batch 31 of 38...
Processing batch 32 of 38...
Processing batch 33 of 38...
Processing batch 34 of 38...
Processing batch 35 of 

Unnamed: 0,Book-Title,Book-Author,Image-URL-M
263322,A Journey to the End of the Millennium: A Novel,a. b. yehoshua,http://images.amazon.com/images/P/0385488823.0...


# Collaborative Filtering - Sayuni

## Recommend books to a user based on the preferences of other users who have rated similar books.

In [58]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and test sets (e.g., 80% training, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)



In [60]:
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
def collaborative_recommendation(df, user_id, top_n=5, batch_size=5000):
    from sklearn.metrics.pairwise import cosine_similarity
    
    # Get unique user IDs and batch them
    user_ids = df['User-ID'].unique()
    num_batches = int(np.ceil(len(user_ids) / batch_size))
    user_id_batches = np.array_split(user_ids, num_batches)
    
    all_recommendations = []

    # Process each batch separately
    for batch_num, user_batch in enumerate(user_id_batches):
        print(f"Processing batch {batch_num + 1} of {num_batches}...")

        # Filter the dataframe to only include the users in this batch
        batch_train_df = df[df['User-ID'].isin(user_batch)]
        
        # Create the user-book matrix for the current batch
        user_book_matrix = batch_train_df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', aggfunc='mean').fillna(0).astype(int)
        
        # Calculate user similarity using cosine similarity
        user_similarity = cosine_similarity(user_book_matrix)
        
        # Check if the target user is in the current batch
        if user_id in user_book_matrix.index:
            target_user_index = user_book_matrix.index.get_loc(user_id)
            
            # Get similarities for the target user
            user_similarities = user_similarity[target_user_index]
            similar_user_indices = user_similarities.argsort()[::-1][1:]
            
            recommended_books = []

            # Recommend books based on the top similar users
            for user_index in similar_user_indices:
                rated_by_similar_user = user_book_matrix.iloc[user_index]
                not_rated_by_target_user = (rated_by_similar_user != 0) & (user_book_matrix.iloc[target_user_index] == 0)
                
                # Get the top recommended books from similar users
                recommended_books.extend(user_book_matrix.columns[not_rated_by_target_user][:top_n])
            
            # Append recommendations to the final list
            all_recommendations.extend(recommended_books)
    
    # Get detailed information about the recommended books
    recommended_books_details = df[df['ISBN'].isin(all_recommendations)][['Book-Title', 'Book-Author', 'Image-URL-M']].drop_duplicates()
    
    return recommended_books_details.head(top_n)


In [64]:
# Call the collaborative recommendation function
user_id = 67544  # user_id
top_n = 5        # Number of top recommendations to retrieve
batch_size = 5000  # Size of batches for processing

# Get the collaborative filtering recommendations for the user
collaborative_rec_output = collaborative_recommendation(df, user_id, top_n=top_n, batch_size=batch_size)

# Print the recommended books
print(f"Top {top_n} collaborative filtering recommendations for User {user_id}:")
collaborative_rec_output


Processing batch 1 of 14...
Processing batch 2 of 14...
Processing batch 3 of 14...
Processing batch 4 of 14...
Processing batch 5 of 14...
Processing batch 6 of 14...
Processing batch 7 of 14...
Processing batch 8 of 14...
Processing batch 9 of 14...
Processing batch 10 of 14...
Processing batch 11 of 14...
Processing batch 12 of 14...
Processing batch 13 of 14...
Processing batch 14 of 14...
Top 5 collaborative filtering recommendations for User 67544:


Unnamed: 0,Book-Title,Book-Author,Image-URL-M
8,Decision in Normandy,carlo d'este,http://images.amazon.com/images/P/0060973129.0...
10,Flu: The Story of the Great Influenza Pandemic...,gina bari kolata,http://images.amazon.com/images/P/0374157065.0...
16,The Kitchen God's Wife,amy tan,http://images.amazon.com/images/P/0399135782.0...
33,Where You'll Find Me: And Other Stories,ann beattie,http://images.amazon.com/images/P/074322678X.0...
34,Nights Below Station Street,david adams richards,http://images.amazon.com/images/P/0771074670.0...


# Hybrid Filtering

In [71]:
# Function to generate hybrid recommendations
def hybrid_recommendation(df, user_id, book_name, top_n=5, content_weight=0.5, collab_weight=0.5, batch_size=5000):
    # 1. Get content-based recommendations (by author in this case)
    content_rec = recommend_books_by_author_batch(df, book_name, top_n=top_n, batch_size=batch_size)
    
    # 2. Get collaborative filtering recommendations
    # We assume that you already have the collaborative filtering recommendations for the user.
    collab_rec = collaborative_recommendation(df, user_id, top_n=top_n, batch_size=batch_size)
    
    # 3. Merge both recommendations (if same book appears, combine scores)
    combined_scores = {}
    
    # Add content-based scores to combined dictionary
    for index, row in content_rec.iterrows():
        combined_scores[row['Book-Title']] = content_weight
    
    # Add collaborative filtering scores to the combined dictionary
    for index, row in collab_rec.iterrows():
        if row['Book-Title'] in combined_scores:
            combined_scores[row['Book-Title']] += collab_weight  # Combine scores if the book appears in both methods
        else:
            combined_scores[row['Book-Title']] = collab_weight
    
    # Sort the combined scores
    sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    
    # 4. Get book details for the top recommendations
    top_recommendations = [item[0] for item in sorted_combined_scores[:top_n]]
    recommend_book_details = df[df['Book-Title'].isin(top_recommendations)][['Book-Title', 'Book-Author', 'Image-URL-M']]
    
    return recommend_book_details

# Example usage
user_id = 67544
book_name = 'Open Heart (Harvest Book)'
hybrid_rec = hybrid_recommendation(df, user_id, book_name, top_n=5, content_weight=0.6, collab_weight=0.4, batch_size=5000)
hybrid_rec


Processing batch 1 of 76...
Processing batch 2 of 76...
Processing batch 3 of 76...
Processing batch 4 of 76...
Processing batch 5 of 76...
Processing batch 6 of 76...
Processing batch 7 of 76...
Processing batch 8 of 76...
Processing batch 9 of 76...
Processing batch 10 of 76...
Processing batch 11 of 76...
Processing batch 12 of 76...
Processing batch 13 of 76...
Processing batch 14 of 76...
Processing batch 15 of 76...
Processing batch 16 of 76...
Processing batch 17 of 76...
Processing batch 18 of 76...
Processing batch 19 of 76...
Processing batch 20 of 76...
Processing batch 21 of 76...
Processing batch 22 of 76...
Processing batch 23 of 76...
Processing batch 24 of 76...
Processing batch 25 of 76...
Processing batch 26 of 76...
Processing batch 27 of 76...
Processing batch 28 of 76...
Processing batch 29 of 76...
Processing batch 30 of 76...
Processing batch 31 of 76...
Processing batch 32 of 76...
Processing batch 33 of 76...
Processing batch 34 of 76...
Processing batch 35 of 

Unnamed: 0,Book-Title,Book-Author,Image-URL-M
8,Decision in Normandy,carlo d'este,http://images.amazon.com/images/P/0060973129.0...
9,Decision in Normandy,carlo d'este,http://images.amazon.com/images/P/0060973129.0...
10,Flu: The Story of the Great Influenza Pandemic...,gina bari kolata,http://images.amazon.com/images/P/0374157065.0...
11,Flu: The Story of the Great Influenza Pandemic...,gina bari kolata,http://images.amazon.com/images/P/0374157065.0...
12,Flu: The Story of the Great Influenza Pandemic...,gina bari kolata,http://images.amazon.com/images/P/0374157065.0...
...,...,...,...
28765,The Kitchen God's Wife,amy tan,http://images.amazon.com/images/P/080410753X.0...
28766,The Kitchen God's Wife,amy tan,http://images.amazon.com/images/P/080410753X.0...
28767,The Kitchen God's Wife,amy tan,http://images.amazon.com/images/P/080410753X.0...
173694,The Kitchen God's Wife,amy tan,http://images.amazon.com/images/P/080410753X.0...
