# Book Recommender System 
## Data Processing

In [78]:
import pandas as pd
import os
import numpy as np
import yaml
from scipy import sparse
from sklearn.neighbors import NearestNeighbors

## Read Data from Data folder

In [79]:
def load_config(config_path):
    """
    Load the configuration from a YAML file.

    The file is opened explicitly as UTF-8 so the parsed result does not
    depend on the platform's default text encoding.

    Args:
        config_path (str): Path to the YAML configuration file.

    Returns:
        dict: Configuration parameters as a dictionary. An empty YAML
            document (for which ``yaml.safe_load`` returns ``None``) yields
            an empty dict so the documented return type always holds.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    # safe_load gives None for an empty document; normalise it to a dict.
    return {} if config is None else config
def _read_csv_skipping_bad_lines(csv_path, encoding, sep):
    """Read one CSV file, skipping malformed rows on both new and old pandas."""
    try:
        # pandas >= 1.3.0 spells the option `on_bad_lines`.
        return pd.read_csv(csv_path, encoding=encoding, sep=sep, on_bad_lines='skip')
    except TypeError:
        # Older pandas rejects the unknown keyword with TypeError; fall back
        # to the legacy flag (deprecated in newer pandas).
        return pd.read_csv(csv_path, encoding=encoding, sep=sep, error_bad_lines=False)


def read_data(file_path, config):
    """
    Load every file listed in ``config['filename_dictionary']`` from a folder.

    ``.csv`` files are read with ``pd.read_csv`` (malformed rows skipped) and
    ``.xlsx`` files with ``pd.read_excel``. The extension check is
    case-insensitive, so ``ratings.CSV`` is handled like ``ratings.csv``.
    Entries with any other extension are reported and left out of the result.

    Args:
        file_path (str): Directory that contains the data files.
        config (dict): Must provide ``filename_dictionary`` (key -> file
            name). For CSV files ``encoding`` and ``seperator`` (spelled this
            way in the config) are also read.

    Returns:
        dict | None: Mapping from each key of ``filename_dictionary`` to the
            loaded DataFrame. ``None`` if any error occurs while loading;
            the error is printed rather than raised.
    """
    try:
        data_dict = {}
        for key, value in config['filename_dictionary'].items():
            extension_source = value.lower()
            full_path = os.path.join(file_path, value)
            if extension_source.endswith('.csv'):
                data_dict[key] = _read_csv_skipping_bad_lines(
                    full_path,
                    config['encoding'],
                    str(config['seperator']),
                )
            elif extension_source.endswith('.xlsx'):
                data_dict[key] = pd.read_excel(full_path)
            else:
                print(f"Unsupported file format for {value}")
        return data_dict
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading file {file_path}: {e}")
        return None

In [80]:
# Both paths are resolved relative to the notebook's working directory:
# ../config/config.yaml for settings and ../Data for the input tables.
config = load_config(os.path.join(os.getcwd(), "..", "config", 'config.yaml'))
data_dict = read_data(os.path.join(os.getcwd(), "..", "Data"), config)

# read_data prints the error and returns None when loading fails. Stop here
# with a clear message instead of letting a later cell fail on `None`.
if data_dict is None:
    raise RuntimeError("Input data could not be loaded; see the error printed above.")

  data_dict[key] = pd.read_csv(


## Renaming dictionary

In [81]:
# Per-table mapping from the source column names to the snake_case names used
# in the rest of the notebook. Note that the large image URL is mapped to the
# plain name `image_url`, which is the column the books cell keeps.
rename_dictionary = {
    "ratings": {
        "User-ID": "user_id",
        "ISBN": "isbn",
        "Book-Rating": "rating"
    },
    "users": {
        "User-ID": "user_id",
        "Location": "location",
        "Age": "age"
    },
    "books": {
        "ISBN": "isbn",
        "Book-Title": "title",
        "Book-Author": "author",
        "Year-Of-Publication": "year_of_publication",
        "Publisher": "publisher",
        "Image-URL-S": "image_url_s",
        "Image-URL-M": "image_url_m",
        "Image-URL-L": "image_url"
    }
}

for key, frame in data_dict.items():
    if isinstance(frame, pd.DataFrame):
        print(f"DataFrame for {key} has shape: {frame.shape}")
        # .get(key, {}) leaves a table without a mapping unchanged instead of
        # raising KeyError. Reassigning (rather than inplace=True) only
        # replaces the value of an existing key, which is safe while iterating.
        data_dict[key] = frame.rename(columns=rename_dictionary.get(key, {}))
    else:
        print(f"{key} is not a DataFrame, it is of type {type(frame)}")

DataFrame for ratings has shape: (1149780, 3)
DataFrame for users has shape: (278858, 3)
DataFrame for books has shape: (271360, 8)


### EDA and Data processing

In [82]:
# A user ID must not appear more than once in the users table, so keep only
# the first row for each user_id. drop_duplicates returns a new frame, so
# data_dict['users'] itself is left untouched.
users = data_dict['users'].drop_duplicates(subset='user_id')
users

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [83]:
# A user may rate many books, but each (user_id, isbn) pair is kept only once.
ratings = data_dict['ratings'].drop_duplicates(subset=['user_id', 'isbn'])

# user_ids that have at least config['min_books_to_read'] rating rows.
active_users = (
    ratings['user_id']
    .value_counts()
    .loc[lambda counts: counts >= config['min_books_to_read']]
    .index
)

# Keep only the ratings that belong to those users.
ratings = ratings.loc[ratings['user_id'].isin(active_users)]

In [84]:
# number_of_ratings_per_book = ratings.groupby('isbn')["user_id"].nunique().reset_index()
# number_of_ratings_per_book.rename(columns={"user_id": "number_of_ratings"}, inplace=True)
# number_of_ratings_per_book = number_of_ratings_per_book[number_of_ratings_per_book['number_of_ratings'] >= config['min_number_of_ratings']].reset_index(drop=True)
# number_of_ratings_per_book

In [85]:
# Build the book catalogue in one pass:
#   1. keep the first row for every isbn,
#   2. then keep the first row for every title,
#   3. keep only the descriptive columns plus `image_url` (the large image;
#      the small and medium image URL columns are dropped).
books = (
    data_dict['books']
    .drop_duplicates(subset=['isbn'])
    .drop_duplicates(subset=['title'])
    .loc[:, ['isbn', 'title', 'author', 'year_of_publication', 'publisher', 'image_url']]
    .copy()
)

books

Unnamed: 0,isbn,title,author,year_of_publication,publisher,image_url
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...
271354,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,http://images.amazon.com/images/P/0449906736.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...


In [86]:
# Attach book metadata to every rating and keep only sufficiently rated books.
# Steps, in order:
#   - left-join the catalogue on isbn (ratings without a catalogue match get
#     null metadata),
#   - count the rating rows per isbn,
#   - keep books with at least config['min_number_of_ratings'] rows,
#   - keep one row per (user_id, title),
#   - drop rows whose book metadata is missing.
# (Merging `users` on 'user_id' is currently disabled.)
final_ratings = (
    ratings
    .merge(books, on='isbn', how='left')
    .assign(
        number_of_ratings=lambda merged: merged.groupby('isbn')['rating'].transform('count')
    )
    .loc[lambda merged: merged['number_of_ratings'] >= config['min_number_of_ratings']]
    .reset_index(drop=True)
    .drop_duplicates(subset=['user_id', 'title'])
    .dropna(subset=['isbn', 'title', 'author', 'year_of_publication', 'publisher', 'image_url'])
)
final_ratings.head()

Unnamed: 0,user_id,isbn,rating,title,author,year_of_publication,publisher,image_url,number_of_ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,80
1,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,133
2,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,108
3,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,108
4,277427,0140067477,0,The Tao of Pooh,Benjamin Hoff,1983,Penguin Books,http://images.amazon.com/images/P/0140067477.0...,68


## Pivot Table

In [93]:
def save_df_as_csv(df, filename, folder="Data"):
    """
    Save a DataFrame as a CSV file in the specified folder.

    The folder is resolved relative to the parent of the current working
    directory (``<cwd>/../<folder>``) and is created if it does not exist.
    The DataFrame index is not written. The destination path is printed.

    Args:
        df (pd.DataFrame): DataFrame to save.
        filename (str): Name of the CSV file.
        folder (str): Folder path to save the file (default: "Data").
    """
    save_path = os.path.join(os.getcwd(), "..", folder)
    # exist_ok=True creates the folder when missing and is a no-op otherwise,
    # avoiding the race between a separate existence check and the creation.
    os.makedirs(save_path, exist_ok=True)
    file_path = os.path.join(save_path, filename)
    df.to_csv(file_path, index=False)
    print(f"DataFrame saved to {file_path}")


In [94]:
# Book x user rating table: one row per (isbn, title), one column per user_id.
# reset_index turns isbn and title back into ordinary columns.
user_ratings_pivot = final_ratings.pivot(
    index=['isbn', 'title'],
    columns='user_id',
    values='rating',
).reset_index()

# Persist the pivot table as a CSV file.
save_df_as_csv(user_ratings_pivot, "user_ratings_pivot.csv")

# Titles in the same row order as the pivot table.
book_list = user_ratings_pivot['title'].to_list()

# Numeric part of the pivot as a CSR matrix; missing ratings are stored as 0.
user_rating_sparse = sparse.csr_matrix(
    user_ratings_pivot.drop(columns=['isbn', 'title']).fillna(0).to_numpy()
)
user_rating_sparse

DataFrame saved to /Users/satwikboina/Documents/BookRecommenderSystem/Notebooks/../Data/user_ratings_pivot.csv


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8748 stored elements and shape (439, 887)>

## Model Building

In [89]:
# Nearest-neighbour index over the rows of user_rating_sparse (one row per
# book), compared with cosine distance. fit() returns the estimator itself,
# so construction and fitting are chained.
model = NearestNeighbors(
    n_neighbors=config['neighbors_size'],
    metric='cosine',
    algorithm='auto',
).fit(user_rating_sparse)

def get_recommendations(title, model, user_ratings_pivot, n_recommendations=None):
    """
    Get book recommendations based on a given title.

    The query vector is built from the matching row of ``user_ratings_pivot``
    (missing ratings filled with 0), so the function no longer depends on a
    module-level rating matrix. This assumes ``model`` was fitted on the
    matrix derived from the same pivot table, with rows in the same order.

    Args:
        title (str): Title of the book to base recommendations on. If several
            rows share the title, the first one is used.
        model: A fitted nearest-neighbours model exposing
            ``kneighbors(X, n_neighbors=...)``.
        user_ratings_pivot (DataFrame): Pivoted ratings with a ``title``
            column (and optionally ``isbn``); every other column is treated
            as one user's ratings.
        n_recommendations (int, optional): Number of recommendations to
            return. Defaults to ``config['neighbors_size']``, looked up when
            the function is called.

    Returns:
        list: Up to ``n_recommendations`` recommended titles, nearest first.
            Empty if the title is unknown or ``n_recommendations`` is not
            positive.
    """
    if n_recommendations is None:
        # Resolve the default at call time so a reloaded config is honoured.
        n_recommendations = config['neighbors_size']
    if n_recommendations <= 0:
        return []

    title_matches = (user_ratings_pivot['title'] == title).to_numpy()
    if not title_matches.any():
        return []

    # Positional row of the first match. The index *label* is not used because
    # both the fitted matrix and `.iloc` below are addressed by position.
    book_position = int(np.flatnonzero(title_matches)[0])

    query_vector = sparse.csr_matrix(
        user_ratings_pivot.iloc[[book_position]]
        .drop(columns=['isbn', 'title'], errors='ignore')
        .fillna(0)
        .to_numpy(dtype=float)
    )

    # Ask for one extra neighbour because the queried book is normally its own
    # nearest neighbour, but never ask for more rows than the table has.
    n_neighbors = min(n_recommendations + 1, len(user_ratings_pivot))
    _, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)

    # Remove the queried book wherever it appears: with tied distances it is
    # not guaranteed to be the first neighbour returned.
    neighbor_positions = [
        int(position) for position in indices.flatten()
        if int(position) != book_position
    ]

    titles = user_ratings_pivot['title']
    return [titles.iloc[position] for position in neighbor_positions[:n_recommendations]]


In [95]:
# Spot-check the recommender: print the five nearest titles for each query.
for book in ("Harry Potter and the Chamber of Secrets (Book 2)",):
    recommendations = get_recommendations(
        book,
        model,
        user_ratings_pivot,
        n_recommendations=5,
    )
    print(f"Recommendations for '{book}': {recommendations}")

Recommendations for 'Harry Potter and the Chamber of Secrets (Book 2)': ["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))", 'F Is for Fugitive (Kinsey Millhone Mysteries (Paperback))', 'Portrait in Death', 'Night Whispers', "Charlotte's Web (Trophy Newbery)"]
