In [None]:
import pandas as pd
import re
import numpy as np
import requests
import html
import datetime
import json
import time
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from collections.abc import Iterable

In [None]:
import os
from scipy.sparse import save_npz, load_npz, hstack, vstack
from sklearn.feature_extraction import FeatureHasher
from sklearn.neighbors import NearestNeighbors

In [None]:
# WordNet is needed by WordNetLemmatizer; the 'stopwords' corpus is needed by
# stopwords.words('english'), which this notebook calls when cleaning subjects.
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# The three raw files are ';'-separated and latin-1 encoded; malformed rows are skipped.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
books_df = pd.read_csv('books.csv', sep=";", on_bad_lines='skip', encoding="latin-1")
ratings_df = pd.read_csv('ratings.csv', sep=";", on_bad_lines='skip', encoding="latin-1")
users_df = pd.read_csv('users.csv', sep=";", on_bad_lines='skip', encoding="latin-1")

In [None]:
print(books_df.shape)
print(ratings_df.shape)
print(users_df.shape)

# Exploration books data set

In [None]:
books_df.info()

In [None]:
books_df.head(10)

In [None]:
# Unnecessary columns dropped
books_df = books_df.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])
books_df

In [None]:
# Function for check if ISBN are valid, defined with regex isbn10 and isbn13 and created a function to detect suspicious ISBNs
def is_valid_isbn(isbn):
    """Return True when ``isbn`` has the shape of an ISBN-10 or ISBN-13.

    Hyphens and whitespace are ignored. Only the format is checked
    (10 characters: nine digits plus a digit or X/x; or 13 digits) --
    the check digit itself is not verified.

    Parameters:
    - isbn: The value to test. Anything that is not a string (e.g. NaN or
      None coming from a missing CSV cell) is reported as invalid instead of
      raising.

    Returns:
    bool
    """
    if not isinstance(isbn, str):
        return False

    compact = re.sub(r'[-\s]', '', isbn)
    looks_like_isbn10 = re.match(r'^\d{9}[\dXx]$', compact) is not None
    looks_like_isbn13 = re.match(r'^\d{13}$', compact) is not None
    return looks_like_isbn10 or looks_like_isbn13

invalid_isbn_books = books_df[~books_df['ISBN'].apply(is_valid_isbn)]
invalid_isbn_books 

In [None]:
books_df = books_df[books_df['ISBN'].apply(is_valid_isbn)].reset_index(drop=True)
books_df

In [None]:
books_df['Year-Of-Publication'].unique()

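Some publication years are suspicious: they are either 0 or later than the current year. These values are replaced with missing values (NA).\
Some rows hold a publisher name (e.g. "DK Publishing Inc", "Gallimard") in the year column; these rows are misaligned, and their values need to be shifted so that the publisher name ends up in the Publisher column.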

In [None]:
invalid_year_rows = books_df[~books_df['Year-Of-Publication'].astype(str).str.isnumeric()]
invalid_year_rows

In [None]:
# Repair rows whose 'Year-Of-Publication' cell is not numeric: their values sit one
# column too far to the left, so each value is moved one column to the right.
# The order of the three assignments matters -- every cell is read before it is overwritten.
indxtoshift = invalid_year_rows.index
for idx in indxtoshift:
    # the text found in the year column becomes the publisher
    books_df.at[idx, 'Publisher'] = books_df.at[idx, 'Year-Of-Publication']
    # the value found in the author column becomes the year
    books_df.at[idx, 'Year-Of-Publication'] = books_df.at[idx, 'Book-Author']
    # the author is cleared (NOTE(review): presumably it is fused into the title -- confirm)
    books_df.at[idx, 'Book-Author'] = None 
# Show the repaired rows
books_df.loc[indxtoshift]

In [None]:
# The column mixes strings and integers: parse everything as a number (unparsable
# entries become missing) and store the result as nullable integers.
year_numeric = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
books_df['Year-Of-Publication'] = year_numeric.astype(pd.Int64Dtype())

# Blank out placeholder zeros first, then any year later than the current one.
current_year = int(datetime.date.today().strftime('%Y'))
is_zero_year = books_df['Year-Of-Publication'] == 0
books_df.loc[is_zero_year, 'Year-Of-Publication'] = pd.NA
is_future_year = books_df['Year-Of-Publication'] > current_year
books_df.loc[is_future_year, 'Year-Of-Publication'] = pd.NA

In [None]:
books_df['author_name_length'] = books_df['Book-Author'].astype(str).apply(len)
sorted_books_df = books_df.sort_values(by='author_name_length', ascending=False)
sorted_books_df[['Book-Author', 'author_name_length']].head()

In [None]:
books_df.loc[219783][1]

The entries of the Book-Author, Book-Title and Publisher columns need to be cleaned of extra whitespace, HTML entities, etc. This will be done later, after the datasets have been merged.

In [None]:
# Deleted unnecessary column created in the above cell
books_df = books_df.drop(columns=['author_name_length'])

# Exploration ratings data set

In [None]:
ratings_df.info()

In [None]:
ratings_df[~ratings_df['ISBN'].apply(is_valid_isbn)]

Checking the ISBNs in the ratings dataframe shows 10159 rows with an invalid ISBN; these rows are dropped.

In [None]:
ratings_df = ratings_df[ratings_df['ISBN'].apply(is_valid_isbn)].reset_index(drop=True)

In [None]:
ratings_df["Book-Rating"].unique()

# Exploration users dataset

In [None]:
users_df.info()

In [None]:
users_df.head()

In [None]:
users_df['Age'].describe()

The maximum and minimum of the values (0 and 244) in the age column indicate that the age column contains incorrect values. I decided to keep only the age range from 6 years to 99 and replaced the other values with nan.

In [None]:
type(users_df['Age'][0])

In [None]:
users_df['Age'] = users_df['Age'].astype(pd.Int64Dtype())

In [None]:
users_df['Age'].unique()

In [None]:
users_df.loc[(users_df['Age'] < 6) | (users_df['Age'] > 99), 'Age'] = np.nan
users_df['Age'] = users_df['Age'].astype(pd.Int64Dtype())

In [None]:
users_df[users_df['Age'].isnull()]

Apparently ~40% do not seem to have age information filled in

## Merge data frames and clean data

In [None]:
# Function to check if there are white spaces, HTML entits and HTML tags or double spaces
def analyze_text_issues(text):
    """Flag four kinds of formatting noise in a string.

    Parameters:
    - text: The string to inspect.

    Returns:
    A 4-tuple of booleans, in this order:
    (contains two consecutive spaces, contains something shaped like an HTML tag,
     contains an HTML entity that ``html.unescape`` would change,
     has leading or trailing whitespace).
    """
    has_double_space = re.search(r'  ', text) is not None
    has_html_tag = re.search(r'<.*?>', text) is not None
    has_html_entity = html.unescape(text) != text
    has_outer_whitespace = text.strip() != text

    return (has_double_space, has_html_tag, has_html_entity, has_outer_whitespace)

# Function which use function analyze_text_issues in columns to display a summary of the issue in columns
def summarize_column_issues(df, columns_to_inspect):
    """Count text-formatting issues per column.

    Parameters:
    - df: DataFrame holding the columns to inspect.
    - columns_to_inspect: Iterable of column names.

    Returns:
    A dict mapping each column name to a dict with the keys 'double_spaces',
    'html_tags', 'html_entities' and 'extra_whitespace'; each value is the
    number of non-missing cells for which analyze_text_issues raised that flag.
    """
    issue_names = ('double_spaces', 'html_tags', 'html_entities', 'extra_whitespace')
    summary = {}
    for column in columns_to_inspect:
        # Drop missing cells *before* casting to str: casting first would turn
        # NaN/None into the literal strings 'nan'/'None', which dropna() no longer removes.
        texts = df[column].dropna().astype(str)
        flags = [analyze_text_issues(text) for text in texts]
        summary[column] = {
            name: sum(1 for flag in flags if flag[position])
            for position, name in enumerate(issue_names)
        }
    return summary

In [None]:
# Function to clean data from white spaces, HTML entits and HTML tags or double spaces
def clean_text_data(text):
    """Normalise a string until it stops changing.

    Each pass removes HTML-tag-shaped fragments, replaces double spaces with a
    single space, decodes HTML entities and strips surrounding whitespace. The
    passes repeat because one step can expose work for another (for example a
    doubly-escaped entity such as ``&amp;amp;``).

    Parameters:
    - text: The string to clean.

    Returns:
    The cleaned string.
    """
    while True:
        cleaned = re.sub(r'<.*?>', '', text)
        cleaned = cleaned.replace('  ', ' ')
        cleaned = html.unescape(cleaned).strip()
        if cleaned == text:
            # Fixed point reached: another pass would change nothing.
            return cleaned
        text = cleaned

In [None]:
# Merge user and ratings info to get one dataframe with user-ratings info.
user_ratings_df = pd.merge(ratings_df, users_df, on='User-ID', how='inner')

In [None]:
# Collapse user_ratings_df to one row per ISBN: the 'User-ID', 'Book-Rating' and 'Age'
# values of all rows sharing an ISBN are collected into one list per column.
# The result is indexed by ISBN.
isbn_user_ratings = user_ratings_df.groupby('ISBN').agg({
    'User-ID': list,
    'Book-Rating': list,
    'Age': list 
})
# Left merge keeps every book; books whose ISBN has no row in isbn_user_ratings
# get missing values in the three list columns.
merged_df = pd.merge(books_df, isbn_user_ratings, on='ISBN', how='left')

In [None]:
merged_df.head()

In [None]:
columns_to_check = ['Book-Title', 'Book-Author', 'Publisher']
summary = summarize_column_issues(merged_df, columns_to_check)
summary

In [None]:
# Apply clean_text_data to 'Book-Title', 'Book-Author' and 'Publisher'.
# Missing cells are left missing: casting the whole column with astype(str) would turn
# them into the literal strings 'nan'/'None', so the null count in the next cell would
# report no missing values for these columns.
for column in ['Book-Title', 'Book-Author', 'Publisher']:
    merged_df[column] = merged_df[column].apply(
        lambda value: value if pd.isna(value) else clean_text_data(str(value))
    )
merged_df[['Book-Title', 'Book-Author', 'Publisher']].head()

In [None]:
merged_df.isnull().sum()

Several columns have empty values, for example Year-Of-Publication, Publisher can be checked to get data from the API.

## Getting data from API

In [None]:
# Function to create a list of query where each element has 100 unique ISBN, which meets the conditions of the API.
def create_list_query(merged_df, chunk_size=100):
    """Build Open Library request URLs covering every distinct ISBN in ``merged_df``.

    Parameters:
    - merged_df: DataFrame with an 'ISBN' column.
    - chunk_size: Maximum number of ISBNs placed in one URL. Defaults to 100,
      the batch size this notebook uses for the API.

    Returns:
    A list of URL strings; each carries up to ``chunk_size`` comma-separated
    ``ISBN:<value>`` bibkeys. Missing ISBNs are skipped.

    Raises:
    ValueError if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    isbn_list = merged_df['ISBN'].dropna().unique().tolist()
    list_query = []
    for start in range(0, len(isbn_list), chunk_size):
        bibkeys = ",".join(f"ISBN:{isbn}" for isbn in isbn_list[start:start + chunk_size])
        list_query.append(
            f"https://openlibrary.org/api/books?bibkeys={bibkeys}&jscmd=details&format=json"
        )
    return list_query

In [None]:
# Uncomment to get a list of query 
# list_query = create_list_query(merged_df)

In [None]:
# Function to get data and continuosly save them in a new line in data.json file, if error will occure it will give the url and status code
def get_data(list_query, output_path='data.json', timeout=30, pause=1):
    """Download every URL in ``list_query`` and append each JSON answer as one line.

    A failing request (network error, non-200 status, body that is not JSON) is
    reported with ``print`` and skipped, so one bad request does not abort the
    whole harvest.

    Parameters:
    - list_query: Iterable of URLs to fetch.
    - output_path: File that receives one JSON document per line (opened in append mode).
    - timeout: Seconds to wait for each request before giving up.
    - pause: Seconds to sleep after each request.
    """
    with open(output_path, 'a') as output_file:
        for url in list_query:
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Request failed for {url}. Error: {exc}")
            else:
                if response.status_code == 200:
                    try:
                        data = response.json()
                    except ValueError:
                        print(f"Request failed for {url}. Response body was not valid JSON.")
                    else:
                        json.dump(data, output_file)
                        output_file.write('\n')  # Newline character to separate JSON objects
                        output_file.flush()  # keep what was fetched even if a later request dies
                else:
                    print(f"Request failed for {url}. Status code: {response.status_code}")
            time.sleep(pause)

In [None]:
# Uncomment to get data from API
# get_data(list_query)

In [None]:
# Function to get data from json file obtained from api and transform them to a dataframe
def _string_items(value):
    """Return the string members of ``value`` when it is a list, otherwise an empty list."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str)]


def extract_book_data(file_path):
    """Turn the line-delimited JSON file written by get_data into a DataFrame.

    Every non-blank line must be a JSON object mapping a bibkey to a book
    record; the fields are read from the record's 'details' dict.

    Parameters:
    - file_path: Path of the file to read.

    Returns:
    A DataFrame with the columns 'ISBN' (the bibkey exactly as stored in the
    file), 'Title', 'Number_of_Pages', 'Publisher' (first entry of
    'publishers'), 'publish_date', 'Subjects' and 'Genres' (both joined with
    ', '). The columns exist even when no record was found.

    Raises:
    json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    columns = ['ISBN', 'Title', 'Number_of_Pages', 'Publisher', 'publish_date', 'Subjects', 'Genres']
    all_books = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            json_obj = json.loads(line)
            if not isinstance(json_obj, dict):
                continue

            for isbn, book_info in json_obj.items():
                if not isinstance(book_info, dict):
                    continue
                details = book_info.get('details', {})
                if not isinstance(details, dict):
                    continue

                publishers = details.get('publishers')
                publisher = publishers[0] if isinstance(publishers, list) and publishers else None

                all_books.append({
                    'ISBN': isbn,
                    'Title': details.get('title'),
                    'Number_of_Pages': details.get('number_of_pages'),
                    'Publisher': publisher,
                    'publish_date': details.get('publish_date'),
                    # Non-list values and non-string members are ignored so join() cannot fail.
                    'Subjects': ', '.join(_string_items(details.get('subjects'))),
                    'Genres': ', '.join(_string_items(details.get('genres'))),
                })

    return pd.DataFrame(all_books, columns=columns)

In [None]:
new_data_api = 'data.json' 
external_books_df = extract_book_data(new_data_api)
external_books_df.head(10)

### Cleaning data from API

In [None]:
# Removed ISBN: in columns ISBN
external_books_df['ISBN'] = external_books_df['ISBN'].str.replace('ISBN:', '', regex=False)

In [None]:
# Page number from float to int
external_books_df['Number_of_Pages'] = external_books_df['Number_of_Pages'].astype(pd.Int64Dtype())

In [None]:
columns_to_check = ['Title', 'Subjects', 'Publisher', 'Genres']
summary = summarize_column_issues(external_books_df, columns_to_check)
summary

In [None]:
# Clean the text columns of the API data. Missing cells stay missing: casting the whole
# column with astype(str) would store them as the literal strings 'None'/'nan'.
for column in ['Title', 'Subjects', 'Publisher', 'Genres']:
    external_books_df[column] = external_books_df[column].apply(
        lambda value: value if pd.isna(value) else clean_text_data(str(value))
    )
external_books_df[['Title', 'Subjects', 'Publisher', 'Genres']].head()

In [None]:
# Keep only the publication year of 'publish_date'.
# NOTE(review): the API value appears to be free text (presumably things like
# 'March 1999' or 'c1999' next to plain '1999') -- confirm against data.json.
# Parsing such a mix with pd.to_datetime(errors='coerce') silently turns every
# entry the parser cannot handle into NaT, so the first standalone 4-digit year
# (1500-2099) is extracted instead and only that year is parsed.
year_text = (
    external_books_df['publish_date']
    .astype(str)
    .str.extract(r'(?<!\d)(1[5-9]\d{2}|20\d{2})(?!\d)', expand=False)
)
external_books_df['publish_date'] = pd.to_datetime(year_text, format='%Y', errors='coerce')
external_books_df['year'] = external_books_df['publish_date'].dt.year

In [None]:
# Take the year from the parsed dates, then discard the date column itself.
publication_years = external_books_df['publish_date'].dt.year
external_books_df['year'] = publication_years
external_books_df = external_books_df.drop(columns=['publish_date'])

# Preview the reshaped frame.
external_books_df.head()

In [None]:
# Converting year value to int
external_books_df['year'] = external_books_df['year'].astype(pd.Int64Dtype())

In [None]:
# Back-fill 'Year-Of-Publication' in merged_df with the API year wherever merged_df has
# no year but external_books_df does.
# It filled 3574 values (figure recorded by the author for their run).
# Outer merge on ISBN puts both year columns side by side; the '_merge' indicator column
# it adds is not used below.
comparison_df = pd.merge(merged_df[['ISBN', 'Year-Of-Publication']], external_books_df[['ISBN', 'year']], on='ISBN', how='outer', indicator=True)
# Rows where the original year is missing and the API year is present
condition = (comparison_df['Year-Of-Publication'].isna()) & (comparison_df['year'].notna())
isbns_to_update = comparison_df.loc[condition, 'ISBN']
# NOTE(review): every iteration scans comparison_df and merged_df in full, and the loop
# variable `isbn` stays defined at notebook level afterwards -- get_recommendation reads a
# global named `isbn`, so renaming or removing this loop changes that function's behaviour.
for isbn in isbns_to_update:
    # first 'year' value of any comparison row with this ISBN
    new_year = comparison_df.loc[comparison_df['ISBN'] == isbn, 'year'].iloc[0]
    merged_df.loc[merged_df['ISBN'] == isbn, 'Year-Of-Publication'] = new_year

In [None]:
unique_genres = external_books_df['Genres'].unique()
unique_genres

In [None]:
# Removing text "etc., etc and .", double spaces 
def clean_and_split_genres(genre):
    """Split a comma-separated genre string into a list of tidy genre names.

    Dots and the standalone word 'etc' are removed and runs of whitespace are
    collapsed before splitting; empty pieces are discarded.

    Parameters:
    - genre: The genre string, or a missing value.

    Returns:
    A list of genre strings, or ``genre`` itself when it is missing.
    """
    if pd.isna(genre):
        return genre

    text = genre
    # Applied in this order: strip dots, strip 'etc', collapse whitespace.
    for pattern, replacement in ((r'\.', ''), (r'\betc\b\.?', ''), (r'\s+', ' ')):
        text = re.sub(pattern, replacement, text)

    pieces = []
    for piece in text.split(','):
        piece = piece.strip()
        if piece:
            pieces.append(piece)
    return pieces
external_books_df['Genres'] = external_books_df['Genres'].apply(clean_and_split_genres)

In [None]:
empty_genres_count = external_books_df[external_books_df['Genres'].apply(lambda x: not x)].shape[0]
empty_genres_count

Unfortunately, the 'Genres' column contains so many empty lists that it cannot be used in this form.\
The 'Subjects' column contains many keywords that can be treated as categories and help to recommend books. In the next steps I will try to extract them and use them to build a richer category column.


In [None]:
# Convert 'Subjects' entries into lists so it will be possible to apply function clean_and_split_list
external_books_df['Subjects'] = external_books_df['Subjects'].apply(lambda x: [x] if isinstance(x, str) else x)

In [None]:
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def clean_and_split_list(text_list, stop_words=None):
    """Turn a list of free-text strings into one flat list of cleaned keywords.

    Each string is lower-cased; every character other than a-z, 0-9, whitespace
    and '-' becomes a space; hyphens that do not join two word characters are
    removed; the text is split on whitespace and stop words are dropped.

    Parameters:
    - text_list: A list of strings. Anything that is not a non-empty list
      yields an empty result; non-string members are skipped.
    - stop_words: Optional collection of words to drop. Defaults to the NLTK
      English stop-word list, which is loaded once and reused across calls.

    Returns:
    A list of keyword strings, in order of appearance.
    """
    cleaned_words = []
    if not text_list or not isinstance(text_list, list):
        return cleaned_words

    if stop_words is None:
        stop_words = _english_stop_words()

    for text in text_list:
        # A plain type check is enough (a str is never "missing"); calling pd.isna
        # on a nested list would return an array and break the condition.
        if not isinstance(text, str):
            continue

        text = text.lower()
        cleaned_text = re.sub(r'[^a-z0-9\s-]', ' ', text)
        cleaned_text = re.sub(r'(?<!\w)-|-(?!\w)', ' ', cleaned_text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

        cleaned_words.extend(word for word in cleaned_text.split() if word not in stop_words)

    return cleaned_words

In [None]:
# Apply the function to both 'Subjects' and 'Genres' columns
external_books_df['Cleaned_Subjects'] = external_books_df['Subjects'].apply(clean_and_split_list)
external_books_df['Cleaned_Genres'] = external_books_df['Genres'].apply(clean_and_split_list)


In [None]:
# Concatenated column Cleaned_Subjects and Cleaned_Genres in one list
external_books_df['Concatenated_S_G'] = external_books_df['Cleaned_Subjects'] + external_books_df['Cleaned_Genres']

In [None]:
# Function to lemmatize text
def lemmatize_words(word_list):
    """Lemmatize every word of ``word_list`` with a WordNetLemmatizer.

    Parameters:
    - word_list: Iterable of words.

    Returns:
    A list with the lemmatized form of each word, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, word_list))

In [None]:
# Applying the function to lemmatize text do the column Concatenated_S_G
external_books_df['Lemmatized_S_G'] = external_books_df['Concatenated_S_G'].apply(lemmatize_words)

In [None]:
# Used to check which genres are in our data to specify the known genres 
# non_empty_genres = external_books_df['Cleaned_Genres'].dropna().loc[external_books_df['Cleaned_Genres'] != '']
# unique_genres_set = set()
# for genre_list in external_books_df['Cleaned_Genres']:
#     if genre_list:  
#         unique_genres_set.update(genre_list)
# print(list(unique_genres_set))

In [None]:
# Vocabulary of category keywords that are looked up in the lemmatized subject/genre tokens.
known_genres = [
    'adventure', 'biography', 'fantasy', 'historical', 'horror', 'literary', 'mystery', 
    'mythology', 'non-fiction', 'philosophical', 'romance', 'satire', 'science', 'juvenile',
    'thriller', 'western', 'young', 'action', 'drama', 'erotica', 'memoir', 'crime', 
    'dystopian', 'self-help', 'travel', 'guide', 'anthology', 'classic', 'comedy', 
    'psychological', 'suspense', 'tragedy', 'fairy', 'folklore', 'legend', 'narrative', 
    'periodical', 'political', 'realistic', 'reference', 'religion', 'short', 'superhero', 
    'supernatural', 'textbook', 'urban', 'utopian', 'war', 'absurdist', 'alternate', 
    'coming-of-age', 'cookbook', 'diary', 'encyclopedia', 'epic', 'experimental', 'fable', 
    'fan', 'gothic', 'graphic', 'hard-boiled', 'historiography', 'humor', 'lab', 'magical', 
    'paranormal', 'picaresque', 'post-apocalyptic', 'stream-of-consciousness', 'sword', 
    'true', 'vampire', 'visionary', 'whodunit', 'non-fiction', 'biology', 'music', 'guidebook',
    'vocabularies', 'design', 'architecture', 'novela', 'archeology', 'tour', 'statistic',
    'anecdotes', 'guidebook', 'manual', 'history', 'child', 'study', 'work', 'dictionaries', 'humor',
    'handbook', 'pictorial', 'personal', 'poetry', 'interview', 'fiction', 'literature', 'guidebooks',
    'social', 'detective', 'life', 'fictitious', 'art'
]

# The literal lists a few keywords twice ('non-fiction', 'humor', 'guidebook'); drop the
# repeats (keeping first-seen order) so a book cannot receive the same category twice.
known_genres = list(dict.fromkeys(known_genres))


In [None]:
# Function to check for matched from list know_genres in df with list of known genres
def find_genre_matches(lemmatized_genres, known_genres):
    """Return the known genres that occur among a book's lemmatized tokens.

    Parameters:
    - lemmatized_genres: Iterable of token strings describing one book.
    - known_genres: Iterable of genre keywords to look for.

    Returns:
    A list of the matching genres, in the order of ``known_genres``, each
    genre at most once.
    """
    tokens = set(lemmatized_genres)  # constant-time membership instead of scanning the list per genre
    # dict.fromkeys removes repeated keywords while keeping their first-seen order.
    return [genre for genre in dict.fromkeys(known_genres) if genre in tokens]

In [None]:
# Created new column Category where are saved Category of book in list applying function find_genre_matches
external_books_df['Category'] = external_books_df['Lemmatized_S_G'].apply(lambda row: find_genre_matches(row, known_genres))

In [None]:
external_books_df['Category'].apply(lambda x: len(x) == 0).sum()

In [None]:
269364 - 117928


In [None]:
(100/269364) * 151436

In [None]:
external_books_df.to_csv("test_external_books_df.csv", index=False)

I have created a new column named 'Category' that holds, as a list, more accurate information about the categories of each book. To build it, the 'Subjects' and 'Genres' columns were cleaned, split into words and lemmatized with NLTK. I then defined a list of common book categories and checked which of them occur in the lemmatized words of each book; the matches form the new column. The column has a fill rate of approximately 56%. It will be used to make the book recommendation model more accurate.

## Final processing of merged dataframe

Creating a new column for Publication Range

In [None]:
# Create 'Publication-Range': the decade of 'Year-Of-Publication', labelled "<start>-<start+10>".
# Each bin is closed on the left and open on the right (right=False), so 1990 falls into
# "1990-2000" and 2000 into "2000-2010".
# The last bin edge is the end of the current decade: range() excludes its stop value, so a
# hard-coded range(1900, 2030, 10) ends at 2020 and every year from 2020 on would get no range.
first_decade = 1900
last_edge = (datetime.date.today().year // 10 + 1) * 10
bins = list(range(first_decade, last_edge + 1, 10))
labels = [f"{i}-{i+10}" for i in bins[:-1]]

# Years before 1900 and missing years stay without a range.
merged_df['Publication-Range'] = pd.cut(merged_df['Year-Of-Publication'], bins=bins, labels=labels, right=False)

Creating a new column mean rating

In [None]:
# Function to convert Book-Rating to list
def convert_to_list(rating):
    """Return ``rating`` unchanged when it is a list, otherwise an empty list.

    Parameters:
    - rating: A cell of the 'Book-Rating' column.

    Returns:
    A list.
    """
    return rating if isinstance(rating, list) else []

In [None]:
def _mean_or_nan(ratings):
    """Arithmetic mean of a list of ratings; NaN for an empty list (avoids dividing by zero)."""
    if not ratings:
        return float('nan')
    return sum(ratings) / len(ratings)


# Every cell becomes a list first (cells that are not lists become []),
# then the per-book mean is derived from those lists.
rating_lists = merged_df['Book-Rating'].apply(convert_to_list)
merged_df['Book-Rating'] = rating_lists
merged_df['Mean-Rating'] = rating_lists.apply(_mean_or_nan)

In [None]:
# Convert the entire 'Mean-Rating' column to native Python data types
merged_df['Mean-Rating'] = merged_df['Mean-Rating'].apply(lambda x: int(x) if isinstance(x, float) and x.is_integer() else x).astype(object)

Join column from external_books_df to final_df

In [None]:
# Extented merged_df of external_book_df on ISBN by selected column
final_merge = pd.merge(merged_df, external_books_df[['ISBN','Category','Number_of_Pages']], on='ISBN', how='left')

In [None]:
# Unnecessary columns dropped
final_merge = final_merge.drop(columns=['Year-Of-Publication'])
final_merge

Dealing with < NA > values 

In [None]:
def replace_na(value):
    """Map a missing scalar (as judged by ``pd.isna``) to None; return anything else unchanged."""
    if pd.isna(value):
        return None
    return value

def _none_for_missing(cell):
    """Apply replace_na to a scalar cell, or to every member of a list cell."""
    if isinstance(cell, list):
        return [replace_na(item) for item in cell]
    return replace_na(cell)


# Same treatment for the three columns that can hold <NA> values.
for na_column in ("Age", "Number_of_Pages", "Book-Rating"):
    final_merge[na_column] = final_merge[na_column].apply(_none_for_missing)

In [None]:
final_merge

# Book Recommendation algorithm

In [None]:
# Function to create new dataframe by filtering rows that contain the user_id in the 'User-ID' lists.
def get_reviewed_by_user(user_id, df):
    """Select the rows of ``df`` whose 'User-ID' list contains ``user_id``.

    Parameters:
    - user_id: The user to look for.
    - df: DataFrame with a 'User-ID' column holding lists of user ids.

    Returns:
    The matching rows; rows whose 'User-ID' cell is not a list never match.
    """
    def _lists_user(users):
        return isinstance(users, list) and user_id in users

    return df[df['User-ID'].apply(_lists_user)]

In [None]:
def extract_book_details(df):
    """Return the descriptive columns of ``df`` with one row per ISBN.

    Parameters:
    - df: DataFrame containing at least 'ISBN', 'Publication-Range',
      'Category', 'Book-Author', 'Mean-Rating' and 'Publisher'.

    Returns:
    A DataFrame restricted to those six columns; for a repeated ISBN only the
    first row is kept.
    """
    detail_columns = ['ISBN', 'Publication-Range', 'Category', 'Book-Author', 'Mean-Rating', 'Publisher']
    details = df.loc[:, detail_columns]
    return details.drop_duplicates(subset='ISBN')

In [None]:
# Build one (ISBN, User-ID, Book-Rating) row per individual rating from the list-valued
# frame returned by get_reviewed_by_user, keeping only the rows of the given user.
def get_user_ratings(user_data, user_id):
    """Return the ratings that ``user_id`` gave, one row per rated book.

    Parameters:
    - user_data: DataFrame with 'ISBN' plus list-valued 'User-ID' and
      'Book-Rating' columns.
    - user_id: The user whose ratings are wanted.

    Returns:
    A DataFrame with the columns 'ISBN', 'User-ID' and 'Book-Rating'.
    """
    exploded = user_data.explode('User-ID').reset_index(drop=True)
    # The ratings are exploded separately and matched to the user rows by position.
    # NOTE(review): this only lines up when both lists of a row have the same length.
    flat_ratings = user_data['Book-Rating'].explode().reset_index(drop=True)
    exploded['Book-Rating'] = flat_ratings

    belongs_to_user = exploded['User-ID'] == user_id
    return exploded.loc[belongs_to_user, ['ISBN', 'User-ID', 'Book-Rating']]

In [None]:
def categorize_users_books(user_rating, book_details):
    """Split a user's rated books into liked and disliked ones.

    Parameters:
    - user_rating: DataFrame with 'ISBN' and 'Book-Rating' columns.
    - book_details: DataFrame with an 'ISBN' column describing the books.

    Returns:
    A tuple ``(high_rating, low_rating)``: the rows of ``book_details`` whose
    ISBN was rated 6 or more, and those rated below 5. A rating of exactly 5
    lands in neither group.
    """
    scores = user_rating['Book-Rating']
    liked_isbns = user_rating.loc[scores >= 6, 'ISBN']
    disliked_isbns = user_rating.loc[scores < 5, 'ISBN']

    is_liked = book_details['ISBN'].isin(liked_isbns)
    is_disliked = book_details['ISBN'].isin(disliked_isbns)
    return book_details[is_liked], book_details[is_disliked]

There are two primary recommendation techniques to consider: Content-Based Recommendation and Collaborative Filtering. Given that our dataset lacks a substantial number of reviews, Collaborative Filtering may not be the most appropriate choice. Consequently, the decision has been made to proceed with the Content-Based Recommendation approach. 
First, features will be set which represent the profile of the book and will assist in the decision-making for the recommendation algorithm. Consequently, 'Category', 'Publisher', and 'Book-Author' will be extracted into a new dataframe. This information needs to be encoded, and given the considerable size of the dataset, one-hot encoding is deemed unsuitable. Feature hashing is considered to be a solution. The defined categorical features will be transformed into a numerical format, ensuring that the data becomes suitable for the model. Due to the size of the dataset, feature hashing will be processed in batches.

In [None]:
recommendation_data = final_merge.copy()
recommendation_data = recommendation_data[['Category', 'Publisher', 'Book-Author']]

In [None]:
def _as_feature_text(value):
    """Return ``value`` as a string for hashing; a missing scalar becomes 'Unknown'."""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple, set, dict)):
        # Containers (e.g. the category lists) are never "missing"; hash their text form.
        return str(value)
    return 'Unknown' if pd.isna(value) else str(value)


# Convert every value to a string and label missing values 'Unknown'.
# The missing-value check has to come before str(): str(NaN) is the ordinary string 'nan',
# which a later fillna('Unknown') no longer recognises as missing.
for column in ['Category', 'Publisher', 'Book-Author']:
    recommendation_data[column] = recommendation_data[column].apply(_as_feature_text)

In [None]:
# Defining the size of the hash space and initializing the FeatureHasher
n_features = 2**17 
hasher = FeatureHasher(n_features=n_features, input_type='string')

In [None]:
# Directory that receives one .npz file per batch (created when missing)
directory_path = 'hashed_batches'
os.makedirs(directory_path, exist_ok=True)

# Set the batch size
batch_size = 500  # Or another size fitting your memory capacity
# Ceiling division: `len // batch_size + 1` would add an empty final batch whenever the
# row count is an exact multiple of batch_size.
# NOTE(review): FeatureHasher is believed to reject an empty batch -- confirm.
n_batches = (len(recommendation_data) + batch_size - 1) // batch_size

hashed_results = []

# The loop walks through recommendation_data in batches. Within each batch every value is
# wrapped in a one-element list (one token per sample) and hashed.
# The three hashed blocks are stacked side by side into one feature matrix for the batch.
# That matrix is saved to disk as a sparse matrix with one row per book of the batch.
for i in range(n_batches):
    start_index = i * batch_size
    end_index = (i + 1) * batch_size

    batch = recommendation_data.iloc[start_index:end_index]

    categories_iterable = [[item] for item in batch['Category'].tolist()]
    publishers_iterable = [[item] for item in batch['Publisher'].tolist()]
    authors_iterable = [[item] for item in batch['Book-Author'].tolist()]

    # Apply hashing for each batch and keep the results as sparse matrices
    hashed_category = hasher.transform(categories_iterable)
    hashed_publisher = hasher.transform(publishers_iterable)
    hashed_author = hasher.transform(authors_iterable)

    # Combine hashed features for the batch; CSR is requested explicitly so the saved
    # matrix supports row indexing regardless of the scipy version's default format.
    combined_features_batch = hstack([hashed_category, hashed_publisher, hashed_author], format='csr')

    # Save each batch to disk instead of keeping in memory
    batch_file_path = os.path.join(directory_path, f'batch_{i}.npz')
    save_npz(batch_file_path, combined_features_batch)

In [None]:
# Load the previously saved batches of hashed data from disk, collect them in a list and
# stack them into one combined sparse feature matrix.

# Set the batch size
batch_size = 500
directory_path = 'hashed_batches'
# Ceiling division, so no extra (empty) batch is expected when the row count is an exact
# multiple of batch_size.
n_batches = (len(recommendation_data) + batch_size - 1) // batch_size

hashed_results = []

# Load all batches
for i in range(n_batches):
    batch_file_path = os.path.join(directory_path, f'batch_{i}.npz')
    batch_data = load_npz(batch_file_path)
    hashed_results.append(batch_data)

# Concatenate all batches to get the final feature set. CSR is requested explicitly because
# find_similar_books selects single rows (X[i]), which a COO matrix does not support.
combined_features_sparse = vstack(hashed_results, format='csr')

# Guard against stale batch files left over from a run on a different dataset.
assert combined_features_sparse.shape[0] == len(recommendation_data), \
    "hashed batches on disk do not match recommendation_data; re-run the hashing cell"

In [None]:
combined_features_sparse

The sparse matrix has 271243 rows and 393216 columns. Each row represents one book and each column represents one slot of the hashed feature space.

In [None]:
def find_similar_books(book_id, X, book_ids, k, metric='cosine', show_distance=False):
    """
    Find k similar books based on their vector representation in X.
    
    Parameters:
    - book_id: The ID (ISBN) of the book of interest.
    - X: The feature matrix of the books (sparse matrix or 2-D array), one row per book.
    - book_ids: Book IDs in the same order as the rows of X.
    - k: Number of similar books to find.
    - metric: The distance metric to use. Default is 'cosine'.
    - show_distance: When True, distances are returned alongside the IDs. Default is False.
    
    Returns:
    A list of at most k book IDs, nearest first. With show_distance=True the
    list holds (book_id, distance) tuples instead. The query book itself is
    never part of the result.

    Raises:
    ValueError if book_id is not contained in book_ids.
    """
    book_ids = list(book_ids)

    # Find the row of the query book
    book_ind = book_ids.index(book_id)

    # Row selection needs an indexable sparse format (COO is not); this is a no-op for CSR.
    if hasattr(X, 'tocsr'):
        X = X.tocsr()

    # Extract the vector representation of the book from X
    book_vec = X[book_ind].reshape(1, -1)

    # One extra neighbour is requested because the query book is usually its own nearest
    # neighbour; never ask for more neighbours than there are rows.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    distances, indices = kNN.kneighbors(book_vec, return_distance=True)

    # Drop the query book by row index / ID rather than list.remove(): when several books
    # share an identical vector the query book may be missing from the neighbours, and
    # remove() would raise.
    neighbours = []
    for distance, index in zip(distances[0].tolist(), indices[0].tolist()):
        if index == book_ind or book_ids[index] == book_id:
            continue
        neighbours.append((book_ids[index], distance))
    neighbours = neighbours[:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]

In [None]:
# Set the user_id for which to get the book recommendation
user_id = 8

df_reviewed_by_user = get_reviewed_by_user(user_id, final_merge)
df_books_details = extract_book_details(df_reviewed_by_user)
user_rating = get_user_ratings(df_reviewed_by_user, user_id)
high_rating, low_rating  = categorize_users_books(user_rating, df_books_details)

In [None]:
# Get a list of highly rated books for a selected user
book_ids = high_rating['ISBN']
book_ids.tolist()

In [None]:
isbns = final_merge['ISBN']
isbn_list = isbns.to_list()
#book_id = "0002005018"

In [None]:
# Create a data frame of books sorted by average rating and, within equal averages, by the
# number of ratings received (both descending).
# It is used to recommend books to users who have not yet rated any books highly.
# The helper column 'length' exists only inside this chain, so final_merge itself is not modified.
review_counts = final_merge['Book-Rating'].apply(len)
sorted_df = (
    final_merge
    .assign(length=review_counts)
    .sort_values(by=['Mean-Rating', 'length'], ascending=[False, False])
    .drop(columns=['length'])
)

sorted_df.head(10)

In [None]:
# Function to get recommendation for books. It checks if a user has rated any books.
# If they haven't, it provides general recommendations based on the top-rated books.
# If they have, it offers personalized recommendations based on the user's high-rated books.

def get_recommendation(user_high_rating_list, general_recommended_books):
    """Return candidate ISBNs to recommend.

    Parameters:
    - user_high_rating_list: ISBNs of the books the user rated highly
      (list or pandas Series).
    - general_recommended_books: DataFrame with an 'ISBN' column, ordered
      best-first; used when the user has no highly rated books.

    Returns:
    A list of ISBNs. Without highly rated books: the first three ISBNs of
    ``general_recommended_books``. Otherwise: the 5 nearest neighbours of
    every highly rated book (looked up in the notebook-level
    ``combined_features_sparse`` / ``isbn_list``), without duplicates and
    without the books the user already rated highly.
    """
    liked_books = list(user_high_rating_list)
    if len(liked_books) == 0:
        return general_recommended_books['ISBN'].head(3).tolist()

    all_books = []
    seen = set(liked_books)
    # Query each of the user's own books; the loop variable is local, so the result no
    # longer depends on whatever a notebook-level variable named `isbn` happens to hold.
    for liked_isbn in liked_books:
        for candidate in find_similar_books(liked_isbn, combined_features_sparse, isbn_list, 5):
            if candidate not in seen:
                seen.add(candidate)
                all_books.append(candidate)
    return all_books

In [None]:
found_books = get_recommendation(book_ids, sorted_df)

In [None]:
found_books_information = final_merge[final_merge['ISBN'].isin(found_books)]
found_books_information

In [None]:
# Final recommended book for the set user
recommended_books = found_books_information.sort_values(by='Mean-Rating', ascending=False).head(3)
recommended_books