Plan:
- create user profile using all books that match user preferences
- user profile will be a tfidf vector of the book description+other variables
- recommend books that most closely match the user profile vector (from any genre)

In [15]:
# Loading the Data
import pandas as pd
import glob
import os
import numpy as np

directory = './dataset'
csv_files = glob.glob(os.path.join(directory, 'book*.csv'))
dataframes = [pd.read_csv(file) for file in csv_files]
book_data = pd.concat(dataframes, ignore_index=True)
print(book_data.head())

        Id                                               Name  \
0  1900511                                         Barbarossa   
1  1900512  Collector's Guide to German World War II: Comb...   
2  1900514                               Images of Barbarossa   
3  1900520        Romania After 2000: Five New Romanian Plays   
4  1900521           Global Foreigners: An Anthology of Plays   

                  Authors        ISBN  Rating  PublishYear  PublishMonth  \
0      Christopher Ailsby  1840138009     3.0         2007             4   
1      Christopher Ailsby  0781802253     0.0         1994             7   
2      Christopher Ailsby  0711028257     3.5         2001             1   
3  Daniel Charles Gerould  0595436560     4.0         2007             9   
4        Saviana Stănescu  1905422423     4.6         2006            12   

   PublishDay                                    Publisher RatingDist5  ...  \
0           1                               New Line Books         5:0  .

Duplicate and missing values need to be removed before training the model, though there should be none in this dataset.

In [16]:
# Drop duplicates and missing values
book_data = book_data.drop_duplicates()
print(book_data.isna().sum())
book_data.dropna(subset=['Publisher', 'Description'], inplace=True)
book_data['PagesNumber'] = book_data['pagesNumber'].combine_first(book_data['PagesNumber'])
book_data.drop(columns=['pagesNumber', 'ISBN'], inplace=True)
book_data['Content'] = book_data[['Name', 'Authors', 'PublishYear', 'Publisher', 'Description']].astype(str).apply(lambda x: ' '.join(x), axis=1)
book_data

Id                             0
Name                           0
Authors                        0
ISBN                        5922
Rating                         0
PublishYear                    0
PublishMonth                   0
PublishDay                     0
Publisher                  17823
RatingDist5                    0
RatingDist4                    0
RatingDist3                    0
RatingDist2                    0
RatingDist1                    0
RatingDistTotal                0
CountsOfReview                 0
Language                 1598399
PagesNumber               834966
Description               679010
pagesNumber              1015232
Count of text reviews    1440501
dtype: int64


Unnamed: 0,Id,Name,Authors,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,PagesNumber,Description,Count of text reviews,Content
0,1900511,Barbarossa,Christopher Ailsby,3.00,2007,4,1,New Line Books,5:0,4:0,3:1,2:0,1:0,total:1,0,,192.0,"On 22 June 1941, Adolf Hitler launched Operati...",,Barbarossa Christopher Ailsby 2007 New Line Bo...
2,1900514,Images of Barbarossa,Christopher Ailsby,3.50,2001,1,25,Ian Allan Ltd,5:0,4:5,3:2,2:1,1:0,total:8,0,,256.0,"On 22 June 1941, Adolf Hitler launched Operati...",,Images of Barbarossa Christopher Ailsby 2001 I...
3,1900520,Romania After 2000: Five New Romanian Plays,Daniel Charles Gerould,4.00,2007,9,1,Martin E. Segal Theatre Center Publications,5:1,4:4,3:1,2:0,1:0,total:6,0,,226.0,The first anthology of new Romanian Drama publ...,,Romania After 2000: Five New Romanian Plays Da...
4,1900521,Global Foreigners: An Anthology of Plays,Saviana Stănescu,4.60,2006,12,7,Seagull Books,5:4,4:0,3:1,2:0,1:0,total:5,0,,320.0,"In Waxing West, Daniella, newly arrived in the...",,Global Foreigners: An Anthology of Plays Savia...
5,1900525,Diary of a Clone,Saviana Stănescu,4.80,2003,1,1,Meeting Eyes Bindery,5:4,4:1,3:0,2:0,1:0,total:5,0,,66.0,Poetry. Translation. DIARY OF A CLONE is a sma...,,Diary of a Clone Saviana Stănescu 2003 Meeting...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1850303,1499980,The O'Brien Book of Irish Fairy Tales & Legends,Una Leavy,4.24,1996,9,10,O'Brien Press,5:39,4:44,3:8,2:4,1:0,total:95,1,,96.0,Irish fairy tales and legends are full of ench...,1.0,The O'Brien Book of Irish Fairy Tales & Legend...
1850305,1499988,Irish Folk and Fairy Tales Omnibus Edition,Michael Scott,4.19,1989,24,8,Sphere,5:140,4:112,3:42,2:13,1:4,total:311,10,,637.0,"Here, collected in one volume, are tales and l...",10.0,Irish Folk and Fairy Tales Omnibus Edition Mic...
1850306,1499990,Robin Hood: The Shaping of the Legend,Jeffrey L. Singman,3.00,1998,23,7,Praeger,5:0,4:0,3:1,2:0,1:0,total:1,0,,224.0,Among the narrative traditions of the Middle A...,0.0,Robin Hood: The Shaping of the Legend Jeffrey ...
1850308,1499992,Competing on Value,Mack Hanan,3.50,1991,22,4,Amacom,5:2,4:2,3:2,2:2,1:0,total:8,1,,220.0,Presents a new approach to selling that emphas...,1.0,Competing on Value Mack Hanan 1991 Amacom Pres...


In [17]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

In [18]:
book_data['Content'] = book_data['Content'].apply(preprocess_text)
book_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1163326 entries, 0 to 1850309
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Id                     1163326 non-null  int64  
 1   Name                   1163326 non-null  object 
 2   Authors                1163326 non-null  object 
 3   Rating                 1163326 non-null  float64
 4   PublishYear            1163326 non-null  int64  
 5   PublishMonth           1163326 non-null  int64  
 6   PublishDay             1163326 non-null  int64  
 7   Publisher              1163326 non-null  object 
 8   RatingDist5            1163326 non-null  object 
 9   RatingDist4            1163326 non-null  object 
 10  RatingDist3            1163326 non-null  object 
 11  RatingDist2            1163326 non-null  object 
 12  RatingDist1            1163326 non-null  object 
 13  RatingDistTotal        1163326 non-null  object 
 14  CountsOfReview    

In [19]:
import joblib

vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [21]:
from scipy.sparse import vstack

def process_in_chunks(df, chunk_size=10000):
    num_chunks = len(df) // chunk_size + (1 if len(df) % chunk_size != 0 else 0)
    sparse_matrices = []

    for i in range(num_chunks):
        chunk = df.iloc[i*chunk_size : (i+1)*chunk_size]
        sparse_matrix = vectorizer.transform(chunk['Content'])
        sparse_matrices.append(sparse_matrix)

    combined_sparse_matrix = vstack(sparse_matrices)
    return combined_sparse_matrix

book_vectors = process_in_chunks(book_data, chunk_size=10000)
book_vectors = book_vectors.astype('float32')

We now use our genre classifier to assign each book with a genre.

In [22]:
from tensorflow import keras

model = keras.models.load_model('./dense32model8972.h5')

model.summary()



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 32)                320032    
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 batch_normalization_2 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dense_5 (Dense)             (None, 10)                330       
                                                                 
Total params: 320490 (1.22 MB)
Trainable params: 320426 (1.22 MB)
Non-trainable params: 64 (256.00 Byte)
_________________________________________________________________


In [28]:
genre_pred = model.predict(book_vectors)



In [31]:
genres = ['fantasy', 'science', 'crime', 'history', 'horror', 'thriller', 'psychology', 'romance', 'sports', 'travel']
genre_pred.shape
predicted_classes = np.argmax(genre_pred, axis=1)
predicted_genres = [genres[class_idx] for class_idx in predicted_classes]
book_data['Genre'] = predicted_genres
book_data.head()

Unnamed: 0,Id,Name,Authors,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,...,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,PagesNumber,Description,Count of text reviews,Content,Genre
0,1900511,Barbarossa,Christopher Ailsby,3.0,2007,4,1,New Line Books,5:0,4:0,...,2:0,1:0,total:1,0,,192.0,"On 22 June 1941, Adolf Hitler launched Operati...",,barbarossa christoph ailsbi 2007 new line book...,fantasy
2,1900514,Images of Barbarossa,Christopher Ailsby,3.5,2001,1,25,Ian Allan Ltd,5:0,4:5,...,2:1,1:0,total:8,0,,256.0,"On 22 June 1941, Adolf Hitler launched Operati...",,imag of barbarossa christoph ailsbi 2001 ian a...,fantasy
3,1900520,Romania After 2000: Five New Romanian Plays,Daniel Charles Gerould,4.0,2007,9,1,Martin E. Segal Theatre Center Publications,5:1,4:4,...,2:0,1:0,total:6,0,,226.0,The first anthology of new Romanian Drama publ...,,romania after 2000 : five new romanian play da...,sports
4,1900521,Global Foreigners: An Anthology of Plays,Saviana Stănescu,4.6,2006,12,7,Seagull Books,5:4,4:0,...,2:0,1:0,total:5,0,,320.0,"In Waxing West, Daniella, newly arrived in the...",,global foreign : an antholog of play saviana s...,psychology
5,1900525,Diary of a Clone,Saviana Stănescu,4.8,2003,1,1,Meeting Eyes Bindery,5:4,4:1,...,2:0,1:0,total:5,0,,66.0,Poetry. Translation. DIARY OF A CLONE is a sma...,,diari of a clone saviana stănescu 2003 meet ey...,fantasy


In [38]:
def generate_user_profile(user_pref, df, vectorizer):
    authors = '|'.join(user_pref['Authors'])
    genre_matches = df[df['Genre'].isin(user_pref['Genres'])]
    author_matches = df[df['Authors'].str.contains(authors, case=False, na=False)]
    matches = pd.concat([genre_matches, author_matches])
    # Drop duplicate rows if necessary
    matches = matches.drop_duplicates().reset_index(drop=True)
    matches['PagesNumber'] = pd.to_numeric(matches['PagesNumber'], errors='coerce')
    matches = matches[abs(matches['PagesNumber'] - user_pref['Length']) <= 100]

    document = ' '.join(matches['Content'])
    return vectorizer.transform([document]).toarray()

user_pref = {
    'Genres': ['fantasy', 'history', 'sport'],
    'Authors': ['J.K Rowling', 'Rick Riordan', 'Enid Blyton'],
    'Length': 750
}
user_profile = generate_user_profile(user_pref, book_data, vectorizer)

In [63]:
def jaccard_similarity(vec1, vec2):
    intersection = np.sum(np.minimum(vec1, vec2))
    union = np.sum(np.maximum(vec1, vec2))
    return intersection / union if union != 0 else 0

In [86]:
def get_top_15(book_data, book_vectors, user_profile, threshold):
    max_review_count = max(book_data['CountsOfReview'])
    similarities = {}
    for index, vector in enumerate(book_vectors):
        review_count = book_data['CountsOfReview'].iloc[index]
        # books must have 1000 reviews
        if review_count < threshold:
            continue
        similarity = jaccard_similarity(user_profile, vector.toarray())
        normalized_reviews = (review_count)/(max_review_count)
        # weight the similarity by their review count
        similarities[index] = (0.8* similarity) + (0.2 * normalized_reviews)

    sorted_books = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:15]
    sorted_book_names = []
    for book in sorted_books:
        name = book_data.iloc[book[0]]['Name']
        review_count = book_data.iloc[book[0]]['CountsOfReview']
        sorted_book_names.append((name, book[1], review_count))
    return sorted_book_names

top_15 = get_top_15(book_data, book_vectors, user_profile, 1000)

print(top_15)

[('The Hunger Games (The Hunger Games, #1)', 0.24030500172317318, 154447), ('The Help', 0.15493479863738488, 76040), ('The Guernsey Literary and Potato Peel Pie Society', 0.1136515983729588, 37690), ('Breaking Dawn (Twilight, #4)', 0.1086330417265057, 43652), ('The Girl with the Dragon Tattoo (Millennium, #1)', 0.09800682275850958, 52225), ("The Wise Man's Fear (The Kingkiller Chronicle, #2)", 0.08836198245272052, 16523), ('The Shack', 0.085208623011604, 29204), ('If I Stay (If I Stay, #1)', 0.08190066959408698, 28961), ('The Host (The Host, #1)', 0.08100641727715584, 37517), ('People of the Book', 0.07960682503571334, 10076), ('The Snowball: Warren Buffett and the Business of Life', 0.07958423741731037, 1137), ('With the Old Breed: At Peleliu and Okinawa', 0.07930117088942179, 1135), ('Cutting for Stone', 0.07929683069346098, 21865), ('Barracoon: The Story of the Last "Black Cargo"', 0.07863800738304008, 2071), ('Autobiography of a Yogi', 0.07816734453822494, 1807)]
