 #                             BOOK RECOMMENDATION MODEL

### LOADING AND EXPLORING THE DATA

In [2]:
import pandas as pd
df=pd.read_csv('BooksDatasetClean.csv')
df.head(10)

Unnamed: 0,Title,Authors,Description,Category,Publisher,Price Starting With ($),Publish Date (Month),Publish Date (Year)
0,Goat Brothers,"By Colton, Larry",,"History , General",Doubleday,8.79,January,1993
1,The Missing Person,"By Grumbach, Doris",,"Fiction , General",Putnam Pub Group,4.99,March,1981
2,Don't Eat Your Heart Out Cookbook,"By Piscatella, Joseph C.",,"Cooking , Reference",Workman Pub Co,4.99,September,1983
3,When Your Corporate Umbrella Begins to Leak: A...,"By Davis, Paul D.",,,Natl Pr Books,4.99,April,1991
4,Amy Spangler's Breastfeeding : A Parent's Guide,"By Spangler, Amy",,,Amy Spangler,5.32,February,1997
5,The Foundation of Leadership: Enduring Princip...,"By Short, Bo",,,Excalibur Press,6.06,January,1997
6,Chicken Soup for the Soul: 101 Stories to Open...,"By Canfield, Jack (COM) and Hansen, Mark Victo...",,"Self-help , Personal Growth , Self-Esteem",Health Communications Inc,4.99,May,1993
7,Journey Through Heartsongs,"By Stepanek, Mattie J. T.",Collects poems written by the eleven-year-old ...,"Poetry , General",VSP Books,19.96,September,2001
8,In Search of Melancholy Baby,"By Aksyonov, Vassily, Heim, Michael Henry, and...",The Russian author offers an affectionate chro...,"Biography & Autobiography , General",Random House,4.99,June,1987
9,Christmas Cookies,"By Eakin, Katherine M. and Deaman, Joane (EDT)",,"Cooking , General",Oxmoor House,12.98,June,1986


In [3]:
df.isnull().sum()

Title                          0
Authors                        0
Description                32859
Category                   26161
Publisher                      8
Price Starting With ($)        0
Publish Date (Month)           0
Publish Date (Year)            0
dtype: int64

In [4]:
pd.DataFrame(df['Category'].value_counts())
#2938 unique categories

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
"Fiction , General",2549
"Fiction , Literary",1709
"Fiction , Mystery & Detective , General",1690
"Fiction , Thrillers , General",1115
"Fiction , Romance , Contemporary",1074
...,...
"Nature , Ecosystems & Habitats , Mountains",1
"Language Arts & Disciplines , Etymology",1
"Religion , Antiquities & Archaeology",1
"Law , Legal Education",1


In [5]:
pd.DataFrame(df['Authors'].value_counts())
#39279 unique authors

Unnamed: 0_level_0,count
Authors,Unnamed: 1_level_1
By,1043
"By Roberts, Nora",195
By Time-Life Books,172
By unknown,122
"By ""Better Homes and Gardens""",121
...,...
"By Chan, Kit",1
"By McCannon, John and Jordan, Pam",1
"By Pettit, Stephen",1
"By Shone, Rob (ILT), Royston, Angela, and Forsey, Chris (ILT)",1


In [6]:
df1=df.dropna(inplace=True)

In [7]:
df1=df.reset_index(drop=True)

In [8]:
df1.to_csv('merged_books_clean.csv', index=False)

In [9]:
#Explore the books dataset
print("Reviews Dataset:")
print(df.info())

Reviews Dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 65296 entries, 7 to 103062
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Title                    65296 non-null  object 
 1   Authors                  65296 non-null  object 
 2   Description              65296 non-null  object 
 3   Category                 65296 non-null  object 
 4   Publisher                65296 non-null  object 
 5   Price Starting With ($)  65296 non-null  float64
 6   Publish Date (Month)     65296 non-null  object 
 7   Publish Date (Year)      65296 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 4.5+ MB
None


In [10]:
df.isnull().sum()

Title                      0
Authors                    0
Description                0
Category                   0
Publisher                  0
Price Starting With ($)    0
Publish Date (Month)       0
Publish Date (Year)        0
dtype: int64

## PREPROCESSING USING SPACY

In [11]:
import spacy
import re

# Load the English tokenizer, tagger, parser, and NER
nlp = spacy.load("en_core_web_sm", disable=['ner'])  # Disabling Named Entity Recognition for faster processing

def preprocess_text(text):
    # Parse the text using SpaCy
    doc = nlp(text)
    
    # Lemmatize and remove stopwords and special characters
    preprocessed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space and not token.is_digit]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(preprocessed_tokens)
    
    # Remove remaining special characters using regex
    preprocessed_text = re.sub(r'[^a-zA-Z\s]', '', preprocessed_text)
    
    return preprocessed_text

# Example usage:
text = "@#$ i love pancakes and honey"
preprocessed_text = preprocess_text(text)
print("Original Text:", text)
print("Preprocessed Text:", preprocessed_text)


Original Text: @#$ i love pancakes and honey
Preprocessed Text:  love pancake honey


## CREATING A WORD SOUP

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the dataset containing information about books
books_df = pd.read_csv('merged_books_clean.csv')

# Apply preprocessing to all textual features
text_features = ['Category','Authors','Title']
for feature in text_features:
    books_df[feature] = books_df[feature].fillna('').apply(preprocess_text)

In [13]:
import pandas as pd
pd.DataFrame(books_df[feature]).head(10)

Unnamed: 0,Title
0,Journey heartsong
1,Search Melancholy Baby
2,Dieter Guide Weight Loss sex
3,germ Biological Weapons America Secret War
4,Good Book read Bible Mind heart
5,shoutin
6,Hill Rat blow Lid Congress
7,personality cat
8,Betrayal Clinton Administration Undermined Ame...
9,Shadow Song


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Combine all textual features into a single feature
books_df['combined_text'] =books_df['Category'] + ' ' + books_df['Authors'] 

# Define chunk size
chunk_size = 10000

# Initialize an empty array to store the similarity scores
similarity_scores = []

# Initialize variable to keep track of maximum number of features
max_features = 0

# Iterate over the dataset in chunks
for i in range(0, len(books_df), chunk_size):
    # Extract chunk of data
    chunk = books_df['combined_text'][i:i+chunk_size].tolist()

    # Use TF-IDF vectorization to convert textual features into numerical representations
    tfidf_vectorizer = TfidfVectorizer()
    combined_tfidf = tfidf_vectorizer.fit_transform(chunk)

    # Keep track of maximum number of features
    max_features = max(max_features, combined_tfidf.shape[1])

    # Calculate similarity scores within the chunk
    chunk_similarity_matrix = cosine_similarity(combined_tfidf, combined_tfidf)
    
    # Append similarity scores to the list
    similarity_scores.append(chunk_similarity_matrix)

# Pad smaller chunks with zeros along dimension 1
for i in range(len(similarity_scores)):
    if similarity_scores[i].shape[1] < max_features:
        pad_width = ((0, 0), (0, max_features - similarity_scores[i].shape[1]))
        similarity_scores[i] = np.pad(similarity_scores[i], pad_width, mode='constant')

# Concatenate similarity scores from all chunks
similarity_matrix = np.concatenate(similarity_scores, axis=0)
np.save('similarity_matrix.npy', similarity_matrix)


## content based recommendation model

In [20]:

# Implement a function to recommend similar items based on user input
def recommend_books_based_on_input(user_input, similarity_matrix, books_df, top_n=5):
    # Preprocess user input
    user_input = preprocess_text(user_input)

    # Find the index of the book that matches the user input
    matching_indices = books_df.index[books_df['combined_text'].str.contains(user_input, case=False)]

    # Calculate the average similarity scores for matching books
    average_similarity_scores = []
    for idx in matching_indices:
        similarity_scores = similarity_matrix[idx]
        average_similarity_scores.append(sum(similarity_scores) / len(similarity_scores))

    # Sort matching books by average similarity scores
    sorted_indices = [x for _, x in sorted(zip(average_similarity_scores, matching_indices), reverse=True)]

    # Recommend top N similar books
    recommended_books = books_df.iloc[sorted_indices[:top_n]]
    return recommended_books[['Title', 'Authors']]


user_input = input("Please enter a keyword or phrase: ")
recommended_books = recommend_books_based_on_input(user_input, similarity_matrix, books_df)
print(recommended_books)

                                                   Title      Authors
35617                   Baby Bear Treasury Stories young             
34638                                       Children Men    James P D
4168                         Imzadi Star Trek generation  David Peter
19276  restoration Star Trek New Frontier Excalibur Book  David Peter
44791                   fire High Star Trek New Frontier  David Peter


In [None]:
import joblib



# Optionally, save other necessary data
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
np.save('similarity_matrix.npy', similarity_matrix)
books_df.to_csv('books_data.csv', index=False)
