In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
df = pd.read_csv('dataset/book_data.csv')

In [47]:
df.fillna(value='', inplace=True)
df.drop_duplicates(subset=['book_title'], inplace=True)

In [48]:
df['genres'] = df['genres'].apply(lambda x: x.split('|'))

In [49]:
df['book_authors'] = df['book_authors'].apply(lambda x:x.split('|'))

In [50]:
print(df['book_pages'].unique())

['374 pages' '870 pages' '324 pages' ... '1043 pages' '1613 pages'
 '1776 pages']


In [51]:
df['book_pages'] = pd.to_numeric(df['book_pages'].str.replace(' pages', ''), errors='coerce')


In [9]:
df['book_pages'] = df['book_pages'].astype(str).str.replace(' pages', '').replace('', np.nan).astype(float)

In [11]:
df['book_title'].count()

48483

In [12]:
vectorizer = TfidfVectorizer(stop_words = 'english')

In [13]:
vectorizer

In [14]:
books_desc_vector = vectorizer.fit_transform(df['book_desc'].values.astype(str))

In [15]:
books_desc_vector.shape

(48483, 283853)

In [16]:
all_in_vectors = vectorizer.get_feature_names_out()

In [17]:
all_in_vectors.shape

(283853,)

In [18]:
cos_similarities = cosine_similarity(books_desc_vector)
cos_similarities.shape

(48483, 48483)

In [19]:
cos_similarities[:20]

array([[1.        , 0.00255534, 0.01704172, ..., 0.        , 0.01387131,
        0.0069516 ],
       [0.00255534, 1.        , 0.        , ..., 0.        , 0.00974877,
        0.        ],
       [0.01704172, 0.        , 1.        , ..., 0.        , 0.03106394,
        0.02368588],
       ...,
       [0.01874136, 0.00253853, 0.01766921, ..., 0.        , 0.01297754,
        0.0240523 ],
       [0.02510485, 0.        , 0.0388411 , ..., 0.        , 0.01524865,
        0.        ],
       [0.02402623, 0.0142949 , 0.        , ..., 0.        , 0.00201227,
        0.        ]])

In [20]:
df2 = df[['book_title', 'book_authors', 'book_desc']]

In [21]:
indices = pd.Series(df2.index, index=df['book_title']).drop_duplicates()

In [22]:
indices

book_title
The Hunger Games                                                                                                   0
Harry Potter and the Order of the Phoenix                                                                          1
To Kill a Mockingbird                                                                                              2
Pride and Prejudice                                                                                                3
Twilight                                                                                                           4
                                                                                                               ...  
Taking the Field: A Fan's Quest to Run the Team He Loves                                                       54296
The Baseball Talmud: Koufax, Greenberg, and the Quest for the Ultimate Jewish All-Star Team                    54297
Wilpon's Folly - The Story of a Man, His Fortune, and

In [23]:
def give_recommendation(title, cos_similarities = cos_similarities):
    book_index = indices[title]
    similarity_scores = list(enumerate(cos_similarities[book_index]))
    similarity_scores = sorted(similarity_scores, key = lambda x:x[1], reverse = True)
    top_books = [i[0] for i in similarity_scores[1:5]]
    recommended_books = df2.iloc[top_books]['book_title'].tolist()
    return recommended_books

In [24]:
recs = give_recommendation('1984')
print(recs)

['Snow Ride', 'Premeditated', 'Hiding in the Shadows', 'The Rich Are Different']


### Splitting the genres separated by '|' into a list of genres 

In [21]:
df.columns

Index(['book_authors', 'book_desc', 'book_edition', 'book_format', 'book_isbn',
       'book_pages', 'book_rating', 'book_rating_count', 'book_review_count',
       'book_title', 'genres', 'image_url'],
      dtype='object')

In [25]:
import string

In [26]:
string.ascii_uppercase

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [27]:
df['name_feature'] = ['name_{}'.format(x) for x in df['book_title']]

In [28]:
def raw_text_to_feature(s, sep=' ', join_sep='x', to_include=string.ascii_lowercase, to_include2 = string.ascii_uppercase):
    def filter_word(word):
        return ''.join([c for c in word if (c in to_include or to_include2)])
    return join_sep.join([filter_word(word) for word in s.split(sep)])

df['name_feature'] = df['book_title'].apply(raw_text_to_feature)

In [29]:
df['corpus'] = (pd.Series(df[['book_desc', 'name_feature']]
                .fillna('')
                .values.tolist()
                ).str.join(' '))
df['corpus'] = df['corpus'].fillna('')

In [30]:
df['corpus'].count(), df['name_feature'].count()

(48483, 48483)

In [31]:
df['corpus'].count()

48483

In [32]:
vectorizer2 = TfidfVectorizer(stop_words = 'english')
books_desc_vector2 = vectorizer.fit_transform(df['corpus'].values.astype(str))
all_in_vectors2 = vectorizer.get_feature_names_out()
all_in_vectors2.shape, books_desc_vector2.shape

((312839,), (48483, 312839))

In [33]:
cos_similarities2 = cosine_similarity(books_desc_vector2)

In [34]:
cos_similarities2.shape

(48483, 48483)

In [35]:
def give_recommendation_name_desc(title, cos_similarities = cos_similarities2):
    book_index = indices[title]
    similarity_scores = list(enumerate(cos_similarities[book_index]))
    similarity_scores = sorted(similarity_scores, key = lambda x:x[1], reverse = True)
    top_books = [i[0] for i in similarity_scores[1:10]]
    recommended_books = [df2.iloc[top_books]['book_title'].tolist()]
    print(type(recommended_books[0]))
    res = []
    for i, book in enumerate(recommended_books[0]):
        res.append((book, similarity_scores[i][-1]))
    return res

In [36]:
recs2 = give_recommendation_name_desc('To Kill a Mockingbird')
print(recs2)

<class 'list'>
[('The Sea, the Sea', 1.0), ('Notorious Nora', 0.3004609644286415), ("It's Not about the Bike: My Journey Back to Life", 0.14122755858510808), ('Just Ella', 0.1268142394345682), ("A Hustler's Promise 2", 0.12224691643904045), ('Rush', 0.11534686406408201), ('الحب في المنفى', 0.11319512799862641), ('The Criminal Mastermind Collection, Bks 1-3', 0.11239470741718388), ('Romiette and Julio', 0.11014456050110508)]


In [41]:
type(cos_similarities), type(books_desc_vector), type(cos_similarities2), type(books_desc_vector2)

(numpy.ndarray,
 scipy.sparse.csr.csr_matrix,
 numpy.ndarray,
 scipy.sparse.csr.csr_matrix)

In [42]:
# Convert the numpy arrays to dataframes and save to separate Excel files
pd.DataFrame(cos_similarities).to_excel('cos_similarities.xlsx', index=False)
pd.DataFrame(books_desc_vector.todense()).to_excel('vectorized_books_data.xlsx', index=False)

ValueError: This sheet is too large! Your sheet size is: 48483, 48483 Max sheet size is: 1048576, 16384

In [None]:
arr1 = pd.read_excel('cos_similarities.xlsx').values.squeeze()
sp_mat1 = sp.csr_matrix(pd.read_excel('vectorized_books_data.xlsx').values)

In [None]:
# Load the numpy arrays from the Excel files
arr1 = pd.read_excel('array1.xlsx').values.squeeze()
arr2 = pd.read_excel('array2.xlsx').values.squeeze()
sp_mat1 = sp.csr_matrix(pd.read_excel('sparse_mat1.xlsx').values)
sp_mat2 = sp.csr_matrix(pd.read_excel('sparse_mat2.xlsx').values)

In [None]:
# Convert the sparse matrices to dataframes and save to separate Excel files
pd.DataFrame(sp_mat1.todense()).to_excel('sparse_mat1.xlsx', index=False)
pd.DataFrame(sp_mat2.todense()).to_excel('sparse_mat2.xlsx', index=False)

In [43]:
data = {'cos_similarities':cos_similarities}

In [None]:
'books_TFID_vectorized':books_desc_vector,
        'cos_similarities_title_desc':cos_similarities2,
        'books_TFID_vectorized_title_desc':books_desc_vector2,
        'df_corpus': df2

In [44]:
import pickle

with open('cos_sim_basic_50000.pickle', 'wb') as f:
    pickle.dump(data, f)

MemoryError: 

## Considering other than book_desc for content based recommendation

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
df = df[['book_title', 'book_authors', 'book_desc', 'book_pages', 'book_review_count']]


In [None]:
desc_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english'))])
pages_pipeline = Pipeline([('scaler', StandardScaler())])
authors_pipeline = Pipeline([('tolist', ListToStringTransformer()),
                             ('onehot', OneHotEncoder(handle_unknown='ignore'))])
reviewcount_pipeline = Pipeline([('scaler', StandardScaler())])


In [None]:
class ListToStringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return [' '.join(x) for x in X]

In [None]:
df['book_pages'] = df['book_pages'].reshape(-1, 1)
df['book_pages'].shape

In [None]:
preprocessor = ColumnTransformer([('desc', desc_pipeline, ['book_desc']),
                                  ('pages', pages_pipeline, ['book_pages']),
                                  ('authors', authors_pipeline, ['book_authors']),
                                  ('reviewcount', reviewcount_pipeline, ['book_review_count'])])

In [None]:
features = preprocessor.fit_transform(df)

In [54]:
from sklearn.decomposition import PCA

In [56]:
import nltk
from nltk.corpus import stopwords

# download the stop words
nltk.download('stopwords')

# remove the stop words from the documents
stop_words = set(stopwords.words('english'))
df['genres'] = df['genres'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# transform the genres into a TF-IDF matrix
X = vectorizer.fit_transform(df['genres'])

# perform PCA to reduce the dimensionality of the TF-IDF matrix
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())

# assign a color to each genre
colors = plt.cm.tab10(range(len(genres)))

# create a scatter plot with the genre as the color
fig, ax = plt.subplots(figsize=(10, 8))
for i, genre in enumerate(genres):
    mask = df['genres'].str.contains(genre)
    ax.scatter(X_pca[mask, 0], X_pca[mask, 1], c=colors[i], label=genre, alpha=0.5)
ax.legend()

# set the labels
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('Clusters based on similar genres')

# show the plot
plt.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sajjan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [58]:
unique_genres = set(' '.join(df['genres']).split())
print(unique_genres)

# transform the genres into a TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['genres'])

# perform PCA to reduce the dimensionality of the TF-IDF matrix
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X.toarray())

# create a scatter plot of the principal components
plt.scatter(principal_components[:,0], principal_components[:,1])
plt.show()

{'g', 'Z', '.', '9', 'G', 'U', 'b', 'Q', 'N', '8', '0', 'l', 'w', 'R', 'z', 'K', 'p', 'r', 'J', '6', '1', 'e', 'L', '5', 'B', '4', 'q', 'E', 'c', 'j', 'u', 'H', '3', 'P', 'F', '2', 'C', 'v', 'é', '7', 'W', 'f', 'x', 'X', 'n', 'k', 'V', 'h'}


ValueError: empty vocabulary; perhaps the documents only contain stop words