In [None]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load the raw book table; the path is relative to the notebook's working directory.
df = pd.read_csv('BOOK.csv')

In [None]:
# Genres holds the text of a Python list literal: parse it back into real lists,
# treating missing or blank cells as an empty list.
df['Genres'] = (
    df['Genres']
    .fillna("[]")
    .astype(str)
    .apply(lambda raw: [] if raw.strip() == "" else ast.literal_eval(raw))
)
# Descriptions are forced to plain strings so no NaN reaches the text vectorizers.
df['Description'] = df['Description'].fillna("").astype(str)

In [None]:
# Preview the first rows after the Genres / Description clean-up.
df.head()

Unnamed: 0.1,Unnamed: 0,Book_Name,Author,Rating,Number of Reviews,Price,Description,Listening Time (min),Genres
0,0,Think Like a Monk: The Secret of How to Harnes...,Jay Shetty,4.9,371.0,10080,"Over the past three years, Jay Shetty has beco...",654,"[Audible Audiobooks & Originals, Personal Succ..."
1,1,Ikigai: The Japanese Secret to a Long and Happ...,Héctor García,4.6,3682.0,615,Brought to you by Penguin.,203,"[Audible Audiobooks & Originals, Meditation, S..."
2,2,The Subtle Art of Not Giving a F*ck: A Counter...,Mark Manson,4.4,20306.0,10378,"In this generation-defining self-help guide, a...",317,"[Audible Audiobooks & Originals, Personal Succ..."
3,3,Atomic Habits: An Easy and Proven Way to Build...,James Clear,4.6,4678.0,888,Brought to you by Penguin.,335,"[Audible Audiobooks & Originals, Psychology, S..."
4,4,Life's Amazing Secrets: How to Find Balance an...,Gaur Gopal Das,4.6,4308.0,1005,"Stop going through life, Start growing throug...",385,"[Audible Audiobooks & Originals, Literary Essa..."


In [None]:
# Flag rows whose parsed genre list is empty (falsy), then count them
missing_mask = ~df['Genres'].apply(bool)
missing_mask.sum()

np.int64(2363)

In [None]:
# Independent copies of the rows that have genres and of those that do not
known_df = df.loc[~missing_mask].copy()
missing_df = df.loc[missing_mask].copy()

# Keep only rows with a non-blank description in both subsets
known_df = known_df.loc[known_df['Description'].str.strip().ne("")]
missing_df = missing_df.loc[missing_df['Description'].str.strip().ne("")]

In [None]:
# Stack the descriptions with known_df's rows first and missing_df's rows after,
# so both subsets share one vocabulary and the matrix can later be split by position.
combined_descriptions = pd.concat([known_df['Description'], missing_df['Description']])
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf = vectorizer.fit_transform(combined_descriptions)

In [None]:
# The first len(known_df) rows of the matrix belong to known_df, the rest to missing_df
known_tfidf, missing_tfidf = tfidf[:len(known_df)], tfidf[len(known_df):]

In [None]:

# Cosine similarity: one row per genre-less description, one column per labelled one
sim = cosine_similarity(X=missing_tfidf, Y=known_tfidf)
# Column position (within known_df) of the highest-scoring labelled description per row
most_similar = sim.argmax(axis=1)

In [None]:
# Copy the genre list of the closest labelled book onto each genre-less book;
# most_similar holds positions into known_df, hence the positional .iloc lookup.
missing_df['Genres'] = known_df['Genres'].iloc[most_similar].to_numpy()

In [None]:
# Recombine both subsets into one frame ordered by index label
datas = pd.concat([known_df, missing_df], axis=0).sort_index()

In [None]:
# Frequency of each distinct genre list in the merged frame.
datas['Genres'].value_counts()

Unnamed: 0_level_0,count
Genres,Unnamed: 1_level_1
[Personal Success],1169
"[Science Fiction Anthologies & Short Stories, Fiction Short Stories]",629
[Classic Literature],31
"[Leadership, Business Careers]",25
[Contemporary Romance],21
...,...
"[Women Sleuth Mysteries, International Mystery & Crime, Crime Thrillers]",1
"[Audible Audiobooks & Originals, English Language Learning, Language Learning & Teaching]",1
"[Creativity & Genius, Self-Esteem, Personal Success]",1
"[Classic Action & Adventure, Classic Literature]",1


In [None]:
# Column dtypes and non-null counts of the merged frame.
datas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4462 entries, 0 to 4463
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            4462 non-null   int64  
 1   Book_Name             4462 non-null   object 
 2   Author                4462 non-null   object 
 3   Rating                4462 non-null   float64
 4   Number of Reviews     4041 non-null   float64
 5   Price                 4462 non-null   int64  
 6   Description           4462 non-null   object 
 7   Listening Time (min)  4462 non-null   int64  
 8   Genres                4462 non-null   object 
dtypes: float64(2), int64(3), object(4)
memory usage: 348.6+ KB


In [None]:
# Impute missing review counts with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# operates on an intermediate object, emits a FutureWarning, and stops updating
# `datas` altogether in pandas 3.0.
datas['Number of Reviews'] = datas['Number of Reviews'].fillna(
    datas['Number of Reviews'].median()
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  datas['Number of Reviews'].fillna(datas['Number of Reviews'].median(),inplace=True)


In [None]:
# Show the full description text of the row whose index label is 0
datas.loc[0, 'Description']

'Over the past three years, Jay Shetty has become one of the world’s most popular influencers. One of his clips was the most watched video on Facebook last year, with more than 360 million views. His social media following totals more than 32 million, he has produced more than 400 viral videos which have amassed more than five billion views, and his podcast, On Purpose, is consistently ranked the world’s number one health-related podcast.\xa0\xa0'

In [None]:
# For every row of the TF-IDF matrix, collect up to ten vocabulary terms with the
# highest non-zero weight. The resulting list follows the row order of `tfidf`.
features_names = vectorizer.get_feature_names_out()
top_keywords = []
for doc_vector in tfidf:
  weights = doc_vector.toarray().ravel()
  # Indices of the ten largest weights, highest first
  ranked = weights.argsort()[::-1][:10]
  # Zero-weight terms do not occur in the description, so they are left out
  top_keywords.append([features_names[j] for j in ranked if weights[j] > 0])

In [None]:
# `top_keywords` follows the row order of `tfidf`, i.e. of `combined_descriptions`
# (all known-genre rows first, then the missing-genre rows), whereas `datas` has
# been re-sorted by index. A plain list would be assigned positionally and attach
# keywords to the wrong books, so wrap it in a Series carrying the matching index
# labels and let pandas align it row by row.
datas['Top_Keywords'] = pd.Series(top_keywords, index=combined_descriptions.index)

In [None]:
# Preview the merged frame with the new Top_Keywords column.
datas.head()

Unnamed: 0.1,Unnamed: 0,Book_Name,Author,Rating,Number of Reviews,Price,Description,Listening Time (min),Genres,Top_Keywords
0,0,Think Like a Monk: The Secret of How to Harnes...,Jay Shetty,4.9,371.0,10080,"Over the past three years, Jay Shetty has beco...",654,"[Audible Audiobooks & Originals, Personal Succ...","[million, world, purpose, following, media, po..."
1,1,Ikigai: The Japanese Secret to a Long and Happ...,Héctor García,4.6,3682.0,615,Brought to you by Penguin.,203,"[Audible Audiobooks & Originals, Meditation, S...","[brought, penguin]"
2,2,The Subtle Art of Not Giving a F*ck: A Counter...,Mark Manson,4.4,20306.0,10378,"In this generation-defining self-help guide, a...",317,"[Audible Audiobooks & Originals, Personal Succ...","[positive, stop, generation, truly, self, bett..."
3,3,Atomic Habits: An Easy and Proven Way to Build...,James Clear,4.6,4678.0,888,Brought to you by Penguin.,335,"[Audible Audiobooks & Originals, Psychology, S...","[brought, penguin]"
4,4,Life's Amazing Secrets: How to Find Balance an...,Gaur Gopal Das,4.6,4308.0,1005,"Stop going through life, Start growing throug...",385,"[Audible Audiobooks & Originals, Literary Essa...","[life, growing, stop, going, start]"


In [None]:
# TextBlob sentiment polarity of every description, stored as a new column
datas['Sentiment_Polarity'] = datas['Description'].apply(
    lambda text: TextBlob(text).sentiment.polarity
)


In [None]:
# Raw term counts used as LDA input: English stop words are removed, and terms are
# ignored when they occur in fewer than 10 descriptions (min_df) or in more than
# 90% of them (max_df).
count_vectorizer = CountVectorizer(min_df=10, max_df=0.9, stop_words='english')
count_matrix = count_vectorizer.fit_transform(datas['Description'])

In [None]:
# Five-topic LDA model over the count matrix; the fixed seed makes the fit repeatable
lda = LatentDirichletAllocation(random_state=42, n_components=5)
lda.fit(count_matrix)

In [None]:
# Per-description topic distribution from the fitted LDA model.
lda_topic_distribution = lda.transform(count_matrix)
# Keep only the highest-weighted topic per row, stored as a string label.
datas['LDA_Topic'] = lda_topic_distribution.argmax(axis=1).astype(str)

In [None]:
# Build one text "document" per book for the final TF-IDF similarity model:
# genres, top keywords, dominant LDA topic and sentiment, separated by spaces.
#
# The topic id and the sentiment are written as prefixed word tokens on purpose.
# TfidfVectorizer's default token pattern only keeps tokens of two or more word
# characters, so a bare topic id such as "2" would be dropped entirely, and a raw
# float such as "0.3027777777777778" would be split at the "." into a long digit
# string that practically no other book shares. Neither would contribute a usable
# signal to the similarity scores.
datas['Text_Features'] = (
    datas['Genres'].apply(' '.join) + ' ' +
    datas['Top_Keywords'].apply(' '.join) + ' ' +
    'topic_' + datas['LDA_Topic'].astype(str) + ' ' +
    # Reduce the polarity score to its sign so books can actually share the token
    datas['Sentiment_Polarity'].apply(
        lambda polarity: 'sentiment_positive' if polarity > 0
        else 'sentiment_negative' if polarity < 0
        else 'sentiment_neutral'
    )
)


In [None]:
# Show the combined feature text of the row whose index label is 0
datas.loc[0, 'Text_Features']

'Audible Audiobooks & Originals Personal Success Stress Management Society & Culture million world purpose following media popular health social past number 2 0.3027777777777778'

In [None]:
# TF-IDF over the combined feature text, using TfidfVectorizer's default settings.
# Note: the default token pattern only keeps tokens of two or more word characters,
# so any single-character token in Text_Features is ignored.
final_vectorizer = TfidfVectorizer()
# One row per row of `datas`, in the same order.
feature_matrix = final_vectorizer.fit_transform(datas['Text_Features'])

In [None]:
# Pairwise cosine similarity between all rows of feature_matrix. Rows and columns
# follow the row order of `datas`, so entries are addressed by row position,
# not by index label.
similarity_matrix = cosine_similarity(feature_matrix)
similarity_matrix

array([[1.        , 0.11352337, 0.06115949, ..., 0.03914503, 0.        ,
        0.17042762],
       [0.11352337, 1.        , 0.2466334 , ..., 0.22993036, 0.        ,
        0.14085033],
       [0.06115949, 0.2466334 , 1.        , ..., 0.09851227, 0.        ,
        0.06647739],
       ...,
       [0.03914503, 0.22993036, 0.09851227, ..., 1.        , 0.        ,
        0.06818619],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.17042762, 0.14085033, 0.06647739, ..., 0.06818619, 0.        ,
        1.        ]])

In [None]:
def recommend_by_genre(genre_name,top_n=10,base_title=None):
  """Recommend books that belong to a given genre.

  Reads the module-level `datas` frame (columns 'Genres', 'Book_Name', 'Author',
  'Rating') and the module-level `similarity_matrix`, whose rows and columns are
  in the same order as the rows of `datas`.

  Args:
    genre_name: Genre to filter on; a book matches when this value is an element
      of its 'Genres' list.
    top_n: Maximum number of books to return.
    base_title: Optional 'Book_Name' of a book inside the genre. When given, the
      other books of the genre are ranked by similarity to it; otherwise the
      genre's books are ranked by 'Rating', highest first.

  Returns:
    A DataFrame with the columns 'Book_Name', 'Author' and 'Rating', or a message
    string when the genre has no books or `base_title` is not part of the genre.
  """
  in_genre = datas['Genres'].apply(lambda genres: genre_name in genres)
  genre_books = datas[in_genre]

  if genre_books.empty:
    return f"Not Found ' {genre_name}'."

  columns = ['Book_Name', 'Author', 'Rating']

  if not base_title:
    return genre_books.sort_values(by='Rating',ascending=False).head(top_n)[columns]

  # similarity_matrix is addressed by row POSITION, while the index labels of
  # `datas` may contain gaps, so everything below works on positional masks.
  genre_mask = in_genre.to_numpy(dtype=bool)
  base_mask = genre_mask & (datas['Book_Name'] == base_title).to_numpy(dtype=bool)
  if not base_mask.any():
    return f"Book '{base_title} not found in the Given Genres'{genre_name}'."

  # First matching row, as in a lookup by title
  base_pos = int(base_mask.nonzero()[0][0])
  base_scores = similarity_matrix[base_pos]

  # Candidates are the other books of the genre. The base book is excluded
  # explicitly rather than by assuming it sorts first, since another book with
  # identical features would tie with it at similarity 1.0.
  candidates = [int(pos) for pos in genre_mask.nonzero()[0] if pos != base_pos]
  ranked = sorted(candidates, key=lambda pos: base_scores[pos], reverse=True)

  return datas.iloc[ranked[:top_n]][columns]

In [None]:
# Top-rated books of one genre (no base_title, so results are ordered by Rating).
# Left as the cell's last expression so the notebook renders the returned
# DataFrame as a table instead of print()'s wrapped plain-text dump.
recommend_by_genre('Contemporary Romance')

                                              Book_Name            Author  \
2321        Dirty Rich One Night Stand: Two Years Later  Lisa Renee Jones   
3527                                   Love You Forever  Abbyshek Chandra   
1859                        Jax: Protectors Series # 8,   Teresa Gabelman   
3789   Midnight Alley: The Morganville Vampires, Book 3      Rachel Caine   
953                                            Still Me        Jojo Moyes   
1576                                              Royal    Danielle Steel   
2535                        Reveal: Wicked Ways, Book 2       K. Bromberg   
2488                                    Sharing a Shell      Julian Clary   
718                  Angry God: All Saints High, Book 3         L.J. Shen   
3855  It's OK That You're Not OK: Meeting Grief and ...      Megan Devine   

      Rating  
2321     4.8  
3527     4.8  
1859     4.8  
3789     4.7  
953      4.7  
1576     4.7  
2535     4.7  
2488     4.7  
718      4.7  
38

In [None]:
# Frequency of each distinct genre list (same expression as the earlier value_counts cell).
datas['Genres'].value_counts()

Unnamed: 0_level_0,count
Genres,Unnamed: 1_level_1
[Personal Success],1169
"[Science Fiction Anthologies & Short Stories, Fiction Short Stories]",629
[Classic Literature],31
"[Leadership, Business Careers]",25
[Contemporary Romance],21
...,...
"[Women Sleuth Mysteries, International Mystery & Crime, Crime Thrillers]",1
"[Audible Audiobooks & Originals, English Language Learning, Language Learning & Teaching]",1
"[Creativity & Genius, Self-Esteem, Personal Success]",1
"[Classic Action & Adventure, Classic Literature]",1
