# Version -01 

In [148]:
import pandas as pd
import numpy as np
import requests

import os
import ast # to convert string into list

from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches
import joblib

from sentence_transformers import SentenceTransformer

### # Fetch data from your backend

In [149]:
# takes a list of nested JSON objects (like your books_data) and flattens them into a tabular DataFrame
# The sep='_' parameter ensures that nested fields get column names like field_subfield instead of keeping them as dictionaries

url = "http://localhost:8080/api/v1/product/books/all"
response = requests.get(url)
books_json = response.json() # parses the JSON response from the server into a Python dictionary.

In [150]:
# print(books_json)

### Extract the 'data' field

In [151]:
books_data = books_json['data']

### Flatten JSON, including nested objects

In [152]:
book_df = pd.json_normalize(books_data, sep='_')

In [153]:
book_df.head()

Unnamed: 0,id,title,author,publisher,category,genres,description,keywords,language,isbn
0,b911c087-5a19-4fc1-8bff-9d59c17ba370,The Fellowship of the Ring,J.R.R. Tolkien,Allen & Unwin,Novel,"[{'name': 'Fantasy', 'id': 1}, {'name': 'Epic'...","The first part of the epic trilogy, following ...","[frodo, ring, fellowship, quest, fantasy]",English,9780261103573
1,b0a44b63-a35f-4b10-b4fb-632ddeb39647,A Game of Thrones,George R.R. Martin,Bantam Books,Novel,"[{'name': 'Fantasy', 'id': 1}, {'name': 'Epic'...",Noble families vie for control of the Iron Thr...,"[throne, noble, families, fantasy, epic]",English,9780553103540
2,bb2c36dc-1116-454e-8224-b2e03655a7f6,The Name of the Wind,Patrick Rothfuss,DAW Books,Novel,"[{'name': 'Fantasy', 'id': 1}, {'name': 'Adven...",Kvothe recounts his journey from a gifted chil...,"[kvothe, magic, music, journey, adventure]",English,9780756404741
3,bb8770e2-dc3c-4416-a5b0-1da15efe9a0d,Dune,Frank Herbert,Chilton Books,Novel,"[{'name': 'Adventure', 'id': 3}, {'name': 'Sci...",The story of Paul Atreides and the desert plan...,"[paul, atreides, arrakis, desert, politics]",English,9780441172719
4,c237c746-6b13-4b60-bb0b-b259d37a18ef,Neuromancer,William Gibson,Ace Books,Novel,"[{'name': 'Science Fiction', 'id': 4}, {'name'...",A washed-up computer hacker is hired for one l...,"[hacker, ai, cyberspace, future, cyberpunk]",English,9780441569595


In [154]:
book_df.columns

Index(['id', 'title', 'author', 'publisher', 'category', 'genres',
       'description', 'keywords', 'language', 'isbn'],
      dtype='object')

In [155]:
book_df.iloc[0]['genres']

[{'name': 'Fantasy', 'id': 1}, {'name': 'Epic', 'id': 2}]

## Data cleaning

##### Convert all list columns into string representation for applying data cleaning functions

In [156]:
print(type(book_df.iloc[0]['genres']))
print(type(book_df.iloc[0]['keywords']))

<class 'list'>
<class 'list'>


In [157]:
for col in book_df:
    if book_df[col].apply(lambda x : isinstance(x, list)).any():
        book_df[col] = book_df[col].apply(lambda x : str(x))
book_df.head()

Unnamed: 0,id,title,author,publisher,category,genres,description,keywords,language,isbn
0,b911c087-5a19-4fc1-8bff-9d59c17ba370,The Fellowship of the Ring,J.R.R. Tolkien,Allen & Unwin,Novel,"[{'name': 'Fantasy', 'id': 1}, {'name': 'Epic'...","The first part of the epic trilogy, following ...","['frodo', 'ring', 'fellowship', 'quest', 'fant...",English,9780261103573
1,b0a44b63-a35f-4b10-b4fb-632ddeb39647,A Game of Thrones,George R.R. Martin,Bantam Books,Novel,"[{'name': 'Fantasy', 'id': 1}, {'name': 'Epic'...",Noble families vie for control of the Iron Thr...,"['throne', 'noble', 'families', 'fantasy', 'ep...",English,9780553103540
2,bb2c36dc-1116-454e-8224-b2e03655a7f6,The Name of the Wind,Patrick Rothfuss,DAW Books,Novel,"[{'name': 'Fantasy', 'id': 1}, {'name': 'Adven...",Kvothe recounts his journey from a gifted chil...,"['kvothe', 'magic', 'music', 'journey', 'adven...",English,9780756404741
3,bb8770e2-dc3c-4416-a5b0-1da15efe9a0d,Dune,Frank Herbert,Chilton Books,Novel,"[{'name': 'Adventure', 'id': 3}, {'name': 'Sci...",The story of Paul Atreides and the desert plan...,"['paul', 'atreides', 'arrakis', 'desert', 'pol...",English,9780441172719
4,c237c746-6b13-4b60-bb0b-b259d37a18ef,Neuromancer,William Gibson,Ace Books,Novel,"[{'name': 'Science Fiction', 'id': 4}, {'name'...",A washed-up computer hacker is hired for one l...,"['hacker', 'ai', 'cyberspace', 'future', 'cybe...",English,9780441569595


In [158]:
print(type(book_df.iloc[0]['genres']))
print(type(book_df.iloc[0]['keywords']))

<class 'str'>
<class 'str'>


In [159]:
book_df.duplicated(keep=False).sum()

np.int64(0)

In [160]:
book_df.isnull().sum()

id             0
title          0
author         0
publisher      0
category       0
genres         0
description    0
keywords       0
language       0
isbn           0
dtype: int64

In [161]:
book_df.iloc[0]

id                          b911c087-5a19-4fc1-8bff-9d59c17ba370
title                                 The Fellowship of the Ring
author                                            J.R.R. Tolkien
publisher                                          Allen & Unwin
category                                                   Novel
genres         [{'name': 'Fantasy', 'id': 1}, {'name': 'Epic'...
description    The first part of the epic trilogy, following ...
keywords       ['frodo', 'ring', 'fellowship', 'quest', 'fant...
language                                                 English
isbn                                               9780261103573
Name: 0, dtype: object

## Data Cleaning Functions

In [167]:
# convert string list into list for fetchign specific df, always pass a string
def genres_string_to_list_convertor(text, no_of_items=None):

    if isinstance(text, list) :
        return text
        
    # text is null return []
    if isinstance(text, float) and pd.isna(text):
        return []
        
    word_list = []
    count=1
    
    for i in ast.literal_eval(text):
        word_list.append(i['name'])
        if no_of_items is not None:
            if count >= no_of_items:
                break
            count += 1
            
    return word_list

# convert string list into list for fetchign specific df, always pass a string
def keywords_string_to_list_convertor(text, no_of_items=None):

    if isinstance(text, list) :
        return text
        
    # text is null return []
    if isinstance(text, float) and pd.isna(text):
        return []
        
    word_list = []
    count=1
    
    for i in ast.literal_eval(text):
        word_list.append(i)
        if no_of_items is not None:
            if count >= no_of_items:
                break
            count += 1
            
    return word_list

# Reomove spaces betweeen names, always pass a list of str
def remove_spaces(text):
    words = []
    
    for i in text:
        words.append(i.replace(" ", ""))
    return words

# convert words in text list into lower, pass a list of str
def lowercase_words(text):
    if isinstance(text, list):
        return text
    return [i.lower() for i in text]

# wraps it into a single-element list, pass str
def to_list(text):
    if isinstance(text, list):
        return text

    if isinstance(text, str):
        return [text]
    else:
        return []
        

### Convert into list, lowercasing, space removing

In [168]:
book_df['genres'] = book_df['genres'].apply(genres_string_to_list_convertor).apply(remove_spaces).apply(lowercase_words)
book_df['keywords'] = book_df['keywords'].apply(keywords_string_to_list_convertor).apply(remove_spaces).apply(lowercase_words)
book_df['description'] = book_df['description'].apply(lambda x : x.split() if isinstance(x, str) else x).apply(lowercase_words)
book_df['author'] = book_df['author'].apply(to_list).apply(remove_spaces).apply(lowercase_words)
book_df['publisher']  = book_df['publisher'].apply(to_list).apply(remove_spaces).apply(lowercase_words)
book_df['category']  = book_df['category'].apply(to_list).apply(remove_spaces).apply(lowercase_words)

book_df.head()

Unnamed: 0,id,title,author,publisher,category,genres,description,keywords,language,isbn
0,b911c087-5a19-4fc1-8bff-9d59c17ba370,The Fellowship of the Ring,[J.R.R.Tolkien],[Allen&Unwin],[Novel],"[Fantasy, Epic]","[The, first, part, of, the, epic, trilogy,, fo...","[frodo, ring, fellowship, quest, fantasy]",English,9780261103573
1,b0a44b63-a35f-4b10-b4fb-632ddeb39647,A Game of Thrones,[GeorgeR.R.Martin],[BantamBooks],[Novel],"[Fantasy, Epic]","[Noble, families, vie, for, control, of, the, ...","[throne, noble, families, fantasy, epic]",English,9780553103540
2,bb2c36dc-1116-454e-8224-b2e03655a7f6,The Name of the Wind,[PatrickRothfuss],[DAWBooks],[Novel],"[Fantasy, Adventure]","[Kvothe, recounts, his, journey, from, a, gift...","[kvothe, magic, music, journey, adventure]",English,9780756404741
3,bb8770e2-dc3c-4416-a5b0-1da15efe9a0d,Dune,[FrankHerbert],[ChiltonBooks],[Novel],"[Adventure, ScienceFiction]","[The, story, of, Paul, Atreides, and, the, des...","[paul, atreides, arrakis, desert, politics]",English,9780441172719
4,c237c746-6b13-4b60-bb0b-b259d37a18ef,Neuromancer,[WilliamGibson],[AceBooks],[Novel],"[ScienceFiction, Cyberpunk]","[A, washed-up, computer, hacker, is, hired, fo...","[hacker, ai, cyberspace, future, cyberpunk]",English,9780441569595


## Load embedding model

In [169]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define weights
weights = {
    'author': 1.0,
    'publisher': 0.5,
    'category': 3.0,
    'genres': 3.0,
    'description': 1.5,
    'keywords': 2.0
}

# Function to create weighted embeddings
def get_weighted_embedding(row, model, weights):
     # convert lists to strings
    parts = {
        'author': " ".join(row['author']),
        'publisher': " ".join(row['publisher']),
        'category': " ".join(row['category']),
        'genres': " ".join(row['genres']),
        'description': " ".join(row['description']),
        'keywords': " ".join(row['keywords'])
    }

    final_embedding = np.zeros(model.get_sentence_embedding_dimension())

    for key, text in parts.items():
        if text.strip(): # skip empty
            emb = model.encode(text)
            final_embedding += weights[key] * emb
    return final_embedding

## Compute weighted embeddings for all movies with a progress bar

In [170]:
from tqdm import tqdm

# empty list to hold embeddings
embedding_list = []

# Iterate over rows with a progress bar
for _, row  in tqdm(book_df.iterrows(), total=len(book_df), desc="Embedding books"):
    # Get the embedding for this row
    emb = get_weighted_embedding(row, model, weights)
    
    # Append to our list
    embedding_list.append(emb)
    
# Convert final list into a NumPy array
embeddings = np.array(embedding_list)  


Embedding books: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.79it/s]


In [171]:
embeddings.shape

(10, 384)

## Cosine similarity, Reccomendation function

In [172]:
similarity_matrix = cosine_similarity(embeddings)

In [173]:
def recommend(book, top_k=3):
    titles = book_df['title'].tolist()

    # Find the closest match (allowing typos / case differences)
    matches = get_close_matches(book, titles, n=1, cutoff=0.6)

    # Checks if the matches list is empty
    if not matches:
        print(f"No close match found for '{movie}'")
        return

    best_match = matches[0]
    # Finds the row number of the movie in new_df whose title exactly matches best_match
    index = book_df[book_df['title'] == best_match].index[0]

    distances = similarity_matrix[index]
    distance_pairs = list(enumerate(distances))
    distance_pairs = sorted(distance_pairs, key=lambda x: x[1], reverse=True)

    # ignore first value (1 is itself same book)
    results = []
    
    print(f"\nResults for: {best_match}\n")

    for i, score in distance_pairs[1: top_k+1]:
        book_title = book_df.iloc[i]['title']
        results.append((book_title, score)) # only accepts one argument
        print(f"{book_title} (Similarity: {score:.3f})")
    return results

In [180]:
recommended_books = recommend('A Game of Throes', 3)


Results for: A Game of Thrones

The Fellowship of the Ring (Similarity: 0.877)
The Name of the Wind (Similarity: 0.797)
Dune (Similarity: 0.669)


In [181]:
recommended_books

[('The Fellowship of the Ring', np.float64(0.8765903127999182)),
 ('The Name of the Wind', np.float64(0.7969221458440172)),
 ('Dune', np.float64(0.6686851352208568))]

In [183]:
os.makedirs('artifacts_v1', exist_ok=True)

joblib.dump(book_df, 'artifacts_v1/book_df.pkl')
joblib.dump(similarity_matrix, 'artifacts_v1/similarity_matrix.pkl')

['artifacts_v1/similarity_matrix.pkl']