In [None]:
import numpy as np
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from transformers.pipelines import pipeline
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_books = pd.read_csv('/Users/daniel/Desktop/AI/Book_Recom/Books_Recommender_Model/Books_FINISH_04-05-25.csv')

In [4]:
df_books = df_books.rename(columns = {'Number_Description' : 'tagged_description'})

## Разработка языковой модели

In [5]:
df_books['tagged_description'].to_csv("tagged_description.txt",
                                      sep = "\n",
                                      index = False,
                                      header =False)

In [19]:
raw_doc = TextLoader("tagged_description.txt").load()
text_splitter = CharacterTextSplitter(chunk_size = 0, chunk_overlap = 0, separator = "\n")
document = text_splitter.split_documents(raw_doc)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [20]:
document[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='43490 Collects poems written by the eleven-year-old muscular dystrophy patient, sharing his feelings and thoughts about his life, the deaths of his siblings, nature, faith, and hope.')

### Эмбеддинг и создание базы данных векторов

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

db_books = Chroma.from_documents( # База данных векоров
    document,
    embeddings,
    persist_directory = "/content/drive/MyDrive/Colab Notebooks/books_recom_model/artifacts"
)

  embeddings = HuggingFaceEmbeddings(
Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [22]:
query = "An adventure novel"
docs = db_books.similarity_search(query, k = 15)
docs

[Document(id='e66e1976-9212-4e6e-841c-7d5d21c376de', metadata={'source': 'tagged_description.txt'}, page_content="17141 #1 New York Times bestselling author Dean Koontz delivers a thrilling novel of suspense and adventure, as the lives of strangers converge around a mystery unfolding high in the Colorado mountains—and the balance of the world begins to tilt….\xa0\xa0In the stillness of a golden September afternoon, deep in the wilderness of the Rockies, a solitary craftsman, Grady Adams, and his magnificent Irish wolfhound Merlin step from shadow into light…and into an encounter with enchantment. That night, through the trees, under the moon, a pair of singular animals will watch Grady's isolated home, waiting to make their approach. \xa0A few miles away, Camillia Rivers, a local veterinarian, begins to unravel the threads of a puzzle that will bring all the forces of a government in peril to her door.\xa0At a nearby farm, long-estranged identical twins come together to begin a descent

Путем фильтрации isbn13 получаем все необходимые сведения о рекомендованной литературе


In [23]:
df_books[df_books["Number"] == int(docs[10].page_content.split()[0].strip())]

Unnamed: 0,Title,Authors,Description,Rating,Category,Link,Number,tagged_description
32024,Peter and the Starcatchers (Peter and the Star...,"By Barry, Dave, Pearson, Ridley, and Call, Gre...",Don'teven think of starting this bookunless yo...,4.76,Fiction,https://raw.github.com/Kosty1703/picture-book/...,40448,40448 Don'teven think of starting this bookunl...


In [None]:
def retrieve_semantic_recommendations(query: str, top_k: int = 10) -> pd.DataFrame:
  recs = db_books.similarity_search(query, k = 50) # Получение рекомендаций

  books_list = []

  for i in range(0, len(recs)):
    books_list += [int(recs[i].page_content.strip('"').split()[0])] # Для каждой рекомендации производится изъятие уникального идентификатора                                                              
  return df_books[df_books["Number"].isin(books_list)] # Фильтрация данных на основе идентификатора с получением сведений о книге

In [25]:
recommendations = retrieve_semantic_recommendations("A book about space exploration, astrophysics and astronomy")

In [26]:
recommendations['Description']

Unnamed: 0,Description
1131,"During the last forty years, human beings have..."
2731,Exploring everything from the moon to distant ...
3948,From the Random House Library of Knowledge com...
4897,Offers a brief history of stargazing and astro...
6410,From the second-century celestial models of Pt...
6461,An introductory illustrated and comprehensive ...
7328,Exploring the Night Sky is aimed at novice sta...
8165,Introduces astrology and the different kinds o...
8716,Foreword by Charles OsgoodIntroduction by Char...
8896,"From the ancients who charted the stars, to Ju..."


## Анализ тональности текста по описанию

In [63]:
sentiment_class = pipeline("text-classification", model = "j-hartmann/emotion-english-distilroberta-base", top_k = None, truncation = True)
sentiment_class("I hate you")

Device set to use cuda:0


[[{'label': 'anger', 'score': 0.6728215217590332},
  {'label': 'disgust', 'score': 0.16644880175590515},
  {'label': 'sadness', 'score': 0.10849989205598831},
  {'label': 'neutral', 'score': 0.037466250360012054},
  {'label': 'fear', 'score': 0.007570746820420027},
  {'label': 'surprise', 'score': 0.004503953270614147},
  {'label': 'joy', 'score': 0.0026888290885835886}]]

In [64]:
books = df_books.copy()

In [65]:
books["Description"][0]

'Collects poems written by the eleven-year-old muscular dystrophy patient, sharing his feelings and thoughts about his life, the deaths of his siblings, nature, faith, and hope.'

In [66]:
sentiment_class(books["Description"][0])

[[{'label': 'sadness', 'score': 0.4991002082824707},
  {'label': 'neutral', 'score': 0.39802300930023193},
  {'label': 'joy', 'score': 0.06579737365245819},
  {'label': 'disgust', 'score': 0.021078508347272873},
  {'label': 'surprise', 'score': 0.007271352224051952},
  {'label': 'anger', 'score': 0.004705924075096846},
  {'label': 'fear', 'score': 0.004023648798465729}]]

In [67]:
sentiment_class(books["Description"][0].split("."))

[[{'label': 'sadness', 'score': 0.5352131128311157},
  {'label': 'neutral', 'score': 0.31357187032699585},
  {'label': 'joy', 'score': 0.1220925971865654},
  {'label': 'disgust', 'score': 0.011858065612614155},
  {'label': 'surprise', 'score': 0.008605828508734703},
  {'label': 'anger', 'score': 0.004562865011394024},
  {'label': 'fear', 'score': 0.004095643293112516}],
 [{'label': 'neutral', 'score': 0.5494765639305115},
  {'label': 'sadness', 'score': 0.1116902083158493},
  {'label': 'disgust', 'score': 0.10400670021772385},
  {'label': 'surprise', 'score': 0.07876556366682053},
  {'label': 'anger', 'score': 0.0641336739063263},
  {'label': 'fear', 'score': 0.05136282742023468},
  {'label': 'joy', 'score': 0.040564440190792084}]]

In [68]:
sentences = books["Description"][0].split(".")
predictions = sentiment_class(sentences)

In [69]:
sorted(predictions[0], key = lambda x: x["label"])

[{'label': 'anger', 'score': 0.004562865011394024},
 {'label': 'disgust', 'score': 0.011858065612614155},
 {'label': 'fear', 'score': 0.004095643293112516},
 {'label': 'joy', 'score': 0.1220925971865654},
 {'label': 'neutral', 'score': 0.31357187032699585},
 {'label': 'sadness', 'score': 0.5352131128311157},
 {'label': 'surprise', 'score': 0.008605828508734703}]

In [70]:
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []

emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
  per_emotion_scores = {label: [] for label in emotion_labels}
  for prediction in predictions:
    sorted_predictions = sorted(prediction, key=lambda x: x["label"])
    for index, label in enumerate(emotion_labels):
      per_emotion_scores[label].append(sorted_predictions[index]["score"])
  return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [71]:
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    isbn.append(books["Number"][i])
    sentences = books["Description"][i].split(".")
    predictions = sentiment_class(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 62814/62814 [44:11<00:00, 23.69it/s]


In [72]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["Number"] = isbn
emotions_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,Number
0,0.064134,0.104007,0.051363,0.122093,0.549477,0.535213,0.078766,43490
1,0.162909,0.116865,0.037274,0.595338,0.631946,0.122385,0.131094,41338
2,0.064134,0.104007,0.051363,0.790234,0.835526,0.111690,0.333908,42451
3,0.420488,0.941071,0.987298,0.248206,0.906683,0.519223,0.285749,33170
4,0.349207,0.510218,0.051363,0.040564,0.549477,0.111690,0.078766,19220
...,...,...,...,...,...,...,...,...
62809,0.148208,0.030643,0.919165,0.255170,0.853722,0.980877,0.030656,1038
62810,0.064134,0.114383,0.051363,0.400263,0.883199,0.111690,0.227765,3568
62811,0.009997,0.009929,0.339218,0.947779,0.375755,0.066685,0.057625,1975
62812,0.064134,0.104007,0.459269,0.759455,0.951104,0.368111,0.078766,3185


In [73]:
books = pd.merge(books, emotions_df, on = "Number")
books

Unnamed: 0,Title,Authors,Description,Rating,Category,Link,Number,tagged_description,anger,disgust,fear,joy,sadness,surprise,neutral
0,Journey Through Heartsongs,"By Stepanek, Mattie J. T.",Collects poems written by the eleven-year-old ...,4.23,Poetry,https://raw.github.com/Kosty1703/picture-book/...,43490,43490 Collects poems written by the eleven-yea...,0.064134,0.104007,0.051363,0.122093,0.549477,0.535213,0.078766
1,In Search of Melancholy Baby,"By Aksyonov, Vassily, Heim, Michael Henry, and...",The Russian author offers an affectionate chro...,3.98,Biography,https://raw.github.com/Kosty1703/picture-book/...,41338,41338 The Russian author offers an affectionat...,0.162909,0.116865,0.037274,0.595338,0.631946,0.122385,0.131094
2,The Dieter's Guide to Weight Loss During Sex,"By Smith, Richard","A humor classic, this tongue-in-cheek diet pla...",4.68,Self-help,https://raw.github.com/Kosty1703/picture-book/...,42451,"42451 A humor classic, this tongue-in-cheek di...",0.064134,0.104007,0.051363,0.790234,0.835526,0.111690,0.333908
3,Germs : Biological Weapons and America's Secre...,"By Miller, Judith, Engelberg, Stephen, and Bro...","Deadly germs sprayed in shopping malls, bomb-l...",4.84,Nonfiction,https://raw.github.com/Kosty1703/picture-book/...,33170,"33170 Deadly germs sprayed in shopping malls, ...",0.420488,0.941071,0.987298,0.248206,0.906683,0.519223,0.285749
4,The Good Book: Reading the Bible with Mind and...,"By Gomes, Peter J.","""The Bible and the social and moral consequenc...",3.79,Religion,https://raw.github.com/Kosty1703/picture-book/...,19220,"19220 ""The Bible and the social and moral cons...",0.349207,0.510218,0.051363,0.040564,0.549477,0.111690,0.078766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62809,Mistaken Identity,Nayantara Sahgal,On A Train Journey Home To North India After L...,2.93,Fiction,http://books.google.com/books/content?id=q-tKP...,1038,1038 On A Train Journey Home To North India Af...,0.148208,0.030643,0.919165,0.255170,0.853722,0.980877,0.030656
62810,Journey to the East,Hermann Hesse,This book tells the tale of a man who goes on ...,3.70,Adventure,http://books.google.com/books/content?id=rq6JP...,3568,3568 This book tells the tale of a man who goe...,0.064134,0.114383,0.051363,0.400263,0.883199,0.111690,0.227765
62811,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,"Wisdom to Create a Life of Passion, Purpose, a...",3.82,Self-help,http://books.google.com/books/content?id=c_7mf...,1975,"1975 Wisdom to Create a Life of Passion, Purpo...",0.009997,0.009929,0.339218,0.947779,0.375755,0.066685,0.057625
62812,I Am that. Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,This collection of the timeless teachings of o...,4.51,Nonfiction,http://books.google.com/books/content?id=Fv_JP...,3185,3185 This collection of the timeless teachings...,0.064134,0.104007,0.459269,0.759455,0.951104,0.368111,0.078766
