In [1]:
import json

with open("../scripts/all_books_data.json", "r", encoding="utf-8") as f:
    books = json.load(f)

len(books), books[0].keys()

(65224,
 dict_keys(['id', 'title', 'authors', 'shortDescription', 'longDescription', 'category', 'publisher', 'releaseDate', 'pages', 'isbn', 'rating', 'ratingsCount', 'coverImage', 'url', 'reviews']))

In [2]:
import os
os.path.getsize("../scripts/all_books_data.json") / (1024*1024)

263.0325622558594

In [3]:
import pandas as pd

books_df = pd.json_normalize(books, max_level=1)
books_df.head()

Unnamed: 0,id,title,authors,shortDescription,longDescription,category,publisher,releaseDate,pages,isbn,rating,ratingsCount,coverImage,url,reviews
0,205546,0.4,"[{'id': '95271', 'name': 'Mike Lancaster', 'ur...",It’s a brave new world. 'My name is Kyle Strak...,It’s a brave new world. 'My name is Kyle Strak...,"fantasy, science fiction",Egmont UK,2011-01-03,304.0,9781405253048,6.33,3,https://s.lubimyczytac.pl/upload/books/205000/...,https://lubimyczytac.pl/ksiazka/205546/0-4,"[{'author': 'sinnerinc', 'rating': 5, 'text': ..."
1,212894,1222.0,"[{'id': '14014', 'name': 'Anne Holt', 'url': '...",,,"kryminał, sensacja, thriller",Scribner,2012-08-07,,1451634722,5.17,6,https://s.lubimyczytac.pl/upload/default-book-...,https://lubimyczytac.pl/ksiazka/212894/1222,"[{'author': 'annamagdalena', 'rating': 7, 'tex..."
2,192320,1492.0,"[{'id': '14354', 'name': 'Jacques Attali', 'ur...",Rok 1492: pamięta się zwykle o trzech faktach ...,Rok 1492: pamięta się zwykle o trzech faktach ...,historia,Czytelnik,1992-01-01,,8307023076,6.71,7,https://s.lubimyczytac.pl/upload/books/192000/...,https://lubimyczytac.pl/ksiazka/192320/1492,[]
3,232640,1602.0,"[{'id': '45143', 'name': 'Andy Kubert, Richard...",Jest rok 1602. Era elżbietańska zbliża się ku ...,Jest rok 1602. Era elżbietańska zbliża się ku ...,komiksy,Hachette Polska,2014-08-27,224.0,9788377397954,7.06,433,https://s.lubimyczytac.pl/upload/books/232000/...,https://lubimyczytac.pl/ksiazka/232640/1602,"[{'author': 'Tomek G', 'rating': 6, 'text': 'B..."
4,218604,1914.0,"[{'id': '15993', 'name': 'Jean Echenoz', 'url'...","Rok 1914. Rozpoczyna się wojna, która dopiero ...","Rok 1914. Rozpoczyna się wojna, która dopiero ...",literatura piękna,Noir sur Blanc,2014-05-30,78.0,9788373924758,6.65,40,https://s.lubimyczytac.pl/upload/books/218000/...,https://lubimyczytac.pl/ksiazka/218604/1914,"[{'author': 'Elli', 'rating': 10, 'text': 'Po ..."


In [4]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65224 entries, 0 to 65223
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                65224 non-null  object 
 1   title             65224 non-null  object 
 2   authors           65224 non-null  object 
 3   shortDescription  65224 non-null  object 
 4   longDescription   57497 non-null  object 
 5   category          65224 non-null  object 
 6   publisher         63624 non-null  object 
 7   releaseDate       65186 non-null  object 
 8   pages             59048 non-null  float64
 9   isbn              65224 non-null  object 
 10  rating            65224 non-null  float64
 11  ratingsCount      65224 non-null  int64  
 12  coverImage        65224 non-null  object 
 13  url               65224 non-null  object 
 14  reviews           65224 non-null  object 
dtypes: float64(2), int64(1), object(12)
memory usage: 7.5+ MB


In [5]:
reviews_rows = []

for book in books:
    book_id = book["id"]
    book_title = book["title"]

    for review in book.get("reviews", []):
        reviews_rows.append({
            "book_id": book_id,
            "book_title": book_title,
            "review_author": review["author"],
            "reviews_rating": review["rating"],
            "review_text": review["text"],
            "review_date": review["date"],
        })
reviews_df = pd.DataFrame(reviews_rows)
reviews_df.head()


Unnamed: 0,book_id,book_title,review_author,reviews_rating,review_text,review_date
0,205546,0.4,sinnerinc,5.0,Ze względu na temat dosyć się nakręciłem na tą...,09.06.2015
1,212894,1222.0,annamagdalena,7.0,Katastrofa kolejowa. Szczęśliwie wszyscy pasaż...,03.10.2016
2,212894,1222.0,artdeco,5.0,"Nie jestem pod dużym wrażeniem ani śniegu, ani...",08.06.2014
3,232640,1602.0,Tomek G,6.0,Bardzo spodobała mi się idea umieszczenia akcj...,11.06.2025
4,232640,1602.0,darkakadarka,6.0,Lubię świat Marvela – jestem raczej na bieżąco...,29.10.2024


In [6]:
len(reviews_df)

182576

In [8]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel, MarianMTModel, MarianTokenizer
from huggingface_hub import hf_hub_download
import json
import numpy as np

pl_en_model_name = 'Helsinki-NLP/opus-mt-pl-en'
pl_en_tokenizer = MarianTokenizer.from_pretrained(pl_en_model_name)
pl_en_model = MarianMTModel.from_pretrained(pl_en_model_name)

def translate_pl_to_en(text, batch_size=16):
    results = []
    for i in range(0, len(text), batch_size):
        batch = text[i:i+batch_size]
        inputs = pl_en_tokenizer(batch, return_tensors="pt", truncation=True, padding=True)
        translated = pl_en_model.generate(**inputs)
        results.extend(pl_en_tokenizer.batch_decode(translated, skip_special_tokens=True))
    return results

# Step 1: Define the model architecture
class RobertaForMultiLabelClassification(nn.Module):
    def __init__(self, model_name, num_labels, dropout_rate=0.3, use_mean_pooling=True):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.use_mean_pooling = use_mean_pooling
        hidden_size = self.roberta.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_size // 2, num_labels)
    def mean_pooling(self, token_embeddings, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        if self.use_mean_pooling:
            pooled_output = self.mean_pooling(outputs.last_hidden_state, attention_mask)
        else:
            pooled_output = outputs.pooler_output
        x = self.dropout1(pooled_output)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout2(x)
        logits = self.fc2(x)
        return logits
# Step 2: Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "Lakssssshya/roberta-large-goemotions"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# Load config
config_path = hf_hub_download(repo_id=model_name, filename="config.json")
with open(config_path, 'r') as f:
    config = json.load(f)
model = RobertaForMultiLabelClassification(
    model_name='roberta-large',
    num_labels=config['num_labels'],
    dropout_rate=config.get('dropout_rate', 0.3),
    use_mean_pooling=config.get('use_mean_pooling', True)
)
# Load weights
weights_path = hf_hub_download(repo_id=model_name, filename="pytorch_model.bin")
state_dict = torch.load(weights_path, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()
# Load thresholds
thresholds_path = hf_hub_download(repo_id=model_name, filename="optimal_thresholds.json")
with open(thresholds_path, 'r') as f:
    thresholds = np.array(json.load(f))
# Emotion labels
emotion_labels = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse',
    'sadness', 'surprise', 'neutral'
]

model.to(device)
model.eval()

eng2pl = {
    "admiration": "podziw",
    "amusement": "rozbawienie",
    "anger": "gniew",
    "annoyance": "irytacja",
    "approval": "zatwierdzenie",
    "caring": "troska",
    "confusion": "zmieszanie",
    "curiosity": "ciekawość",
    "desire": "pragnienie",
    "disappointment": "rozczarowanie",
    "disapproval": "dezaprobata",
    "disgust": "wstręt",
    "embarrassment": "zakłopotanie",
    "excitement": "ekscytacja",
    "fear": "strach",
    "gratitude": "wdzięczność",
    "grief": "żal",
    "joy": "radość",
    "love": "miłość",
    "nervousness": "nerwowość",
    "optimism": "optymizm",
    "pride": "duma",
    "realization": "uzmysłowienie",
    "relief": "ulga",
    "remorse": "wyrzuty sumienia",
    "sadness": "smutek",
    "surprise": "zaskoczenie",
    "neutral": "neutralne"
}

# Step 3: Predict
def predict_emotions(text, batch_size=16):
    all_predictions = []
    for i in range(0, len(text), batch_size):
        batch = text[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            logits = model(**inputs)
            probabilities = torch.sigmoid(logits).cpu().numpy()

        batch_predictions = []
        for row in range(len(batch)):
            row_probs = probabilities[row]
            predictions = (row_probs >= thresholds).astype(int)
            emotions_en = [emotion_labels[i] for i, pred in enumerate(predictions) if pred == 1]
            emotions_pl = [eng2pl[emotion] for emotion in emotions_en]
            batch_predictions.append(emotions_pl)
        all_predictions.extend(batch_predictions)
    return all_predictions

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import json
import time
from datetime import timedelta
from collections import Counter

def process_books_with_emotions(books, output_file="emotions_books.json", 
                                  checkpoint_file="checkpoint.json",
                                  save_every=200, start_from=0,
                                  translation_batch=64, emotion_batch=128):
    
    processed_books = []
    total_books = len(books)
    start_time = time.time()
    local_counter = 0
    
    # Wczytaj poprzedni postęp jeśli istnieje
    if start_from > 0:
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                processed_books = json.load(f)
            print(f"Wznowiono od książki {start_from}, załadowano {len(processed_books)} książek")
        except FileNotFoundError:
            print("Nie znaleziono poprzedniego pliku, zaczynam od nowa")
            start_from = 0
    
    for i, book in enumerate(books[start_from:], start=start_from + 1):
        local_counter += 1

        elapsed = time.time() - start_time
        avg_time = elapsed / (i - start_from) if i > start_from else 0
        remaining = avg_time * (total_books - i) if i > start_from else 0
        eta_text = str(timedelta(seconds=int(remaining))) if i > start_from else "calculating..."
        percent = (i / total_books) * 100

        reviews = [r.get("text", "").strip() for r in book.get("reviews", []) if r.get("text", "").strip()]

        print(f"[{i}/{total_books}] ({percent:.2f}%) ETA: {eta_text} | "
            f"Processing: {book.get('title', 'Unknown')[:50]} with {len(reviews)} reviews")

        if not reviews:
            # Książka bez recenzji
            book_copy = { **book, "dominant_emotion": ["neutral"] }
            processed_books.append(book_copy)
            print(f"  Dominant emotions: ['neutral'] (brak recenzji)")
        else:
            try:
                reviews_en = translate_pl_to_en(reviews, batch_size=translation_batch)
                emotions_list = predict_emotions(reviews_en, batch_size=emotion_batch)

                all_emotions = []
                for original_review, emotions in zip(book["reviews"], emotions_list):
                    original_review["emotions"] = emotions
                    all_emotions.extend(emotions)

                dominant_emotions = [e for e, _ in Counter(all_emotions).most_common(5)] if all_emotions else ["neutral"]
                book_copy = { **book, "dominant_emotion": dominant_emotions }
                processed_books.append(book_copy)
                print(f"  Dominant emotions: {dominant_emotions}")

            except Exception as e:
                book_copy = { **book, "dominant_emotion": ["neutral"], "error": str(e) }
                processed_books.append(book_copy)
                print(f"  ERROR processing reviews: {e}")

        # Zapis co save_every książek lub na końcu
        if local_counter % save_every == 0 or i == total_books:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(processed_books, f, ensure_ascii=False, indent=2)
            with open(checkpoint_file, "w", encoding="utf-8") as f:
                json.dump({"last_processed": i}, f)
            print(f"  ✓ Saved progress: {len(processed_books)} books to {output_file}")

    
    # Finalne zapisanie
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(processed_books, f, ensure_ascii=False, indent=2)
    
    print(f"\n{'='*60}")
    print(f"COMPLETED! Processed {len(processed_books)} books")
    print(f"Total time: {timedelta(seconds=int(time.time() - start_time))}")
    print(f"Output saved to: {output_file}")
    print(f"{'='*60}")
    
    return processed_books


try: 
    with open("checkpoint.json", "r", encoding="utf-8") as f:
        checkpoint = json.load(f)
        start_from = checkpoint.get("last_processed", 0)
        print(f"Wznowiono od książki {start_from}")
except FileNotFoundError:
    start_from = 0
    print("Nie znaleziono checkpointu, zaczynam od nowa")

processed_books = process_books_with_emotions(
    books, 
    output_file="emotions_books.json",
    checkpoint_file="checkpoint.json",
    start_from=start_from,
    save_every=50,
    translation_batch=64,
    emotion_batch=128 
)


Wznowiono od książki 1200
Wznowiono od książki 1200, załadowano 1200 książek
[1201/65224] (1.84%) ETA: 1:04:52 | Processing: A Novel. Poland with 0 reviews
  Dominant emotions: ['neutral'] (brak recenzji)
[1202/65224] (1.84%) ETA: 0:32:26 | Processing: A Numbers Game with 0 reviews
  Dominant emotions: ['neutral'] (brak recenzji)
[1203/65224] (1.84%) ETA: 0:21:37 | Processing: À Paris. Deuxième partie with 0 reviews
  Dominant emotions: ['neutral'] (brak recenzji)
[1204/65224] (1.85%) ETA: 0:16:13 | Processing: A Perfect Blood with 2 reviews


In [19]:
import json

with open("emotions_books.json", "r", encoding="utf-8") as f:
    emotions_books = json.load(f)
len(emotions_books), emotions_books[0].keys()

(1000,
 dict_keys(['id', 'title', 'authors', 'shortDescription', 'longDescription', 'category', 'publisher', 'releaseDate', 'pages', 'isbn', 'rating', 'ratingsCount', 'coverImage', 'url', 'reviews', 'dominant_emotion']))

In [20]:
from collections import Counter

emotion_counter = Counter()

for book in emotions_books:
    emotions = book.get("dominant_emotion", [])
    for e in emotions:
        emotion_counter[e] += 1

emotion_counter

Counter({'neutral': 420,
         'neutralne': 388,
         'podziw': 325,
         'zatwierdzenie': 267,
         'dezaprobata': 108,
         'rozczarowanie': 80,
         'ciekawość': 78,
         'miłość': 48,
         'zmieszanie': 40,
         'wdzięczność': 31,
         'zaskoczenie': 30,
         'radość': 30,
         'rozbawienie': 25,
         'irytacja': 23,
         'uzmysłowienie': 15,
         'smutek': 14,
         'strach': 12,
         'ekscytacja': 10,
         'pragnienie': 4,
         'zakłopotanie': 4,
         'wyrzuty sumienia': 3,
         'optymizm': 3,
         'troska': 3,
         'wstręt': 2,
         'ulga': 2,
         'duma': 1,
         'nerwowość': 1})