In [1]:
import torch
import torchvision

print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))
print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)


CUDA available: True
GPU: NVIDIA GeForce RTX 2060
Torch version: 2.7.1+cu118
Torchvision version: 0.22.1+cu118


In [2]:
import pandas as pd

books = pd.read_csv('books_with_categories.csv')

In [3]:
from transformers import pipeline
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k = None,
                      device = 0)
classifier("I love this!")

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528691716492176},
  {'label': 'neutral', 'score': 0.0057645998895168304},
  {'label': 'anger', 'score': 0.004419785924255848},
  {'label': 'sadness', 'score': 0.0020923952106386423},
  {'label': 'disgust', 'score': 0.0016119939973577857},
  {'label': 'fear', 'score': 0.0004138521908316761}]]

Now based on the pipeline(sentiment classifier) we want to classify the book description so we can recommend books based on the mood of user

In [6]:
import numpy as np

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

In [7]:


def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key=lambda x: x["label"])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]["score"])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

Test our function

In [8]:
for i in range(10):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [9]:
emotion_scores

{'anger': [0.06413372606039047,
  0.6126183867454529,
  0.06413372606039047,
  0.3514837622642517,
  0.08141253143548965,
  0.23222483694553375,
  0.538183867931366,
  0.06413372606039047,
  0.3006700277328491,
  0.06413372606039047],
 'disgust': [0.27359187602996826,
  0.3482852578163147,
  0.10400673002004623,
  0.15072230994701385,
  0.1844954490661621,
  0.7271748185157776,
  0.15585504472255707,
  0.10400673002004623,
  0.2794812023639679,
  0.17792722582817078],
 'fear': [0.928167998790741,
  0.942527711391449,
  0.9723208546638489,
  0.3607068955898285,
  0.09504333883523941,
  0.05136285722255707,
  0.7474281191825867,
  0.40449631214141846,
  0.9155242443084717,
  0.05136285722255707],
 'joy': [0.9327975511550903,
  0.7044219970703125,
  0.7672370672225952,
  0.2518812119960785,
  0.04056446626782417,
  0.0433758944272995,
  0.8725655674934387,
  0.04056446626782417,
  0.04056446626782417,
  0.04056446626782417],
 'sadness': [0.6462162733078003,
  0.887939453125,
  0.549476385

In [10]:
from tqdm import tqdm

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [03:13<00:00, 26.80it/s]


In [11]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn

In [12]:
emotions_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.064134,0.273592,0.928168,0.932798,0.646216,0.967158,0.729603,9780002005883
1,0.612618,0.348285,0.942528,0.704422,0.887939,0.111690,0.252545,9780002261982
2,0.064134,0.104007,0.972321,0.767237,0.549476,0.111690,0.078766,9780006178736
3,0.351484,0.150722,0.360707,0.251881,0.732685,0.111690,0.078766,9780006280897
4,0.081413,0.184495,0.095043,0.040564,0.884389,0.475881,0.078766,9780006280934
...,...,...,...,...,...,...,...,...
5192,0.148208,0.030643,0.919165,0.255171,0.853722,0.980877,0.030656,9788172235222
5193,0.064134,0.114383,0.051363,0.400263,0.883198,0.111690,0.227765,9788173031014
5194,0.009997,0.009929,0.339218,0.947779,0.375755,0.066685,0.057625,9788179921623
5195,0.064134,0.104007,0.459269,0.759456,0.951104,0.368111,0.078766,9788185300535


In [13]:
books = pd.merge(books, emotions_df, on = "isbn13")

In [14]:
books.to_csv("books_with_emotions.csv", index = False)