In [13]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import pipeline

In [15]:
# Load the categorised book table that the later cells annotate.
books = pd.read_csv(filepath_or_buffer='books_simple_categorised.csv')

In [16]:

# Emotion classifier; top_k=None asks the pipeline for a score on every
# label instead of only the best one.
classifier = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
)

# Smoke test on a single short sentence.
classifier("I love this!")


Device set to use cpu


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528684265911579},
  {'label': 'neutral', 'score': 0.005764583125710487},
  {'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'disgust', 'score': 0.0016119909705594182},
  {'label': 'fear', 'score': 0.00041385277290828526}]]

In [17]:
# Classify each "."-delimited fragment of the first book's description.
classifier(books.loc[0, 'description'].split('.'))

[[{'label': 'surprise', 'score': 0.7296020984649658},
  {'label': 'neutral', 'score': 0.14038600027561188},
  {'label': 'fear', 'score': 0.06816228479146957},
  {'label': 'joy', 'score': 0.04794260859489441},
  {'label': 'anger', 'score': 0.009156366810202599},
  {'label': 'disgust', 'score': 0.0026284765917807817},
  {'label': 'sadness', 'score': 0.002122163539752364}],
 [{'label': 'neutral', 'score': 0.44937002658843994},
  {'label': 'disgust', 'score': 0.27359163761138916},
  {'label': 'joy', 'score': 0.10908330976963043},
  {'label': 'sadness', 'score': 0.09362746775150299},
  {'label': 'anger', 'score': 0.04047830402851105},
  {'label': 'surprise', 'score': 0.026970159262418747},
  {'label': 'fear', 'score': 0.006879047024995089}],
 [{'label': 'neutral', 'score': 0.6462159752845764},
  {'label': 'sadness', 'score': 0.24273329973220825},
  {'label': 'disgust', 'score': 0.04342271760106087},
  {'label': 'surprise', 'score': 0.028300564736127853},
  {'label': 'joy', 'score': 0.014211

In [18]:
# Emotion classes tracked per book. The order only fixes the column order of
# the score tables; it does not have to match the order the classifier uses.
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
isbn = []
emotion_scores = {label: [] for label in emotion_labels}


# NOTE: the function name is misspelled ("claculate"); it is kept unchanged
# because other cells call it by this name.
def claculate_max_emotion_scores(predictions):
    """Return, for each emotion, the highest score seen across predictions.

    Args:
        predictions: Iterable with one entry per classified text. Each entry
            is a list of ``{'label': str, 'score': float}`` dicts, in any
            order, containing at least every label in ``emotion_labels``.

    Returns:
        dict mapping each label in ``emotion_labels`` to the maximum score
        (via ``np.max``) that label received over all entries.

    Raises:
        ValueError: if ``predictions`` is empty.
        KeyError: if an entry has no score for one of ``emotion_labels``.
    """
    predictions = list(predictions)
    if not predictions:
        raise ValueError("predictions must contain at least one entry")

    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        # Look scores up by label rather than by position: sorting the entry
        # alphabetically yields (..., joy, neutral, sadness, surprise), which
        # does not line up with emotion_labels (..., joy, sadness, surprise,
        # neutral) and would swap those three emotions' scores.
        score_by_label = {item['label']: item['score'] for item in prediction}
        for label in emotion_labels:
            per_emotion_scores[label].append(score_by_label[label])

    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}



In [19]:
# Process rows 0-9 only, appending to the shared accumulators.
for book_idx in range(10):
    isbn.append(books['isbn13'][book_idx])
    # One classifier input per "."-delimited fragment of the description.
    fragment_predictions = classifier(books['description'][book_idx].split('.'))
    book_max = claculate_max_emotion_scores(fragment_predictions)
    for emotion in emotion_labels:
        emotion_scores[emotion].append(book_max[emotion])


In [20]:
emotion_scores

{'anger': [np.float64(0.0641336441040039),
  np.float64(0.6126197576522827),
  np.float64(0.0641336441040039),
  np.float64(0.35148438811302185),
  np.float64(0.08141235262155533),
  np.float64(0.2322252243757248),
  np.float64(0.5381842255592346),
  np.float64(0.0641336441040039),
  np.float64(0.3006700277328491),
  np.float64(0.0641336441040039)],
 'disgust': [np.float64(0.27359163761138916),
  np.float64(0.3482847511768341),
  np.float64(0.10400667786598206),
  np.float64(0.1507224589586258),
  np.float64(0.18449543416500092),
  np.float64(0.7271744608879089),
  np.float64(0.155854731798172),
  np.float64(0.10400667786598206),
  np.float64(0.2794816195964813),
  np.float64(0.17792661488056183)],
 'fear': [np.float64(0.9281681180000305),
  np.float64(0.9425276517868042),
  np.float64(0.9723208546638489),
  np.float64(0.3607059419155121),
  np.float64(0.09504333138465881),
  np.float64(0.05136283114552498),
  np.float64(0.7474274635314941),
  np.float64(0.4044976532459259),
  np.float

In [21]:
# Start from empty accumulators so rows added by earlier cells are discarded.
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
isbn = []
emotion_scores = {emotion: [] for emotion in emotion_labels}

# Score every book, with a progress bar over the row positions.
for book_idx in tqdm(range(len(books))):
    isbn.append(books['isbn13'][book_idx])
    # One classifier input per "."-delimited fragment of the description.
    fragment_predictions = classifier(books['description'][book_idx].split('.'))
    book_max = claculate_max_emotion_scores(fragment_predictions)
    for emotion in emotion_labels:
        emotion_scores[emotion].append(book_max[emotion])


100%|██████████| 5197/5197 [14:31<00:00,  5.96it/s]


In [22]:
# One column per emotion, plus the ISBN collected alongside each row of scores.
emotions_df = pd.DataFrame(emotion_scores).assign(isbn13=isbn)

In [23]:
# Attach the emotion columns to the book table by matching on ISBN.
books = books.merge(emotions_df, on='isbn13')

In [24]:
# Persist the enriched table; the row index is not written to the file.
books.to_csv(path_or_buf='books_with_emotions.csv', index=False)

In [27]:
# Reload the saved table and display its category column.
books = pd.read_csv(filepath_or_buffer='books_with_emotions.csv')
books.loc[:, 'simple categories']

0          Fiction
1          Fiction
2          Fiction
3       Nonfiction
4       Nonfiction
           ...    
5192       Fiction
5193    Nonfiction
5194       Fiction
5195    Nonfiction
5196    Nonfiction
Name: simple categories, Length: 5197, dtype: object

In [29]:
# Distinct category names, in sorted order.
sorted(pd.unique(books['simple categories']))

["Children's Fiction", "Children's Nonfiction", 'Fiction', 'Nonfiction']