In [16]:
import pandas as pd

books = pd.read_csv('books_with_categories.csv')
print(books.head())

          isbn13      isbn10                title  \
0  9780002005883  0002005883               Gilead   
1  9780002261982  0002261987         Spider's Web   
2  9780006178736  0006178731       Rage of angels   
3  9780006280897  0006280897       The Four Loves   
4  9780006280934  0006280935  The Problem of Pain   

                           authors                     categories  \
0               Marilynne Robinson                        Fiction   
1  Charles Osborne;Agatha Christie  Detective and mystery stories   
2                   Sidney Sheldon                        Fiction   
3              Clive Staples Lewis                 Christian life   
4              Clive Staples Lewis                 Christian life   

                                           thumbnail  \
0  http://books.google.com/books/content?id=KQZCP...   
1  http://books.google.com/books/content?id=gA5GP...   
2  http://books.google.com/books/content?id=FKo2T...   
3  http://books.google.com/books/content?i

In [17]:
#using an emotion based model
from transformers import pipeline
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      return_all_scores=True,
                      top_k = None,
                        device = 0)
classifier("I love this!")

Device set to use cpu


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528684265911579},
  {'label': 'neutral', 'score': 0.005764586851000786},
  {'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'disgust', 'score': 0.0016119900392368436},
  {'label': 'fear', 'score': 0.0004138521908316761}]]

In [18]:
books['description'][0]

'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world ha

In [19]:
classifier(books['description'][0])

[[{'label': 'fear', 'score': 0.6548399925231934},
  {'label': 'neutral', 'score': 0.1698525995016098},
  {'label': 'sadness', 'score': 0.11640939861536026},
  {'label': 'surprise', 'score': 0.02070068009197712},
  {'label': 'disgust', 'score': 0.019100721925497055},
  {'label': 'joy', 'score': 0.015161462128162384},
  {'label': 'anger', 'score': 0.003935154061764479}]]

In [20]:
#but book has a mix of other emotions
#split the book down to individual sentences
classifier(books['description'][0].split('.'))

[[{'label': 'surprise', 'score': 0.729602038860321},
  {'label': 'neutral', 'score': 0.14038607478141785},
  {'label': 'fear', 'score': 0.06816229224205017},
  {'label': 'joy', 'score': 0.04794258251786232},
  {'label': 'anger', 'score': 0.009156374260783195},
  {'label': 'disgust', 'score': 0.002628477755934},
  {'label': 'sadness', 'score': 0.0021221644710749388}],
 [{'label': 'neutral', 'score': 0.44937077164649963},
  {'label': 'disgust', 'score': 0.27359139919281006},
  {'label': 'joy', 'score': 0.10908306390047073},
  {'label': 'sadness', 'score': 0.09362738579511642},
  {'label': 'anger', 'score': 0.040478240698575974},
  {'label': 'surprise', 'score': 0.02697017788887024},
  {'label': 'fear', 'score': 0.0068790484219789505}],
 [{'label': 'neutral', 'score': 0.6462157964706421},
  {'label': 'sadness', 'score': 0.24273352324962616},
  {'label': 'disgust', 'score': 0.04342266544699669},
  {'label': 'surprise', 'score': 0.028300534933805466},
  {'label': 'joy', 'score': 0.014211485

In [21]:
sentences = books['description'][0].split('.')
predictions = classifier(sentences)

In [22]:
predictions[0]

[{'label': 'surprise', 'score': 0.729602038860321},
 {'label': 'neutral', 'score': 0.14038607478141785},
 {'label': 'fear', 'score': 0.06816229224205017},
 {'label': 'joy', 'score': 0.04794258251786232},
 {'label': 'anger', 'score': 0.009156374260783195},
 {'label': 'disgust', 'score': 0.002628477755934},
 {'label': 'sadness', 'score': 0.0021221644710749388}]

In [23]:
predictions[3]

[{'label': 'fear', 'score': 0.9281681180000305},
 {'label': 'anger', 'score': 0.03219093754887581},
 {'label': 'neutral', 'score': 0.01280868798494339},
 {'label': 'sadness', 'score': 0.008756876923143864},
 {'label': 'surprise', 'score': 0.008597906678915024},
 {'label': 'disgust', 'score': 0.008431830443441868},
 {'label': 'joy', 'score': 0.0010455821175128222}]

In [24]:
#sort by score
#for each of the emotions we want the same exact order
sorted(predictions[0], key = lambda x:x['label'])

[{'label': 'anger', 'score': 0.009156374260783195},
 {'label': 'disgust', 'score': 0.002628477755934},
 {'label': 'fear', 'score': 0.06816229224205017},
 {'label': 'joy', 'score': 0.04794258251786232},
 {'label': 'neutral', 'score': 0.14038607478141785},
 {'label': 'sadness', 'score': 0.0021221644710749388},
 {'label': 'surprise', 'score': 0.729602038860321}]

In [28]:
import numpy as np

In [25]:
#extract the maximum probability for each emotion for each description
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
isbn = [] #created to merge with the dataset
emotion_scores = {label : [] for label in emotion_labels} #contains all of the scores for every single description

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels} #hold all the descriptions for a particular descriptions
    for prediction in predictions:#loop over the predictions
        sorted_predictions = sorted(prediction, key = lambda x:x['label'])
        for index, label in enumerate(emotion_labels): #for per each of the emotions
            per_emotion_scores[label].append(sorted_predictions[index]['score']) #extract the score and append it using the correct label
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}


In [30]:
for i in range(10): #testing for the first 10 books
    isbn.append(books['isbn13'][i]) #take the isbn13 for the book and append to the isbn list
    sentences = books['description'][i].split('.')
    predictions = classifier(sentences)
    #then pass to the emotion classifier
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

In [31]:
emotion_scores

{'anger': [np.float64(0.06413355469703674),
  np.float64(0.6126194000244141),
  np.float64(0.06413355469703674),
  np.float64(0.35148441791534424),
  np.float64(0.08141238987445831),
  np.float64(0.23222465813159943),
  np.float64(0.5381842255592346),
  np.float64(0.06413355469703674),
  np.float64(0.3006700873374939),
  np.float64(0.06413355469703674)],
 'disgust': [np.float64(0.27359139919281006),
  np.float64(0.3482847809791565),
  np.float64(0.10400658845901489),
  np.float64(0.1507224589586258),
  np.float64(0.18449527025222778),
  np.float64(0.7271749377250671),
  np.float64(0.155854731798172),
  np.float64(0.10400658845901489),
  np.float64(0.279481440782547),
  np.float64(0.17792704701423645)],
 'fear': [np.float64(0.9281681180000305),
  np.float64(0.9425276517868042),
  np.float64(0.9723208546638489),
  np.float64(0.36070606112480164),
  np.float64(0.09504339843988419),
  np.float64(0.05136274918913841),
  np.float64(0.7474274635314941),
  np.float64(0.40449756383895874),
  np

In [32]:
from tqdm import tqdm
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
isbn = [] #created to merge with the dataset
emotion_scores = {label : [] for label in emotion_labels} #contains all of the scores for every single description

for i in tqdm(range(len(books))): #testing for the first 10 books
    isbn.append(books['isbn13'][i]) #take the isbn13 for the book and append to the isbn list
    sentences = books['description'][i].split('.')
    predictions = classifier(sentences)
    #then pass to the emotion classifier
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [34:30<00:00,  2.51it/s]  


In [37]:
#pass the results of the dictionary to a pandas dataframe
emotions_df = pd.DataFrame(emotion_scores)
emotions_df['isbn13'] = isbn

In [38]:
emotions_df #creates a separate column for each of the emotions

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.064134,0.273591,0.928168,0.932798,0.646216,0.967158,0.729602,9780002005883
1,0.612619,0.348285,0.942528,0.704422,0.887940,0.111690,0.252546,9780002261982
2,0.064134,0.104007,0.972321,0.767238,0.549477,0.111690,0.078765,9780006178736
3,0.351484,0.150722,0.360706,0.251881,0.732685,0.111690,0.078765,9780006280897
4,0.081412,0.184495,0.095043,0.040564,0.884390,0.475880,0.078765,9780006280934
...,...,...,...,...,...,...,...,...
5192,0.148208,0.030643,0.919165,0.255172,0.853721,0.980877,0.030656,9788172235222
5193,0.064134,0.114383,0.051363,0.400263,0.883198,0.111690,0.227765,9788173031014
5194,0.009997,0.009929,0.339218,0.947779,0.375754,0.066685,0.057625,9788179921623
5195,0.064134,0.104007,0.459268,0.759456,0.951104,0.368111,0.078765,9788185300535


In [39]:
#then merge this into the books dataframe
books = pd.merge(books, emotions_df, on='isbn13')

In [40]:
#create a histogram of the emotions

In [41]:
books.to_csv('books_with_emotions.csv', index=False)