In [23]:
import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm import tqdm










In [2]:
books = pd.read_csv("/Users/venkatvive/Documents/projects/reading-recommender/data/processed/books_processed.csv")

In [5]:
classifier = pipeline('text-classification',
                       model="j-hartmann/emotion-english-distilroberta-base",
                         top_k=None,
                         device = "mps")











Device set to use mps


In [8]:
sentences = books['description'][0].split(".")
predictions = classifier(sentences)

In [10]:
sorted(predictions[0], key=lambda x: x['label'])

[{'label': 'anger', 'score': 0.009156349115073681},
 {'label': 'disgust', 'score': 0.002628474263474345},
 {'label': 'fear', 'score': 0.06816212832927704},
 {'label': 'joy', 'score': 0.04794240742921829},
 {'label': 'neutral', 'score': 0.1403856724500656},
 {'label': 'sadness', 'score': 0.0021221607457846403},
 {'label': 'surprise', 'score': 0.7296027541160583}]

In [11]:
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

In [12]:
isbn = []


In [13]:
emotion_scores = {label: [] for label in emotion_labels}

In [18]:
def calculate_max_emotion(description, emotion_labels):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for d in description:
        sorted_predictions = sorted(d, key=lambda x: x['label'])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]['score'])

    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}


In [24]:
full_isbn = []
full_emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    full_isbn.append(books['isbn13'][i])
    sentences = books['description'][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion(predictions, emotion_labels)
    for label in emotion_labels:
        full_emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [6:59:38<00:00,  4.84s/it]     


In [28]:
len(full_emotion_scores)

7

In [29]:
emotions_df = pd.DataFrame(full_emotion_scores)
emotions_df['isbn13'] = full_isbn

In [33]:
emotions_df.shape

(5197, 8)

In [34]:
books = pd.merge(books, emotions_df, on='isbn13')

In [35]:
books.head(10)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,anger,disgust,fear,joy,neutral,sadness,surprise
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,0.064134,0.273591,0.928169,0.932798,0.646215,0.967158,0.729603
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web A Novel,9780002261982 A new 'Christie for Christmas' -...,0.612619,0.348286,0.942528,0.704421,0.88794,0.11169,0.252545
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",0.064134,0.104007,0.972321,0.767238,0.549477,0.11169,0.078765
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,0.351484,0.150722,0.360707,0.251881,0.732686,0.11169,0.078765
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",0.081412,0.184495,0.095043,0.040564,0.884389,0.475881,0.078765
5,9780006380832,0006380832,Empires of the Monsoon,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0,Empires of the Monsoon A History of the Indian...,9780006380832 Until Vasco da Gama discovered t...,0.232225,0.727175,0.051363,0.043376,0.621393,0.11169,0.271902
6,9780006470229,000647022X,The Gap Into Madness,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",http://books.google.com/books/content?id=4oXav...,A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0,The Gap Into Madness Chaos and Order,9780006470229 A new-cover reissue of the fourt...,0.538185,0.155855,0.747428,0.872566,0.712195,0.407999,0.078765
7,9780006472612,0006472613,Master of the Game,Sidney Sheldon,Adventure stories,http://books.google.com/books/content?id=TkTYp...,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,489.0,43540.0,Master of the Game,9780006472612 Kate Blackwell is an enigma and ...,0.064134,0.104007,0.404496,0.040564,0.549477,0.820283,0.234488
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079 Tricked once more by his wily ha...,0.30067,0.279481,0.915524,0.040564,0.840289,0.354459,0.135615
9,9780006483014,0006483011,The Once and Future King,Terence Hanbury White,Arthurian romances,http://books.google.com/books/content?id=Jx6Bv...,An omnibus volume of the author's complete sto...,1996.0,4.04,823.0,2805.0,The Once and Future King,9780006483014 An omnibus volume of the author'...,0.064134,0.177927,0.051363,0.040564,0.860372,0.11169,0.078765


In [39]:
books.to_csv("/Users/venkatvive/Documents/projects/reading-recommender/data/processed/books_with_emotions.csv", index=False)

In [52]:
def isPalindrome(x: int) -> bool:
        x_string = str(x)
        print(len(x_string))
        if len(x_string) == 1:
            return True 
        elif x % 2 == 0:
            return False 
        elif x < 0: 
            return False
        else: 
            for i in range(len(x_string)//2):
                print(x_string[i], x_string[len(x_string)-1-i])
                if x_string[i] != x_string[-i]:
                    return False
            return True

In [89]:
def romanToInt(s: str) -> int:
        roman_mapping = {
            "I":1,
            "IV":4,
            "V":5,
            "IX":9,
            "X":10,
            "XL":40,
            "L":50,
            "XC":90,
            "C":100,
            "CD":400,
            "D":500,
            "CM":900,
            "M":1000,
        }
        output = 0
        while len(s) > 0:
            print(s)
            if s[0] < s[1]: 
                output+=roman_mapping[s[0:2]]
                s = s.replace(s[0:2],'')
            else:
                output+=roman_mapping[s[0]]
                s = s.replace(s[0],'')
        return output

In [120]:
import numpy as np
def feature_scaling(data: np.ndarray) -> (np.ndarray, np.ndarray):
    # Your code here
    standardized_data = []
    normalized_data = []
    transposed_data = []
    N = len(data) # number of samples / rows
    D = len(data[0]) # number of features / dimensions / cols

    # Create a transposed array
    for f in range(D):
        feature_transposed = []
        for s in range(N):
            feature_transposed.append(data[s][f])
        transposed_data.append(feature_transposed)
    print(transposed_data)
    standardized_transpose = []
    normalized_transpose = []
    #retrieve values for standardization and normalization
    for f in range(D):
        mean = sum(transposed_data[f])/N
        max_x = max(transposed_data[f])
        min_x = min(transposed_data[f])
        stdev = 0
        std_numerator = 0
        for s in range(N):
            std_numerator += (transposed_data[f][s] - mean)**2
            print(std_numerator)
        stdev = ((std_numerator)/N)**0.5
        standardized_feature = []
        normalized_feature = []
        for s in range(N):
            standardized_feature.append((transposed_data[f][s]-mean)/stdev)
            normalized_feature.append((transposed_data[f][s]-min_x)/(max_x-min_x))
        standardized_transpose.append(standardized_feature)
        normalized_transpose.append(normalized_feature)

    #Lastly, transpose the arrays
    for s in range(N):
        standardized_sample = []
        normalized_sample = []
        for f in range(D):
            standardized_sample.append(standardized_transpose[f][s])
            normalized_sample.append(normalized_transpose[f][s])
        standardized_data.append(standardized_sample)
        normalized_data.append(normalized_sample)

    return standardized_data, normalized_data

In [122]:
# Definition for singly-linked list.
class ListNode:
     def __init__(self, val=0, next=None):
         self.val = val
         self.next = next
class Solution:
    def mergeTwoLists(self, list1: Optional[ListNode], list2: Optional[ListNode]) -> Optional[ListNode]:
        

SyntaxError: incomplete input (3018563861.py, line 7)

In [125]:
data = pd.read_csv('/Users/venkatvive/Documents/projects/reading-recommender/data/processed/amazon_ratings.csv')

In [126]:
data.columns

Index(['user_id', 'isbn', 'rating'], dtype='object')