In [None]:
import ast
import os
import sys
import time

import gender_guesser.detector as gender
import html2text
import pandas as pd
import semantria
from tqdm import tqdm

# Semantria consumer key and secret.
# The SEMANTRIA_KEY / SEMANTRIA_SECRET environment variables take precedence so
# credentials can be supplied without editing the notebook.
# NOTE(review): the literal fallbacks below are real-looking credentials that are
# committed with the notebook - rotate them and drop the fallbacks once every
# environment sets the variables.
consumerKey = os.environ.get("SEMANTRIA_KEY", "b28c7f8e-b506-4355-a484-6f77190eaad7")
consumerSecret = os.environ.get("SEMANTRIA_SECRET", "0fbae06e-b8a5-4240-b0b1-2efaeb2db89b")

# Initializes new session with the keys.
session = semantria.Session(consumerKey, consumerSecret)

# Set Global Variables
subscription = session.getSubscription()
# gender detector shared by guessGender
gender_det = gender.Detector()
# html2text converter used to clean html text format; links are dropped from its output
h = html2text.HTML2Text()
h.ignore_links = True

In [None]:
# Given a full name, guessGender returns girl, boy or None if gender is undefined
def guessGender(name):
    """Map the first word of a full name to "girl", "boy" or None.

    The first whitespace-separated word of ``name`` is passed to the shared
    ``gender_det`` detector. "female"/"mostly_female" guesses map to "girl",
    "male"/"mostly_male" guesses map to "boy", and every other guess maps to None.

    Args:
        name: full name as a string; may be empty or None.

    Returns:
        "girl", "boy", or None when the name has no words or the guess is
        anything other than the four values above.
    """
    # An empty / whitespace-only name has no first word to look up.
    parts = name.split() if name else []
    if not parts:
        return None

    guess = gender_det.get_gender(parts[0])
    if guess in ("female", "mostly_female"):
        return "girl"
    if guess in ("male", "mostly_male"):
        return "boy"
    return None
    

In [None]:
#Given a semantria Doc, getSemantriaResults will return a json set with the results
def getSemantriaResults(doc, poll_interval=3, max_wait=None):
    """Queue a batch of documents on Semantria and collect the processed results.

    Args:
        doc: list of documents passed to ``session.queueBatch``.
        poll_interval: seconds to sleep between polls while results are missing.
        max_wait: optional upper bound, in seconds, on the total polling time.
            None (the default) keeps polling until enough results arrived.

    Returns:
        A list with the processed documents, or None when queueBatch reports a
        status other than 200/202. If ``max_wait`` elapses first, the results
        gathered so far are returned (possibly fewer than ``len(doc)``).
    """
    #send documents to semantria
    status = session.queueBatch(doc)
    if status not in (200, 202):
        print("error in processing")
        return None

    #retrieve documents, accumulating every batch instead of keeping only the last one
    resultsSet = []
    deadline = None if max_wait is None else time.monotonic() + max_wait
    while True:
        batch = session.getProcessedDocuments()
        # Only list responses are kept; presumably the SDK may hand back a
        # non-list value (e.g. a status code) when nothing is ready - verify
        # against the SDK. Calling len() on such a value would raise.
        if isinstance(batch, list):
            resultsSet.extend(batch)
        if len(resultsSet) >= len(doc):
            break
        if deadline is not None and time.monotonic() >= deadline:
            print("timed out waiting for Semantria results")
            break
        #wait and try again in case the results are still being processed
        time.sleep(poll_interval)
    return resultsSet

In [None]:
#normalize the values of a dictionary (values become percentages of the total, 0 to 100)
def normalizeDictionary(dic):
    """Rescale the values of ``dic`` in place so that they are percentages of their sum.

    Each value becomes ``round((100.0 / total) * value, 3)``. When the values
    sum to zero every value is set to 0.0 instead of dividing by zero.

    Args:
        dic: dictionary with numeric values; it is modified in place.

    Returns:
        The same dictionary object, after normalization.
    """
    total = sum(dic.values())
    # list() snapshot of the keys: the values are reassigned while looping
    for key in list(dic):
        dic[key] = round((100.0 / total) * dic[key], 3) if total else 0.000
    return dic

In [None]:
#Input csv to be processed. The cells below read its description, authors, titles and
#review_text columns and write their results to the columns:
    #character_in_description - character_in_review - setting_in_description - setting_in_review
csv_file = '100_reviews_for_documentation.csv'
#name of the csv file the results are saved to
output_file = 'first100_char_set_evaluation.csv'
#load the whole input file into a DataFrame
bookReviews = pd.read_csv(csv_file)

In [None]:
#Sanity check of the input file before processing: print the number of rows,
#then display the first rows as the cell output
print(bookReviews.shape[0])
bookReviews.head()

In [None]:
#This block creates the data for the documents from the csv input.
#It assumes that the csv cells with the reviews hold a Python list or dictionary literal.
#documents maps the row position (0, 1, 2, ...) to the list of documents of that book.

#every text is cut to this many characters before it is queued
#(the original note called this a 2040 "words" limit, but the slice counts characters - TODO confirm the real Semantria limit)
MAX_DOC_CHARS = 2040
#reviews stop being added once the next document id would reach this value,
#i.e. at most MAX_DOC_ID - 1 reviews follow the description
MAX_DOC_ID = 50

documents = dict()
reviews = []
k = 0

for index, row in tqdm(bookReviews.iterrows(), total=len(bookReviews)):

    #the description is always the first position in the list (id '0')
    description = h.handle(row['description'])[:MAX_DOC_CHARS]
    documents[k] = [{'id': str(0), 'text': description, "metadata": {"Author": row['authors'], "Book Name": row['titles']}}]

    try:
        n = 1
        #literal_eval only accepts Python literals, so text coming from the csv
        #can no longer execute arbitrary code the way eval() allowed
        reviews = ast.literal_eval(row['review_text'])
        for review in reviews:
            documents[k].append({'id': str(n), 'text': review[:MAX_DOC_CHARS], "metadata": {"Author": row['authors'], "Book Name": row['titles']}})
            n = n + 1
            if n == MAX_DOC_ID:
                break
    #if the cells with the reviews have a wrong formatting, it will throw an exception;
    #the row keeps whatever documents were built so far and processing continues
    except Exception as exc:
        print("Error in processing reviews for row# " + str(index) + " (" + repr(exc) + ")")
    k = k + 1



In [None]:
#Spot-check of the data extraction: pick an index (row number of the csv) and
#compare the printed text lengths, metadata and description with the input file
docIndex = 10
print(docIndex)
#lengths of every text built for that row, smallest first
j = sorted(len(doc['text']) for doc in documents[docIndex])
print(j)
print()
#position 0 holds the book description
first_doc = documents[docIndex][0]
print(first_doc["metadata"])
print()
print(first_doc["text"])

In [None]:
#Brain Block - gets the results from the semantria function and saves the results to the output file
for index in tqdm(range(len(bookReviews))):

    #per-book counters: characters / settings found in the reviews and in the description
    CharacterRev = dict()
    SettingRev = dict()
    CharDescrip = dict()
    SetDescrip = dict()

    results = getSemantriaResults(documents[index])
    #getSemantriaResults returns None when the batch could not be queued;
    #skip the book instead of crashing so the remaining rows are still processed
    if results is None:
        print("Skipping row# " + str(index) + ": no Semantria results")
        continue
    #order by id length, then id, so the ids sort numerically ('0', '1', ..., '10')
    results.sort(key = lambda x: (len (x['id']), x['id']))

    for data in results:

        #id '0' is the book description, every other id is a review
        is_description = data['id'] == '0'
        char_counts = CharDescrip if is_description else CharacterRev
        setting_counts = SetDescrip if is_description else SettingRev

        #get the last name of the author to avoid assigning it to a character
        fullName = data['metadata']['Author']
        names = fullName.split() if isinstance(fullName, str) else []
        #None when the author name is missing or blank; it then matches no entity word
        author = names[-1] if names else None

        #get the characters
        if "entities" in data:
            for entity in data["entities"]:
                if (entity['type'] == 'user' or entity['entity_type'] == 'Person') and author not in entity['title'].split():

                    title = entity['title']
                    #Person entities are counted under their guessed gender;
                    #people whose gender cannot be guessed are left out
                    if entity['entity_type'] == 'Person':
                        personGender = guessGender(title)
                        if personGender is None:
                            continue
                        title = personGender

                    #one count per location of the first mention
                    #NOTE(review): only mentions[0] is counted - confirm later mentions should be ignored
                    occurrences = len(entity['mentions'][0]['locations'])
                    if occurrences:
                        char_counts[title] = char_counts.get(title, 0) + occurrences

        #get the settings
        if "topics" in data:
            for topic in data["topics"]:
                # Semantria treats the settings as "topics"
                if topic['type'] == "concept":
                    setting_counts[topic['title']] = setting_counts.get(topic['title'], 0) + 1

    #save the results per book
    bookReviews.loc[index, 'character_in_description'] = str(normalizeDictionary(CharDescrip))
    bookReviews.loc[index, 'character_in_review'] = str(normalizeDictionary(CharacterRev))
    bookReviews.loc[index, 'setting_in_description'] = str(normalizeDictionary(SetDescrip))
    bookReviews.loc[index, 'setting_in_review'] = str(normalizeDictionary(SettingRev))
    #the file is rewritten after every book, so an interrupted run keeps the finished rows
    bookReviews.to_csv(output_file)

print("Completed")
    