# Init

In [197]:
import json
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests

# pprint is used to format the JSON response
from pprint import pprint
from tqdm.auto import tqdm

# Credentials must never live in the notebook: read the subscription key from the
# environment. NOTE(review): a key was previously committed in this cell and should
# be treated as leaked and rotated.
key = os.environ.get("AZURE_TEXT_ANALYTICS_KEY")
if not key:
    raise RuntimeError(
        "Set the AZURE_TEXT_ANALYTICS_KEY environment variable before running this notebook."
    )

# The endpoint is not secret; allow overriding it while keeping the original default.
endpoint = os.environ.get(
    "AZURE_TEXT_ANALYTICS_ENDPOINT",
    "https://earnings-call.cognitiveservices.azure.com",
)

# URL and headers used by every sentiment request below.
sentiment_url = endpoint + "/text/analytics/v3.0/sentiment"
headers = {'Content-Type': 'application/json', 'Ocp-Apim-Subscription-Key': key}

# parse

## get response

In [213]:
# Load the full feature table once, then carve out one (docid, text) frame per
# transcript section; each frame is fed to the sentiment request loop separately.
targets_df = pd.read_feather(
    'data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text_norm.feather'
)

text_present, text_qa = (
    targets_df[['docid', section_col]]
    for section_col in ('text_present', 'text_qa')
)

In [216]:
%%time

def _post_sentiment_batch(request_docs):
    """Send one batch of documents to the sentiment endpoint and return its results.

    Args:
        request_docs: list of dicts with 'id', 'language' and 'text' keys.

    Returns:
        The list stored under 'documents' in the JSON response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        AssertionError: if the response reports per-document errors.
    """
    response = requests.post(sentiment_url, headers=headers, json={'documents': request_docs})
    # Surface HTTP-level failures directly; otherwise an error payload without an
    # 'errors' key would show up below as an opaque KeyError.
    response.raise_for_status()
    response = response.json()

    # check parse errors
    assert len(response['errors']) == 0, f"There are errors, please check!\n{response['errors']}"

    return response['documents']


def get_response(text_df, max_errors=1):
    """Run sentiment analysis over every text in `text_df`, batching the requests.

    Each text is split with `chunks(text, 5120)`; chunks of 10 characters or fewer
    are skipped. The remaining chunks are queued as documents with id
    '<docid>,<chunk index>' and sent in batches of 10. Whatever is left in the
    queue after the last row is sent as a final, smaller batch.
    NOTE(review): 5120 characters per document and 10 documents per request look
    like the service limits for this API version — confirm against the Azure docs.

    Args:
        text_df: DataFrame with exactly two columns, (docid, text), in that order.
        max_errors: the loop stops as soon as more than this many exceptions have
            been caught. The default of 1 keeps the original threshold.

    Returns:
        List of per-chunk result dicts as returned by the service. If the loop was
        stopped early or a batch failed, the list is partial.
    """
    results = []
    request_docs = []
    error_num = 0
    aborted = False

    for _, docid, text in tqdm(text_df.itertuples(), total=len(text_df)):
        # Bound before the try block so the error message below can always print it,
        # even if chunking itself fails.
        chunk_idx = None
        try:
            text_chunks = list(chunks(text, 5120))
            for chunk_idx, text_chunk in enumerate(text_chunks):

                # if one chunk contains 10 characters or fewer, skip it.
                if len(text_chunk) <= 10:
                    continue

                request_docs.append({'id': f'{docid},{chunk_idx}', 'language': 'en', 'text': text_chunk})

                # send as soon as the batch is full
                if len(request_docs) == 10:
                    # Detach the batch before sending: if the request fails, the queue
                    # is already empty, so one bad batch cannot overflow the next one.
                    # The failed batch is dropped and reported by the handler below.
                    batch, request_docs = request_docs, []
                    results.extend(_post_sentiment_batch(batch))

        except Exception as e:
            # print exceptions
            error_num += 1
            print(f'error_num={error_num}. Exception caught at docid={docid}, chunk_idx={chunk_idx}!\n')
            print(e)

            # if too many errors, stop
            if error_num > max_errors:
                print(f'error_num > {max_errors}, stop loop!')
                aborted = True
                break

    # Flush the trailing partial batch. Doing this after the loop (rather than on the
    # "last chunk of the last row") also covers the cases where that last chunk was
    # skipped for being too short or the last row produced no chunks at all.
    if request_docs and not aborted:
        try:
            results.extend(_post_sentiment_batch(request_docs))
        except Exception as e:
            error_num += 1
            print(f'error_num={error_num}. Exception caught while sending the final batch!\n')
            print(e)

    print(f'Exceptions encounted = {error_num}')
    return results
        
    

# The presentation section is disabled here; only the Q&A section is requested.
# present_response = get_response(text_present)
# sv('present_response')

# Request sentiment for every Q&A text. This is the expensive step of the
# notebook: the recorded run below reports a wall time of about six hours.
qa_response = get_response(text_qa)
# NOTE(review): `sv` is not defined in this part of the notebook — presumably it
# persists the variable with the given name to disk; confirm before relying on it.
sv('qa_response')

HBox(children=(FloatProgress(value=0.0, max=21763.0), HTML(value='')))


Exceptions encounted = 0
Wall time: 6h 7min 3s


## format response

In [218]:
def format_response(response, save_name):
    '''
    Flatten sentiment results to one row per sentence and save them as parquet.

    Args:
        response: iterable of per-chunk result dicts. Each needs an 'id' of the
            form '<docid>,<chunk index>', a 'sentiment' label, 'confidenceScores'
            with 'positive'/'neutral'/'negative', and a 'sentences' list whose
            items carry 'sentiment', 'confidenceScores' and 'text'.
        save_name: file stem; the table is written to "data/<save_name>.parquet".

    Returns:
        pyarrow.Table (not a pandas DataFrame) with one row per sentence. The
        chunk-level fields are repeated on every sentence row of that chunk.
    '''
    columns = ['docid', 'chunk_i', 'chunk_sentiment', 'chunk_positive', 'chunk_neutral',
               'chunk_negative', 'sentence_i', 'sentence_sentiment', 'sentence_positive',
               'sentence_neutral', 'sentence_negative', 'sentence']
    rows = []

    print(f'Parsing response...')
    for chunk in response:
        # Split on the LAST comma only, so a docid that itself contains commas
        # stays intact. Both parts remain strings, as before.
        docid, chunk_i = chunk['id'].rsplit(',', 1)
        chunk_scores = chunk['confidenceScores']

        for sentence_i, sentence in enumerate(chunk['sentences']):
            sentence_scores = sentence['confidenceScores']
            rows.append((docid, chunk_i, chunk['sentiment'], chunk_scores['positive'],
                         chunk_scores['neutral'], chunk_scores['negative'], sentence_i,
                         sentence['sentiment'], sentence_scores['positive'],
                         sentence_scores['neutral'], sentence_scores['negative'],
                         sentence['text']))

    # save as parquet
    print('Saving as parquet...')
    sentiment = pd.DataFrame(rows, columns=columns)
    sentiment_df = pa.Table.from_pandas(sentiment, preserve_index=False)
    pq.write_table(sentiment_df, f"data/{save_name}.parquet")
    print(f'File saved as "{save_name}.parquet"')

    # return the formatted table (a pyarrow.Table despite the `_df` name)
    return sentiment_df

# The presentation section is disabled here; only the Q&A results are formatted.
# present_sentiment = format_response(present_response, 'present_sentiment')
# Flatten the Q&A results to sentence level and write data/qa_sentiment.parquet.
# The returned object is a pyarrow Table, not a pandas DataFrame.
qa_sentiment = format_response(qa_response, 'qa_sentiment')

Parsing response...
Saving as parquet...
File saved as "qa_sentiment.parquet"
