In [5]:
import csv
import json
from time import time
from typing import Dict, List, Optional

import pandas as pd
from pandas import DataFrame

from Utils.ibm_nl import check_texts

In [182]:
def drop_and_write_header():
    """Reset the output files used by the organization-checking run.

    Truncates ``orgs_ibm_checked.csv`` and writes only its header row, then
    overwrites ``docs_counter.tmp`` with a zeroed sentiment tally. Both files
    are created in the current working directory.
    """
    # The csv module requires newline='' on the file object; without it the
    # writer's '\r\n' terminator is translated again on some platforms and
    # every row ends in '\r\r\n'.
    with open('orgs_ibm_checked.csv','w',encoding='utf8',newline='') as orgs_file:
        field_names = [
            'uuid',
            'organization',
            'sentiment_score',
            'sentiment_label',
            'org_relevance',
            'org_count',
            'confidence',
            'orgs_in_doc',
        ]
        writer = csv.DictWriter(orgs_file,fieldnames=field_names)
        writer.writeheader()

    counter = {
        # Organizations with confidence of at least 0.6 (the tally code compares
        # with >=) and at most 5 organizations in the same document.
        'le_5_orgs_conf_gt_06': {
            'negative': 0,
            'positive': 0,
            'neutral': 0,
        },
        # Every other organization that has a sentiment label.
        'others': {
            'negative': 0,
            'positive': 0,
            'neutral': 0,
        }
    }

    with open('docs_counter.tmp','w') as counter_file:
        json.dump(counter,counter_file)


In [274]:
def write_orgs(orgs:List[Dict]):
    """Append organization rows to the CSV and update the persisted sentiment tally.

    Reads the running tally from ``docs_counter.tmp``, appends every dict in
    ``orgs`` as a row of ``orgs_ibm_checked.csv``, prints the updated tally and
    writes it back to ``docs_counter.tmp``.

    Args:
        orgs: Row dicts whose keys are a subset of the CSV field names listed
            below (``csv.DictWriter`` raises ``ValueError`` on unknown keys).

    Raises:
        FileNotFoundError: If ``docs_counter.tmp`` does not exist yet.
        KeyError: If a row's ``sentiment_label`` is not one of the keys stored
            in the tally.
    """
    with open('docs_counter.tmp','r') as counter_file:
        counter = json.load(counter_file)

    # newline='' is required by the csv module so that the writer's own
    # '\r\n' terminator is not translated a second time.
    with open('orgs_ibm_checked.csv','a',encoding='utf8',newline='') as orgs_file:
        field_names = [
            'uuid',
            'organization',
            'sentiment_score',
            'sentiment_label',
            'org_relevance',
            'org_count',
            'confidence',
            'orgs_in_doc',
        ]
        writer = csv.DictWriter(orgs_file,fieldnames=field_names)
        for org in orgs:
            sentiment = org.get('sentiment_label')
            confidence = org.get('confidence')
            orgs_in_doc = org.get('orgs_in_doc')
            # Test the numbers against None instead of truthiness: a confidence
            # of exactly 0.0 is a real value and must still be tallied.
            if sentiment and confidence is not None and orgs_in_doc is not None:
                if (confidence >= 0.6) and (orgs_in_doc <= 5):
                    counter['le_5_orgs_conf_gt_06'][sentiment] += 1
                else:
                    counter['others'][sentiment] += 1
            # Every row is written, including the ones left out of the tally.
            writer.writerow(org)
        print(counter,'\n')

    with open('docs_counter.tmp','w') as counter_file:
        json.dump(counter,counter_file)

In [201]:
# Load the source documents; later cells read the `uuid` and `text` columns of this frame.
docs = pd.read_csv('csvs/docs_for_normalized_orgs.csv')

In [208]:
def get_rest_texts(docs: DataFrame, processed_uids: Optional[List[str]]=None):
    """Return ``(uuid, text)`` pairs for documents that have not been processed yet.

    Args:
        docs: Frame with at least the columns ``uuid`` and ``text``.
        processed_uids: ``uuid`` values to exclude. ``None`` (the default)
            excludes nothing.

    Returns:
        A list of ``(uuid, text)`` tuples in the row order of ``docs``.
    """
    # A None sentinel replaces the former mutable ``[]`` default.
    if processed_uids is None:
        processed_uids = []

    rest_docs = docs[~docs['uuid'].isin(processed_uids)]

    # Pair the two columns directly instead of a row-wise apply() that was
    # only used for its side effect of appending to a list.
    return list(zip(rest_docs['uuid'], rest_docs['text']))

In [232]:
def get_done_texts():
    """Return the distinct ``uuid`` values already present in ``orgs_ibm_checked.csv``.

    Returns:
        A list of unique ``uuid`` values in no particular order.

    Raises:
        FileNotFoundError: If ``orgs_ibm_checked.csv`` does not exist.
    """
    # Only the uuid column is needed, so skip parsing the other columns.
    checked = pd.read_csv('orgs_ibm_checked.csv', usecols=['uuid'])

    return list(set(checked['uuid']))

In [268]:
# Same read as the earlier `docs = pd.read_csv(...)` cell: reloads the frame from the same CSV path.
docs = pd.read_csv('csvs/docs_for_normalized_orgs.csv')

In [278]:
rest_docs = get_rest_texts(docs, get_done_texts())
total = len(rest_docs)
step = min(1000, total)
start = time()

for i in range(0,total,step):
    batch_start = time()
    batch = rest_docs[i:i+step]
    result = await check_texts(batch, 20)
    write_orgs(result)
    print(f'{i+step} of {total} processed\n',f'Batch time: {time()-batch_start} s\n',f'Total time: {time()-start} s\n')


{'le_5_orgs_conf_gt_06': {'negative': 2956, 'positive': 2305, 'neutral': 1911}, 'others': {'negative': 5349, 'positive': 10391, 'neutral': 12555}} 

1000 of 46233 processed
 Batch time: 87.81618332862854 s
 Total time: 87.81630444526672 s


{'le_5_orgs_conf_gt_06': {'negative': 3531, 'positive': 2689, 'neutral': 2207}, 'others': {'negative': 6353, 'positive': 12151, 'neutral': 14231}} 

2000 of 46233 processed
 Batch time: 84.93735384941101 s
 Total time: 172.75372791290283 s


{'le_5_orgs_conf_gt_06': {'negative': 4133, 'positive': 3028, 'neutral': 2521}, 'others': {'negative': 7247, 'positive': 14006, 'neutral': 16195}} 

3000 of 46233 processed
 Batch time: 92.21321439743042 s
 Total time: 264.9672031402588 s


{'le_5_orgs_conf_gt_06': {'negative': 4680, 'positive': 3390, 'neutral': 2819}, 'others': {'negative': 8256, 'positive': 15770, 'neutral': 18316}} 

4000 of 46233 processed
 Batch time: 98.89043498039246 s
 Total time: 363.85780477523804 s


{'le_5_orgs_conf_gt_06': {'negati


{'le_5_orgs_conf_gt_06': {'negative': 27268, 'positive': 18504, 'neutral': 15797}, 'others': {'negative': 54923, 'positive': 86365, 'neutral': 105878}} 

47000 of 46233 processed
 Batch time: 22.340089082717896 s
 Total time: 4280.326284170151 s



In [280]:
# Load the rows accumulated in orgs_ibm_checked.csv (the file write_orgs appends to) for inspection.
orgs = pd.read_csv('orgs_ibm_checked.csv')

In [282]:
# Summary statistics for the numeric columns of the checked organizations.
orgs.describe()

Unnamed: 0,sentiment_score,org_relevance,org_count,confidence,orgs_in_doc
count,308736.0,308735.0,308736.0,308735.0,308736.0
mean,0.063065,0.318774,1.903801,0.83242,10.376879
std,0.54428,0.259042,2.439571,0.215769,6.666681
min,-0.999882,0.0,1.0,0.005197,1.0
25%,-0.333363,0.119981,1.0,0.718066,6.0
50%,0.0,0.248588,1.0,0.942977,9.0
75%,0.577635,0.448576,2.0,0.996309,14.0
max,0.99993,0.9999,52.0,1.0,47.0
