In [20]:
import re
import pandas as pd
import numpy as np
import openai
import json
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

### Calling GPT-4

In [3]:
# Read the API key from the environment so the secret is never stored in the
# notebook. Falls back to an empty string, which is what the cell set before.
import os
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [77]:
# System prompt sent with every article. The numbered attribute list and the JSON
# template below must name exactly the same 17 keys, in the same order; attribute
# 17 is called top_categories in both places so the model is not given two
# different names for one key.
system_prompt = '''
You are the chief editor for a leading Indian financial and business news website. You evaluate critical attributes of articles to gate keep content quality. For many attributes, you will first provide a brief analysis of 15 to 30 words, followed by assessment.

1. analysis_of_financial_or_business_news (short text) : <analyse if article pertains to finance/business or not. government policies directly impacting indian corporations or investors are ok, but not if aren't>
2. financial_or_business_news (True or False) : <True or False based on previous attribute>
3. analysis_of_relevant_for_india (short text) : <analyse if article is relevant for indians. for example international articles about 401k or small foreign companies won't be relevant for india. however changes to fed interest rates or nasdaq or important news of large multinational corporations will be relevant>
4. relevant_for_india (True or False) : <True or False based on previous attribute>
5. analysis_of_article_validity_duration (short text) : <analyse relevance duration. Be stingy: Stock fluctuations, 1 day; significant policy changes - few days or a week; educational content with references to any regulations is 30 unless there are none - in which case is timeless. International news in India has shorter lifespan. breaking news are usually not timeless; quarterly analysis or results are usually valid for a 3 days, yearly analysis or results for a weeks and a much longer one for a month.>
6. article_validity_duration (one of 1, 3, 7, 14, 30, -1) : <calculate number of days based on previous attribute. -1: timeless. 1: article is relevant only for that day. 3: for a couple of days. 7: for a week. 14: for a couple of weeks. 30: for a month>
7. analysis_of_popularity (short text) : <analyse likely popularity of article - if its for niche audience, moderate_popularity or should be part of breaking_news section, depending on number of people who will be impacted by the news and the scale of the event. foreign entities known in india but not very popular will be mostly niche or rarely moderately popular. articles targeted to very specific business or pratices will be niche. infotainment business and financial articles with some drama are likely to be more popular. articles with a list of rules without compelling story-telling will be for niche audience>
8. popularity (one of niche, moderately_popular, breaking_news) : <based on previous attribute>
9. analysis_of_article_type (short text) : <analyse if the article is majorly factual, is an opinion piece, analysis, educational or likely sponsored. factual articles pass on information on events. opinion pieces have inferences or predictions either from the author or from statements without data. analysis pieces have substantial data to justify their inferences or predictions. if an article is overly zealous on certain stock and seems like an ad, then it is sponsored>
10. article_type (one of fact, opinion, analysis, educational, sponsored) : <based on previous attribute>
11. analysis_of_article_sentiment (short_text): <analyse if the sentiment of the article is bullish, bearish or NA. balanced is NA>
12. article_sentiment (one of bull, bear, NA): <based on previous attribute>
13. headline_suggestion (short text) : <Write a headline based on the content of the article>
14. first_attempt_summary (text of 60 words) : <Generate concise, entity-dense summary. The summary should become highly dense but easily understood without the Article. Don't keep the summary too short, but limit it to no more than 60 words>
15. improved_summary (text of 60 words): <Identify contents of the article which are missing from the previous summary but are important part of the article>
16. final_summary (text of 60 words): <The finalised summary which is a mixture of first_attempt_summary and improved_summary. This summary should be very dense and also all the important information of the article and yet concise. By reading this in most cases the users need not read the article>
17. top_categories (5 semi colon seperated categories): <Hierarchy of 5 categories or keywords. Start with a generic category and make it progressively specific. Select only 5 and seperate them by (;). Don't use either single or double quotes at any cost to avoid json.loads() failure>

your response should be a json structure with all the 17 above keys without missing any key. It is very important that the response is directly readable with json.loads(). no preamble or postamble. respond in the exact following structure:

{
"analysis_of_financial_or_business_news": "",
"financial_or_business_news": "",
"analysis_of_relevant_for_india": "",
"relevant_for_india": "",
"analysis_of_article_validity_duration": "",
"article_validity_duration": "",
"analysis_of_popularity": "",
"popularity": "",
"analysis_of_article_type": "",
"article_type": "",
"analysis_of_article_sentiment": "",
"article_sentiment": "",
"headline_suggestion": "",
"first_attempt_summary": "",
"improved_summary": "",
"final_summary": "",
"top_categories": ""
}

|article_start|
'''

In [5]:
shortlisted_articles = pd.read_csv('shortlisted_articles_30_Nov_23.csv')

In [6]:
def calculate_tokens(text, encoder):
    """Return how many tokens ``encoder.encode`` produces for ``text``."""
    return len(encoder.encode(text))

def truncate_text_for_token_limit(text,encoder, token_limit):
    """Cut ``text`` down to a word-aligned prefix that fits in ``token_limit`` tokens.

    Args:
        text: The string to shorten.
        encoder: Any object with an ``encode(str)`` method returning a sequence
            of tokens.
        token_limit: Maximum number of tokens the result may encode to.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the longest prefix
        found by the search, cut back to the last space so no word is split;
        when the prefix contains no space it is returned as-is (mid-word).
    """
    # Nothing to do when the whole text already fits.
    if calculate_tokens(text, encoder) <= token_limit:
        return text

    # Binary search over character offsets for the longest prefix within the
    # limit. This relies on the token count not shrinking as the prefix grows.
    left, right = 0, len(text)
    valid_limit = 0  # longest prefix length known to fit
    while left <= right:
        mid = (left + right) // 2
        if calculate_tokens(text[:mid], encoder) <= token_limit:
            valid_limit = mid
            left = mid + 1
        else:
            right = mid - 1

    # When the character right after the prefix is a space, the prefix already
    # ends on a complete word; trimming back to the previous space would throw
    # away a word that fits.
    if valid_limit < len(text) and text[valid_limit] == ' ':
        return text[:valid_limit]

    # Otherwise the cut fell inside a word: back up to the preceding space.
    space_index = text.rfind(' ', 0, valid_limit)
    if space_index == -1:
        # No space at all in the prefix, so return it even though it is mid-word.
        return text[:valid_limit]
    return text[:space_index]

In [7]:
model = 'gpt-4'

In [8]:
from datetime import datetime

In [9]:
def parse_gpt_api_response(article_id, system_prompt, user_prompt, api_response):
    """Flatten a chat-completion response into one record for the results CSV.

    The assistant message is expected to be a JSON document. When it does not
    parse, the raw text is sent to GPT-4 once with a request to repair it; if
    the repaired text still does not parse, the raw text is kept and the record
    is flagged with ``proper_json_generated = False``.

    Args:
        article_id: Identifier of the article the response belongs to.
        system_prompt: System prompt used for the request (stored verbatim).
        user_prompt: User message used for the request (stored verbatim).
        api_response: The completion response, as a mapping or a JSON string.

    Returns:
        dict with keys ``article_id``, ``system_prompt``, ``user_prompt``,
        ``model``, ``content`` (a JSON-encoded string), ``prompt_tokens``,
        ``completion_tokens`` and ``proper_json_generated``.

    Raises:
        KeyError, IndexError: when the response has no ``usage`` entry or no
            first choice message.
        Any error raised by the repair request propagates unchanged.
    """
    # Accept the response either as a mapping or as its JSON text.
    if isinstance(api_response, str):
        api_response = json.loads(api_response)

    response_model = api_response.get('model')
    usage = api_response['usage']
    prompt_tokens = usage.get('prompt_tokens')
    completion_tokens = usage.get('completion_tokens')
    raw_content = api_response['choices'][0]['message']['content']

    json_generated = False
    try:
        content = json.loads(raw_content)
        json_generated = True
    except (TypeError, ValueError):
        # json.loads raises ValueError on malformed JSON and TypeError when the
        # message content is not text; anything else should surface to the caller.
        print(f'trying to get fixed json for articleId {article_id}')
        improved_api_response = openai.ChatCompletion.create(model = 'gpt-4', messages = [{'role': 'system', 'content': 'fix the following json to be readable using json.loads(). simply out the new json. no preamble and no postamble'}, {'role': 'user', 'content': raw_content}])
        try:
            content = json.loads(improved_api_response['choices'][0]['message']['content'])
            json_generated = True
        except (KeyError, IndexError, TypeError, ValueError):
            # Repair did not help: keep the raw text so nothing is lost.
            content = raw_content
            timestamp = "{:%b %d, %Y %H:%M}".format(datetime.now())
            print(f'json not properly generated for {article_id} at {timestamp}')

    # One flat record per article; content is re-serialised so it fits a CSV cell.
    extracted_data = {
        'article_id': article_id,
        'system_prompt': system_prompt,
        'user_prompt': user_prompt,
        'model': response_model,
        'content': json.dumps(content),
        'prompt_tokens': prompt_tokens,
        'completion_tokens': completion_tokens,
        'proper_json_generated': json_generated
    }

    return extracted_data

In [10]:
from tqdm.notebook import tqdm

In [11]:
import json

In [12]:
# Token budget for a single request; fetch_save_attributes reads `model`,
# `encoder` and `available_tokens` from module scope.
model = 'gpt-4'
encoder = tiktoken.encoding_for_model(model)
# NOTE(review): counted on the raw prompt, while fetch_save_attributes sends
# system_prompt.replace('\n',' ') plus an '|article_end|' suffix, so the real
# request may be a few tokens larger than this budget assumes — confirm.
num_system_tokens = len(encoder.encode(system_prompt))
output_tokens = 1024  # presumably headroom kept for the model's reply — no max_tokens is passed to the API
total_tokens = 4096
# Whatever remains after the system prompt and the reply headroom is the cap
# applied to the article text.
available_tokens = total_tokens - num_system_tokens - output_tokens

In [43]:
import threading

# fetch_save_attributes runs on several ThreadPoolExecutor workers at once; this
# lock keeps their appends to the shared CSV from overlapping each other.
_csv_append_lock = threading.Lock()

def fetch_save_attributes(article_row):
    """Request GPT attributes for one article and append the record to the results CSV.

    Args:
        article_row: Row exposing ``full_content`` and ``article_id`` attributes
            (for example an ``itertuples()`` row of ``shortlisted_articles``).

    Returns:
        True when the record was appended to 'gpt-4-responses-1-dec.csv', False
        when any step (truncation, API call, parsing, writing) failed. The
        failure is printed instead of being swallowed silently.
    """
    article_id = getattr(article_row, 'article_id', None)
    try:
        # Truncation is inside the try so a bad row (e.g. missing content)
        # yields False instead of aborting the whole batch.
        truncated_content = truncate_text_for_token_limit(text=article_row.full_content, encoder=encoder, token_limit=available_tokens)
        res_gpt = openai.ChatCompletion.create(model = model, messages = [{'role': 'system', 'content': system_prompt.replace('\n',' ')}, {'role': 'user', 'content': f'{truncated_content}|article_end|'}])
        parsed_res = parse_gpt_api_response(article_id=article_row.article_id, system_prompt=system_prompt, user_prompt=truncated_content, api_response=res_gpt)
        cur_df = pd.DataFrame(parsed_res, index = ['i',])
        with _csv_append_lock:
            cur_df.to_csv('gpt-4-responses-1-dec.csv',header=None, mode='a')
        return True
    except Exception as exc:
        # Exception (not a bare except) so KeyboardInterrupt still stops the run.
        print(f'failed to fetch attributes for articleId {article_id}: {exc!r}')
        return False

In [17]:
import time

In [38]:
def get_gpt4_attributes_for_article_ids(fraction):
    """Fetch GPT attributes for a random sample of the shortlisted articles.

    Args:
        fraction: Share of ``shortlisted_articles`` to process, passed to
            ``DataFrame.sample(frac=...)`` (1 processes every row, shuffled).

    Returns:
        Number of articles for which ``fetch_save_attributes`` returned True.
    """
    start_time = time.time()
    valid_responses = 0
    shortlisted_rows = shortlisted_articles.sample(frac=fraction)
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_save_attributes, article_row) for article_row in shortlisted_rows.itertuples()]

        for future in as_completed(futures):
            # fetch_save_attributes returns True on success and False on failure.
            if future.result():
                valid_responses += 1
    print(f'done in {time.time() - start_time} seconds')
    return valid_responses

In [42]:
n = get_gpt4_attributes_for_article_ids(1)

trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 651e0533a662d76276b85936
trying to get fixed json for articleId 6

#### Correcting responses

In [47]:
actual_articles_df = shortlisted_articles[['article_id', 'full_content']]

In [50]:
gpt_responses = pd.read_csv('gpt-4-responses-1-dec.csv', header=None)

In [54]:
gpt_responses = gpt_responses.rename(columns = {2: 'prompt', 3: 'full_content', 4: 'model', 5: 'attributes', 6: 'prompt_tokens', 7: 'completion_tokens', 8: 'json_generated'})

In [57]:
gpt_responses['completion_tokens'].describe()

count    1581.000000
mean      507.270715
std        48.156720
min       327.000000
25%       475.000000
50%       505.000000
75%       538.000000
max       676.000000
Name: completion_tokens, dtype: float64

In [59]:
len(gpt_responses)

1581

In [61]:
gpt_responses.full_content.nunique()

1580

In [73]:
# Attach article_id to each response by joining on the article text.
# NOTE(review): gpt_responses.full_content is the text that was sent to the
# model, which fetch_save_attributes truncates to the token budget. Truncated
# articles likely have no exact match in actual_articles_df, so the inner join
# drops them (1581 rows before, 1517 after in the recorded outputs). Joining on
# the article id column instead would avoid the loss — confirm.
merged_gpt_responses = pd.merge(gpt_responses, actual_articles_df, how = 'inner', on = 'full_content')

In [76]:
len(merged_gpt_responses)

1517

In [75]:
merged_gpt_responses.to_csv('gpt-4-responses-1-dec-with-headers.csv', index = False)

In [41]:
len(shortlisted_articles)

1583

#### Analysis

In [24]:
all_article_responses = pd.read_csv('gpt-4-responses-22-nov.csv', header=None)

In [None]:
# NOTE(review): `{'0'}` is a set, not a {old: new} mapping, and the result is not
# assigned. This cell has no execution count, so it was presumably never run and
# looks like an unfinished column rename — complete it or delete the cell.
all_article_responses.rename(columns = {'0'})

In [27]:
all_article_responses[1].nunique()

825

In [30]:
all_article_responses[6].sum() * 0.03 * 80/1000

4017.768

In [31]:
all_article_responses[7].sum() * 0.06 * 80/1000

2145.3984

In [32]:
invalid_response_article_ids = []

In [33]:
# Parse the stored response of every row (column 5); rows that do not hold valid
# JSON have their article id (column 1) recorded for a later re-run.
resp = []
for i in tqdm(range(len(all_article_responses))):
    row = all_article_responses.iloc[i]
    try:
        resp.append(json.loads(row[5]))
    except (TypeError, ValueError):
        # ValueError: malformed JSON; TypeError: the cell is not text (e.g. NaN).
        # Other errors (such as a missing column) are left to surface.
        invalid_response_article_ids.append(row[1])

  0%|          | 0/828 [00:00<?, ?it/s]

In [34]:
len(invalid_response_article_ids)

121

In [56]:
resp_df = pd.DataFrame(resp[:220])

In [61]:
resp_df['is_financial_or_business_news'] = resp_df['is_financial_or_business_news'].astype('int')
resp_df['relevant_for_india'] = resp_df['relevant_for_india'].astype('int')
resp_df['article_validity_duration'] = resp_df['article_validity_duration'].astype('int')

In [63]:
resp_df.is_financial_or_business_news.mean()

0.8863636363636364

In [64]:
resp_df.relevant_for_india.mean()

0.8090909090909091

In [69]:
resp_df.groupby('popularity')['popularity'].count()

popularity
breaking_news          33
moderately_popular    115
niche                  72
Name: popularity, dtype: int64

### Rerunning for invalid article responses

In [None]:
invalid_response_article_ids

In [None]:
shortlisted_articles

In [80]:
# Re-query GPT for the articles whose stored response was not valid JSON.
# NOTE: this cell rebinds the module-level `total_tokens` and `available_tokens`
# that fetch_save_attributes also reads, so running it changes that function's
# budget as well.
encoder = tiktoken.encoding_for_model(model)
num_system_tokens = len(encoder.encode(system_prompt))
total_tokens = 7000
available_tokens = total_tokens - num_system_tokens
total_rows = len(shortlisted_articles)
# The [92:] offset presumably resumes a partially completed run — TODO confirm
# before re-running from a fresh kernel.
# `shorted_article_assessments` is not defined anywhere in the visible notebook;
# it must already exist as a list before this cell is run.
for i in tqdm(invalid_response_article_ids[92:]):
    row = shortlisted_articles.loc[shortlisted_articles.article_id == i].iloc[0]
    # The truncation helper defined in this notebook is truncate_text_for_token_limit;
    # the previous name, find_token_limit, is not defined and raised NameError.
    truncated_content = truncate_text_for_token_limit(text=row.full_content, encoder=encoder, token_limit=available_tokens)
    res_gpt = openai.ChatCompletion.create(model = model, messages = [{'role': 'system', 'content': system_prompt.replace('\n',' ')}, {'role': 'user', 'content': truncated_content}])
    parsed_res = parse_gpt_api_response(article_id=row.article_id, system_prompt=system_prompt, user_prompt=truncated_content, api_response=res_gpt)
    cur_df = pd.DataFrame(parsed_res, index = ['i',])
    cur_df.to_csv('gpt-4-responses-22-nov.csv',header=None, mode='a')
    shorted_article_assessments.append(parsed_res)

  0%|          | 0/29 [00:00<?, ?it/s]

In [500]:
df1 = pd.DataFrame(shorted_article_assessments)

In [503]:
df1['json_safe_content'] = df1['content'].apply(lambda x: json.dumps(x))

In [506]:
df1.to_csv('gpt-responses.csv')

In [510]:
df1['total_cost'] = df1['prompt_tokens'] * (0.03 * 80/1000) + df1['completion_tokens'] * (0.06 * 80/1000)

In [512]:
df1.total_cost.sum()

879.1272

In [495]:
pd.DataFrame(shorted_article_assessments).to_csv('gpt-responses.csv')

#### Standardizing responses

In [2]:
import pandas as pd
import numpy as np
import json

In [5]:
df = pd.read_csv('gpt-4-responses-22-nov.csv', header=None)

In [7]:
df = df.rename(columns = {1: 'article_id', 2: 'system_prompt', 3: 'article_content', 4: 'gpt-model', 5: 'gpt-response', 6: 'input_tokens', 7: 'output_tokens'})

In [157]:
article_wise_responses = {}

In [158]:
# Index the parsed GPT response and the article text by article id.
for i in range(len(df)):
    row = df.iloc[i]
    art_id = row['article_id']
    try:
        parsed_response = json.loads(row['gpt-response'])
    except (TypeError, ValueError):
        # Unparseable response (malformed JSON or a non-text cell). Skip the row
        # without touching the dictionary, so no empty {} entry is left behind
        # and an earlier good response for the same article is not overwritten.
        continue
    article_wise_responses[art_id] = {
        'response': parsed_response,
        'content': row['article_content'],
    }

In [61]:
def correct_json_using_gpt(summary_json):
    """Ask GPT-4 to repair a malformed JSON string and return the parsed result.

    Args:
        summary_json: The text that failed to parse as JSON.

    Returns:
        The object obtained by ``json.loads`` on the model's reply.

    Raises:
        Whatever ``json.loads`` raises when the reply is still not valid JSON;
        errors from the API call propagate unchanged.
    """
    print('trying to get fixed json for summary')
    repair_instruction = 'fix the following json to be readable using json.loads(). simply out the new json. no preamble and no postamble'
    chat_messages = [
        {'role': 'system', 'content': repair_instruction},
        {'role': 'user', 'content': summary_json},
    ]
    completion = openai.ChatCompletion.create(model = 'gpt-4', messages = chat_messages)
    repaired_text = completion['choices'][0]['message']['content']
    return json.loads(repaired_text)

In [156]:
from tqdm.notebook import tqdm
import tqdm as td

from copy import deepcopy

In [185]:
response['summary_list']

[{'missing_entities': 'Nirmala Sitharaman; US India Business Council; Digitalization impact',
  'denser_summary': 'Union Finance Minister Nirmala Sitharaman urged American businesses to invest in India, asserting that it meets the requirements of a fair and transparent economy. The invitation came during a business roundtable organized by the US India Business Council. Sitharaman emphasized on the progress India has made in terms of digitalization and economic formalization.'},
 {'missing_entities': 'Taranjit Singh Sandhu; US and India trade; National Financial Information Registry',
  'denser_summary': "Indian Ambassador to the US, Taranjit Singh Sandhu reiterated India's commitment to engagement at the G20. He also noted that the US-India bilateral trade crossed $190 billion last year. The importance of a national financial information registry was emphasized to expand credit access."},
 {'missing_entities': 'Startup policies; Public Infrastructure; Fintech Ecosystem',
  'denser_summ

In [190]:
# Bring every stored response into one shape:
#   standardized_responses[art_id] = {
#       'response': [<attribute dict>, <list of 3 summary dicts>],
#       'content': <article text>,
#   }
# Articles that cannot be standardised are listed in errenous_summaries and skipped.
standardized_responses = {}
errenous_summaries = []

# Attribute keys and the summaries key that every usable response must contain.
expected_keys_1 = ['analysis_is_financial_or_business_news', 'is_financial_or_business_news', 'analysis_of_relevant_for_india', 'relevant_for_india',
                   'analysis_of_article_validity_duration', 'article_validity_duration', 'analysis_of_popularity', 'popularity', 'analysis_of_article_type', 'article_type',
                   'analysis_of_article_sentiment', 'article_sentiment', 'headline_suggestion']
expected_keys_2 = ['summary_list']
required_keys = set(expected_keys_1).union(expected_keys_2)

for art_id in td.tqdm(article_wise_responses.keys()):
    response = article_wise_responses[art_id].get('response')

    # Some responses are a list of partial dicts; merge them into a single dict.
    if isinstance(response, list):
        merged_response = {}
        for ele in response:
            if isinstance(ele, dict):
                merged_response.update(ele)
        response = merged_response

    # error 0: not a dict, or one of the required keys is missing.
    if not isinstance(response, dict) or not required_keys.issubset(response.keys()):
        errenous_summaries.append(art_id)
        print(f'{art_id} error 0')
        continue

    # Resolve the summaries afresh for every article so that a value from the
    # previous iteration can never be reused by accident.
    raw_summaries = response['summary_list']
    summaries = None
    if isinstance(raw_summaries, list):
        summaries = raw_summaries
    elif isinstance(raw_summaries, str):
        try:
            summaries = json.loads(raw_summaries)
        except ValueError:
            # Malformed JSON text: ask GPT to repair it once.
            try:
                summaries = correct_json_using_gpt(raw_summaries)
            except Exception as exc:
                print(f'summary repair failed for {art_id}: {exc!r}')

    # error 1: summaries could not be obtained as a list (wrong type or failed repair).
    if not isinstance(summaries, list):
        errenous_summaries.append(art_id)
        print(f'{art_id} error 1')
        continue

    # error 2: exactly three summaries are expected.
    if len(summaries) != 3:
        print(f'{art_id} error 2')
        errenous_summaries.append(art_id)
        continue

    # error 3: every summary needs 'missing_entities' plus its text, stored under
    # either 'denser_summary' or the alternative name 'summary_text'.
    normalized_summaries = []
    for summary in summaries:
        if not (isinstance(summary, dict) and 'missing_entities' in summary
                and ('denser_summary' in summary or 'summary_text' in summary)):
            break
        summary = dict(summary)  # copy, so the stored response is not mutated
        if 'summary_text' in summary:
            summary['denser_summary'] = summary.pop('summary_text')
        normalized_summaries.append(summary)
    if len(normalized_summaries) != len(summaries):
        print(f'{art_id} error 3')
        errenous_summaries.append(art_id)
        continue

    attribute_response = {key: response[key] for key in expected_keys_1}
    standardized_responses[art_id] = {
        'response': [attribute_response, normalized_summaries],
        'content': article_wise_responses[art_id]['content'],
    }

  0%|          | 0/825 [00:00<?, ?it/s]

trying to get fixed json for summary


  0%|          | 3/825 [00:15<1:12:02,  5.26s/it]

trying to get fixed json for summary


  0%|          | 4/825 [00:35<2:16:53, 10.00s/it]

trying to get fixed json for summary


  1%|          | 6/825 [01:04<2:45:05, 12.09s/it]

trying to get fixed json for summary


  1%|          | 8/825 [01:26<2:37:29, 11.57s/it]

trying to get fixed json for summary


  1%|▏         | 11/825 [01:37<1:47:39,  7.94s/it]

trying to get fixed json for summary


  2%|▏         | 13/825 [01:58<1:57:26,  8.68s/it]

trying to get fixed json for summary


  3%|▎         | 22/825 [02:18<59:15,  4.43s/it]  

trying to get fixed json for summary


  3%|▎         | 23/825 [02:32<1:11:53,  5.38s/it]

trying to get fixed json for summary


  3%|▎         | 25/825 [02:45<1:14:21,  5.58s/it]

trying to get fixed json for summary


  4%|▍         | 32/825 [02:58<48:01,  3.63s/it]  

trying to get fixed json for summary


  4%|▍         | 34/825 [03:08<51:45,  3.93s/it]

trying to get fixed json for summary


  4%|▍         | 36/825 [03:32<1:12:02,  5.48s/it]

trying to get fixed json for summary


  4%|▍         | 37/825 [03:54<1:39:15,  7.56s/it]

trying to get fixed json for summary


  5%|▍         | 38/825 [04:12<2:00:05,  9.16s/it]

trying to get fixed json for summary


  5%|▍         | 39/825 [04:21<1:58:29,  9.04s/it]

trying to get fixed json for summary


  5%|▌         | 43/825 [04:40<1:29:27,  6.86s/it]

trying to get fixed json for summary


  5%|▌         | 44/825 [04:59<1:53:30,  8.72s/it]

trying to get fixed json for summary


  6%|▌         | 50/825 [05:12<1:05:43,  5.09s/it]

trying to get fixed json for summary


  6%|▋         | 53/825 [05:29<1:07:24,  5.24s/it]

trying to get fixed json for summary


  7%|▋         | 54/825 [05:53<1:36:28,  7.51s/it]

trying to get fixed json for summary


  7%|▋         | 57/825 [06:11<1:28:23,  6.91s/it]

trying to get fixed json for summary


  7%|▋         | 59/825 [06:22<1:24:45,  6.64s/it]

trying to get fixed json for summary


  7%|▋         | 61/825 [06:45<1:39:39,  7.83s/it]

trying to get fixed json for summary


  8%|▊         | 62/825 [07:02<1:57:41,  9.25s/it]

trying to get fixed json for summary


  8%|▊         | 69/825 [07:25<1:10:50,  5.62s/it]

trying to get fixed json for summary


  9%|▊         | 71/825 [07:45<1:22:26,  6.56s/it]

trying to get fixed json for summary


  9%|▉         | 74/825 [07:59<1:14:30,  5.95s/it]

trying to get fixed json for summary


  9%|▉         | 75/825 [08:21<1:40:20,  8.03s/it]

trying to get fixed json for summary


  9%|▉         | 76/825 [08:47<2:14:09, 10.75s/it]

trying to get fixed json for summary


  9%|▉         | 78/825 [09:04<2:03:41,  9.94s/it]

trying to get fixed json for summary


 10%|▉         | 80/825 [09:25<2:06:09, 10.16s/it]

trying to get fixed json for summary


 10%|█         | 85/825 [09:44<1:24:10,  6.82s/it]

trying to get fixed json for summary


 11%|█         | 89/825 [10:03<1:14:19,  6.06s/it]

trying to get fixed json for summary


 11%|█         | 91/825 [10:27<1:29:47,  7.34s/it]

trying to get fixed json for summary


 12%|█▏        | 95/825 [10:46<1:17:43,  6.39s/it]

trying to get fixed json for summary


 12%|█▏        | 96/825 [11:05<1:35:49,  7.89s/it]

trying to get fixed json for summary


 12%|█▏        | 97/825 [11:30<2:04:37, 10.27s/it]

trying to get fixed json for summary


 12%|█▏        | 100/825 [11:59<2:01:40, 10.07s/it]

trying to get fixed json for summary


 12%|█▏        | 102/825 [12:16<1:56:31,  9.67s/it]

trying to get fixed json for summary


 13%|█▎        | 110/825 [12:40<1:07:34,  5.67s/it]

trying to get fixed json for summary


 14%|█▎        | 113/825 [12:56<1:05:45,  5.54s/it]

trying to get fixed json for summary


 14%|█▍        | 116/825 [13:16<1:09:12,  5.86s/it]

trying to get fixed json for summary


 15%|█▌        | 127/825 [13:28<36:17,  3.12s/it]  

trying to get fixed json for summary


 20%|██        | 166/825 [13:42<11:39,  1.06s/it]

trying to get fixed json for summary


 20%|██        | 169/825 [14:19<20:49,  1.90s/it]

trying to get fixed json for summary


 21%|██        | 173/825 [14:25<20:06,  1.85s/it]

6525a756598c6618f0608b3b error 2
trying to get fixed json for summary


 23%|██▎       | 193/825 [14:43<14:43,  1.40s/it]

trying to get fixed json for summary


 24%|██▍       | 200/825 [14:59<16:34,  1.59s/it]

trying to get fixed json for summary


 25%|██▌       | 208/825 [15:16<17:37,  1.71s/it]

trying to get fixed json for summary


 27%|██▋       | 222/825 [15:27<13:45,  1.37s/it]

trying to get fixed json for summary


 27%|██▋       | 224/825 [15:37<16:17,  1.63s/it]

trying to get fixed json for summary


 28%|██▊       | 228/825 [15:57<21:49,  2.19s/it]

trying to get fixed json for summary


 28%|██▊       | 230/825 [16:15<28:50,  2.91s/it]

trying to get fixed json for summary


 29%|██▉       | 242/825 [16:35<22:37,  2.33s/it]

trying to get fixed json for summary


 30%|██▉       | 246/825 [16:49<24:29,  2.54s/it]

trying to get fixed json for summary


 32%|███▏      | 261/825 [17:08<17:46,  1.89s/it]

trying to get fixed json for summary


 32%|███▏      | 264/825 [17:31<24:39,  2.64s/it]

trying to get fixed json for summary


 32%|███▏      | 268/825 [17:57<31:11,  3.36s/it]

trying to get fixed json for summary


 33%|███▎      | 273/825 [18:16<31:58,  3.48s/it]

trying to get fixed json for summary


 34%|███▎      | 277/825 [18:34<33:36,  3.68s/it]

trying to get fixed json for summary


 34%|███▍      | 279/825 [18:54<41:41,  4.58s/it]

trying to get fixed json for summary


 34%|███▍      | 283/825 [19:08<38:40,  4.28s/it]

trying to get fixed json for summary


 35%|███▍      | 286/825 [19:30<44:50,  4.99s/it]

trying to get fixed json for summary


 36%|███▌      | 296/825 [19:47<28:40,  3.25s/it]

trying to get fixed json for summary


 39%|███▉      | 321/825 [20:12<14:52,  1.77s/it]

trying to get fixed json for summary


 39%|███▉      | 325/825 [20:35<19:14,  2.31s/it]

trying to get fixed json for summary


 40%|████      | 330/825 [20:50<19:58,  2.42s/it]

trying to get fixed json for summary


 40%|████      | 334/825 [20:59<19:40,  2.40s/it]

trying to get fixed json for summary


 41%|████      | 335/825 [21:13<25:28,  3.12s/it]

652ebbc51e5cc42b1b139a60 error 2
trying to get fixed json for summary


 42%|████▏     | 349/825 [21:27<15:36,  1.97s/it]

trying to get fixed json for summary


 43%|████▎     | 355/825 [21:44<17:04,  2.18s/it]

trying to get fixed json for summary


 43%|████▎     | 358/825 [22:05<22:29,  2.89s/it]

trying to get fixed json for summary


 46%|████▌     | 381/825 [22:28<12:38,  1.71s/it]

trying to get fixed json for summary


 47%|████▋     | 385/825 [22:49<16:09,  2.20s/it]

trying to get fixed json for summary


 47%|████▋     | 389/825 [23:03<17:22,  2.39s/it]

65316f991e5cc42b1b140ac8 error 2
trying to get fixed json for summary


 53%|█████▎    | 435/825 [23:32<07:17,  1.12s/it]

trying to get fixed json for summary


 53%|█████▎    | 436/825 [23:49<09:32,  1.47s/it]

trying to get fixed json for summary


 54%|█████▍    | 445/825 [24:04<09:27,  1.49s/it]

trying to get fixed json for summary


 54%|█████▍    | 447/825 [24:21<12:28,  1.98s/it]

trying to get fixed json for summary


 54%|█████▍    | 448/825 [24:42<17:38,  2.81s/it]

trying to get fixed json for summary


 55%|█████▍    | 450/825 [25:01<22:08,  3.54s/it]

trying to get fixed json for summary


 56%|█████▌    | 458/825 [25:29<21:46,  3.56s/it]

trying to get fixed json for summary


 57%|█████▋    | 471/825 [25:50<15:27,  2.62s/it]

trying to get fixed json for summary


 57%|█████▋    | 472/825 [26:13<21:18,  3.62s/it]

trying to get fixed json for summary


 57%|█████▋    | 474/825 [26:35<26:36,  4.55s/it]

trying to get fixed json for summary


 58%|█████▊    | 475/825 [26:56<34:27,  5.91s/it]

trying to get fixed json for summary


 59%|█████▉    | 486/825 [27:17<20:50,  3.69s/it]

trying to get fixed json for summary


 59%|█████▉    | 487/825 [27:39<27:32,  4.89s/it]

trying to get fixed json for summary


 60%|█████▉    | 494/825 [28:14<27:15,  4.94s/it]

trying to get fixed json for summary


 60%|██████    | 495/825 [28:38<35:22,  6.43s/it]

trying to get fixed json for summary


 60%|██████    | 496/825 [28:58<42:32,  7.76s/it]

trying to get fixed json for summary


 60%|██████    | 497/825 [29:21<52:20,  9.58s/it]

trying to get fixed json for summary


 60%|██████    | 498/825 [30:09<1:23:17, 15.28s/it]

trying to get fixed json for summary


 60%|██████    | 499/825 [30:24<1:23:03, 15.29s/it]

trying to get fixed json for summary


 61%|██████    | 503/825 [30:51<59:23, 11.07s/it]  

trying to get fixed json for summary


 61%|██████▏   | 506/825 [31:17<53:58, 10.15s/it]

trying to get fixed json for summary


 62%|██████▏   | 510/825 [31:39<43:50,  8.35s/it]

trying to get fixed json for summary


 62%|██████▏   | 511/825 [31:52<46:45,  8.93s/it]

trying to get fixed json for summary


 63%|██████▎   | 520/825 [32:13<24:33,  4.83s/it]

trying to get fixed json for summary


 63%|██████▎   | 522/825 [32:26<26:03,  5.16s/it]

trying to get fixed json for summary


 63%|██████▎   | 523/825 [32:48<34:36,  6.88s/it]

trying to get fixed json for summary


 64%|██████▍   | 527/825 [33:11<31:54,  6.42s/it]

trying to get fixed json for summary


 65%|██████▍   | 536/825 [33:28<19:09,  3.98s/it]

trying to get fixed json for summary


 65%|██████▌   | 537/825 [33:49<25:33,  5.32s/it]

trying to get fixed json for summary


 65%|██████▌   | 538/825 [34:07<31:34,  6.60s/it]

trying to get fixed json for summary


 66%|██████▌   | 544/825 [34:32<25:48,  5.51s/it]

trying to get fixed json for summary


 66%|██████▌   | 545/825 [34:46<29:18,  6.28s/it]

trying to get fixed json for summary


 66%|██████▌   | 546/825 [35:00<33:48,  7.27s/it]

trying to get fixed json for summary


 68%|██████▊   | 557/825 [35:27<18:17,  4.09s/it]

trying to get fixed json for summary


 68%|██████▊   | 558/825 [35:45<23:14,  5.22s/it]

trying to get fixed json for summary


 68%|██████▊   | 561/825 [36:14<27:52,  6.33s/it]

trying to get fixed json for summary


 68%|██████▊   | 562/825 [36:35<34:43,  7.92s/it]

trying to get fixed json for summary


 69%|██████▊   | 566/825 [36:48<26:11,  6.07s/it]

trying to get fixed json for summary


 69%|██████▉   | 570/825 [37:07<23:57,  5.64s/it]

trying to get fixed json for summary


 69%|██████▉   | 571/825 [37:30<32:15,  7.62s/it]

trying to get fixed json for summary


 70%|███████   | 578/825 [37:50<20:55,  5.08s/it]

trying to get fixed json for summary


 70%|███████   | 579/825 [38:15<28:54,  7.05s/it]

trying to get fixed json for summary


 71%|███████   | 582/825 [38:33<27:15,  6.73s/it]

trying to get fixed json for summary


 71%|███████   | 583/825 [38:43<28:28,  7.06s/it]

trying to get fixed json for summary


 72%|███████▏  | 591/825 [39:03<17:19,  4.44s/it]

trying to get fixed json for summary


 72%|███████▏  | 593/825 [39:19<19:19,  5.00s/it]

trying to get fixed json for summary


 72%|███████▏  | 596/825 [39:30<17:52,  4.68s/it]

trying to get fixed json for summary


 72%|███████▏  | 597/825 [39:49<24:09,  6.36s/it]

trying to get fixed json for summary


 73%|███████▎  | 603/825 [40:15<19:45,  5.34s/it]

trying to get fixed json for summary


 73%|███████▎  | 604/825 [40:29<22:57,  6.23s/it]

trying to get fixed json for summary


 73%|███████▎  | 605/825 [40:51<30:20,  8.28s/it]

trying to get fixed json for summary


 73%|███████▎  | 606/825 [41:23<43:38, 11.96s/it]

trying to get fixed json for summary


 74%|███████▎  | 607/825 [41:51<54:12, 14.92s/it]

trying to get fixed json for summary


 74%|███████▍  | 609/825 [42:11<48:00, 13.34s/it]

trying to get fixed json for summary


 74%|███████▍  | 610/825 [42:26<48:42, 13.59s/it]

trying to get fixed json for summary


 74%|███████▍  | 613/825 [42:49<38:21, 10.86s/it]

trying to get fixed json for summary


 74%|███████▍  | 614/825 [43:11<45:35, 12.97s/it]

trying to get fixed json for summary


 75%|███████▍  | 615/825 [43:38<55:02, 15.73s/it]

trying to get fixed json for summary


 75%|███████▍  | 616/825 [44:02<1:01:23, 17.62s/it]

trying to get fixed json for summary


 75%|███████▍  | 617/825 [44:23<1:03:52, 18.43s/it]

trying to get fixed json for summary


 75%|███████▌  | 620/825 [44:40<40:52, 11.96s/it]  

trying to get fixed json for summary


 75%|███████▌  | 621/825 [45:08<51:19, 15.10s/it]

trying to get fixed json for summary


 76%|███████▌  | 627/825 [45:35<27:40,  8.39s/it]

trying to get fixed json for summary


 76%|███████▌  | 629/825 [45:53<27:56,  8.56s/it]

trying to get fixed json for summary


 76%|███████▋  | 630/825 [46:09<31:26,  9.67s/it]

trying to get fixed json for summary


 77%|███████▋  | 636/825 [46:21<17:13,  5.47s/it]

trying to get fixed json for summary


 77%|███████▋  | 638/825 [46:36<18:22,  5.89s/it]

trying to get fixed json for summary


 78%|███████▊  | 640/825 [46:56<20:55,  6.79s/it]

trying to get fixed json for summary


 78%|███████▊  | 645/825 [47:15<16:14,  5.41s/it]

trying to get fixed json for summary


 79%|███████▊  | 648/825 [47:38<17:53,  6.06s/it]

trying to get fixed json for summary


 79%|███████▊  | 649/825 [47:52<20:27,  6.97s/it]

trying to get fixed json for summary


 79%|███████▉  | 652/825 [48:15<20:51,  7.24s/it]

trying to get fixed json for summary


 79%|███████▉  | 655/825 [48:27<17:37,  6.22s/it]

trying to get fixed json for summary


 80%|███████▉  | 657/825 [48:42<18:08,  6.48s/it]

trying to get fixed json for summary


 80%|███████▉  | 659/825 [49:14<24:39,  8.91s/it]

trying to get fixed json for summary


 80%|████████  | 661/825 [49:30<23:45,  8.69s/it]

trying to get fixed json for summary


 80%|████████  | 662/825 [49:57<31:28, 11.59s/it]

trying to get fixed json for summary


 82%|████████▏ | 675/825 [50:17<10:12,  4.09s/it]

trying to get fixed json for summary


 82%|████████▏ | 678/825 [50:53<13:50,  5.65s/it]

65316ef41e5cc42b1b13ea81 error 2
trying to get fixed json for summary


 83%|████████▎ | 681/825 [51:13<13:57,  5.82s/it]

trying to get fixed json for summary


 83%|████████▎ | 683/825 [51:37<16:28,  6.96s/it]

trying to get fixed json for summary


 83%|████████▎ | 686/825 [51:58<16:02,  6.93s/it]

trying to get fixed json for summary


 83%|████████▎ | 688/825 [52:18<17:19,  7.59s/it]

trying to get fixed json for summary


 84%|████████▎ | 689/825 [52:36<20:15,  8.94s/it]

trying to get fixed json for summary


 84%|████████▎ | 690/825 [52:55<23:23, 10.40s/it]

trying to get fixed json for summary


 84%|████████▍ | 691/825 [53:22<29:52, 13.38s/it]

trying to get fixed json for summary


 84%|████████▍ | 695/825 [53:36<18:04,  8.34s/it]

trying to get fixed json for summary


 84%|████████▍ | 697/825 [53:58<19:19,  9.06s/it]

trying to get fixed json for summary


 85%|████████▌ | 705/825 [54:12<09:18,  4.66s/it]

trying to get fixed json for summary


 86%|████████▌ | 707/825 [54:28<10:19,  5.25s/it]

trying to get fixed json for summary


 86%|████████▌ | 708/825 [54:47<13:18,  6.82s/it]

trying to get fixed json for summary


 87%|████████▋ | 718/825 [55:01<06:09,  3.46s/it]

trying to get fixed json for summary


 87%|████████▋ | 719/825 [55:18<08:06,  4.59s/it]

trying to get fixed json for summary


 87%|████████▋ | 721/825 [55:38<09:42,  5.60s/it]

trying to get fixed json for summary


 88%|████████▊ | 722/825 [55:59<12:42,  7.40s/it]

trying to get fixed json for summary


 88%|████████▊ | 724/825 [56:14<12:26,  7.39s/it]

trying to get fixed json for summary


 88%|████████▊ | 725/825 [56:33<15:25,  9.26s/it]

trying to get fixed json for summary


 88%|████████▊ | 726/825 [56:49<17:13, 10.44s/it]

trying to get fixed json for summary


 88%|████████▊ | 728/825 [57:10<16:57, 10.49s/it]

trying to get fixed json for summary


 88%|████████▊ | 730/825 [57:19<13:33,  8.56s/it]

trying to get fixed json for summary


 89%|████████▊ | 732/825 [57:38<13:33,  8.75s/it]

trying to get fixed json for summary


 89%|████████▉ | 733/825 [57:56<16:12, 10.58s/it]

trying to get fixed json for summary


 89%|████████▉ | 736/825 [58:14<12:33,  8.47s/it]

trying to get fixed json for summary


 89%|████████▉ | 738/825 [58:29<11:59,  8.26s/it]

trying to get fixed json for summary


 90%|████████▉ | 739/825 [58:53<15:45, 11.00s/it]

trying to get fixed json for summary


 91%|█████████ | 748/825 [59:13<06:17,  4.90s/it]

trying to get fixed json for summary


 91%|█████████ | 750/825 [59:36<07:37,  6.10s/it]

trying to get fixed json for summary


 92%|█████████▏| 757/825 [59:53<04:57,  4.37s/it]

trying to get fixed json for summary


 92%|█████████▏| 763/825 [1:00:19<04:30,  4.36s/it]

trying to get fixed json for summary


 93%|█████████▎| 764/825 [1:00:37<05:30,  5.42s/it]

trying to get fixed json for summary


 93%|█████████▎| 765/825 [1:00:53<06:34,  6.58s/it]

trying to get fixed json for summary


 93%|█████████▎| 768/825 [1:01:14<06:18,  6.64s/it]

trying to get fixed json for summary


 94%|█████████▎| 773/825 [1:01:30<04:30,  5.20s/it]

trying to get fixed json for summary


 94%|█████████▍| 778/825 [1:01:44<03:23,  4.34s/it]

trying to get fixed json for summary


 94%|█████████▍| 779/825 [1:02:06<04:34,  5.98s/it]

trying to get fixed json for summary


 95%|█████████▍| 780/825 [1:02:26<05:49,  7.77s/it]

trying to get fixed json for summary


 95%|█████████▍| 781/825 [1:02:44<06:46,  9.24s/it]

trying to get fixed json for summary


 95%|█████████▍| 782/825 [1:02:57<07:02,  9.82s/it]

trying to get fixed json for summary


 95%|█████████▍| 783/825 [1:03:29<10:03, 14.38s/it]

trying to get fixed json for summary


 95%|█████████▌| 784/825 [1:03:50<10:45, 15.74s/it]

trying to get fixed json for summary


 96%|█████████▌| 788/825 [1:04:07<05:45,  9.34s/it]

trying to get fixed json for summary


 96%|█████████▋| 795/825 [1:04:31<02:55,  5.86s/it]

trying to get fixed json for summary


 97%|█████████▋| 797/825 [1:04:55<03:19,  7.11s/it]

trying to get fixed json for summary


 97%|█████████▋| 800/825 [1:05:11<02:44,  6.60s/it]

trying to get fixed json for summary


 97%|█████████▋| 803/825 [1:05:30<02:22,  6.48s/it]

trying to get fixed json for summary


 97%|█████████▋| 804/825 [1:05:45<02:38,  7.56s/it]

trying to get fixed json for summary


 98%|█████████▊| 805/825 [1:06:04<03:03,  9.20s/it]

trying to get fixed json for summary


 98%|█████████▊| 810/825 [1:06:22<01:35,  6.34s/it]

trying to get fixed json for summary


 99%|█████████▉| 816/825 [1:06:37<00:40,  4.54s/it]

trying to get fixed json for summary


 99%|█████████▉| 817/825 [1:06:55<00:46,  5.86s/it]

trying to get fixed json for summary


 99%|█████████▉| 818/825 [1:07:17<00:55,  7.90s/it]

trying to get fixed json for summary


100%|█████████▉| 821/825 [1:07:31<00:27,  6.81s/it]

trying to get fixed json for summary


100%|█████████▉| 822/825 [1:07:55<00:27,  9.17s/it]

trying to get fixed json for summary


100%|██████████| 825/825 [1:08:09<00:00,  4.96s/it]


In [163]:
# Peek at the 20 most recently inserted article ids (iterating a dict yields its keys).
list(standardized_responses)[-20:]

['6555ccc44b13023f934acd61',
 '651e0c73a662d76276b86a15',
 '651dd71da662d76276b7e3c2',
 '6555cb9f4b13023f9349fa71',
 '6555cba44b13023f9349fdd1',
 '6555cab84b13023f93495f7c',
 '6555c7ec4b13023f9348ee4a',
 '654cbbcadc4fa72a6c403e10',
 '6555c0084b13023f9348c97b',
 '6555ccd14b13023f934ad8ab',
 '651df9b1a662d76276b83c5d',
 '6555cbf64b13023f934a3718',
 '65316f1c1e5cc42b1b13efc1',
 '651e088ba662d76276b86119',
 '653168bd1e5cc42b1b13b78d',
 '655c5595eee55a44e0ac1e51',
 '6555c84a4b13023f934924ce',
 '651dca7ca662d76276b7c1c9',
 '651df878a662d76276b83979',
 '6555c80e4b13023f934901a6']

In [191]:
# Number of articles that ended up with a standardized response.
len(standardized_responses)

821

In [192]:
# Number of entries collected in `errenous_summaries`.
# NOTE(review): presumably the articles whose summary JSON could not be
# repaired — confirm against the cell that populates it.
len(errenous_summaries)

4

In [195]:
# Hold out the article ids from the first 101 rows of `df` as the test candidates.
test_article_ids = list(df["article_id"].values[:101])

In [196]:
# How many distinct held-out ids actually have a standardized response.
len(set(test_article_ids) & set(standardized_responses))

100

In [197]:
# Training ids: every article with a standardized response that is not one of
# the held-out test ids. Order follows `standardized_responses`.
# The test ids are placed in a set first so each membership check is O(1)
# rather than a linear scan of the list for every key.
_test_id_set = set(test_article_ids)
train_article_ids = [x for x in standardized_responses if x not in _test_id_set]

In [201]:
# Size of the training split: responses whose id is not in `test_article_ids`.
len(train_article_ids)

721

In [202]:
# Restrict the standardized responses to the training ids, keeping their order.
train_standardized_responses = dict(
    (article_id, standardized_responses[article_id])
    for article_id in train_article_ids
)

In [203]:
# Number of entries in the training dict; expected to equal len(train_article_ids).
len(train_standardized_responses)

721

In [204]:
# Persist the training split as a single JSON object keyed by article id.
with open('train_set_full_27_nov.json', mode='w') as train_out:
    json.dump(train_standardized_responses, fp=train_out)

In [205]:
# Build the test split from the held-out ids.
# The ids are taken from `df`, not from the response keys, and the overlap
# check earlier in the notebook shows 100 matches for 101 selected rows, so an
# id may have no standardized response. Such ids are skipped here instead of
# raising KeyError.
test_standardized_responses = {
    k: standardized_responses[k]
    for k in test_article_ids
    if k in standardized_responses
}

In [206]:
# Persist the held-out test split as a single JSON object keyed by article id.
with open('test_set_full_27_nov.json', mode='w') as test_out:
    json.dump(test_standardized_responses, fp=test_out)