In [1]:
import re
import pandas as pd
import numpy as np
import openai
import json
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [2]:
def calculate_tokens(text, encoder):  # Placeholder for your actual token calculation function
    # Your implementation will go here.
    return len(encoder.encode(text))  # Example: counting characters as tokens

def truncate_text_for_token_limit(text,encoder, token_limit):
    # First, check if the whole text is under the token limit
    if calculate_tokens(text, encoder) <= token_limit:
        return text  # The entire text is within the limit

    def is_under_limit(index):
        # Use the provided function to calculate tokens for the substring
        return calculate_tokens(text[:index], encoder) <= token_limit

    left, right = 0, len(text)
    valid_limit = 0  # This will hold the index of the last valid token position

    # Binary search to find the token limit
    while left <= right:
        mid = (left + right) // 2  # Find the midpoint
        if is_under_limit(mid):
            # If the midpoint is under the limit, store it as a valid limit
            valid_limit = mid
            left = mid + 1  # Move the left boundary to the right
        else:
            right = mid - 1  # Move the right boundary to the left

    # Find the last space before the valid_limit to ensure we're at a word boundary
    space_index = text.rfind(' ', 0, valid_limit)
    if space_index == -1:
        # If there's no space, we've hit the start of the text
        return text[:valid_limit]  # Return up to the valid limit even if mid-word

    # Return the text up to the last word within the token limit
    return text[:space_index]

In [3]:
parent_folder = '/Users/username/Desktop/ML/Recommendations/arcane/'
from hydra import compose, initialize
import os
import xml.etree.ElementTree as ET

tree = ET.parse('../../conf/application.run.xml')
root = tree.getroot()

envs_element = root.find('./configuration/envs')
for variable in envs_element.findall('env'):
    name = variable.get('name')
    value = variable.get('value')
    os.environ[name] = value

import sys
sys.path.append('/Users/username/Desktop/ML/Recommendations/arcane/')

import numpy as np
import pandas as pd
import json
from sql.articles.MongoDBArticle import MongoDBArticle
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation

In [4]:
def fetch_documents_after_date(min_published_date):
    query = {"is_premium_article": False, "published_time": {"$gt": min_published_date}, 'content_type': 'ARTICLE'}
    projection = {"_id": 1, "source_id": 1, "title": 1, "tags": 1, "cleaned_text": 1}
    collection = MongoDBArticle.get_collection()
    documents = list(collection.find(query, projection))
    return documents

In [5]:
feb_articles = fetch_documents_after_date('2024-02-01')

In [6]:
len(feb_articles)

41148

In [35]:
for x in feb_articles:
    # x.pop('_id')
    x.pop('source_id')

In [36]:
feb_articles[0]

{'cleaned_text': 'SUMMARY These proposals are in addition to the INR 22,516 Cr chip assembly plant set up by US-based memory chip maker Micron The government has given approval to the Semicon India program, allocating a total budget of INR 76,000 Cr for the development of the semiconductor The shift towards the Indian semiconductor market also appears promising, as it is anticipated to reach a market size of $55 Bn by 2026 FOLLOW US FOLLOW US Added to Saved Stories in Login VIEW SAVED STORIES The government has received four proposals for building semiconductor manufacturing plants and 13 for chip assembly units, the Parliament was informed. These proposals are in addition to the INR 22,516 Cr chip assembly plant set up by US-based memory chip maker Micron, Rajeev Chandrasekhar, Minister of State for Electronics and IT,  said in a written reply in the Lok Sabha on Wednesday (February 7). “Four proposals have been received for setting up semiconductor fabs and additional 13 proposals ha

In [57]:
prompt = '''You are to categorize financial and business news article titles into one of 5 distinct categories based on the article's primary focus and intent. This task requires understanding the context, intent behind the article, its impact, and the target audience.
1. Evaluate Context and Intent: Consider the setting and purpose of the article. Is it reporting news, offering advice, analyzing trends, or discussing impacts of policies?
2. **Determine Impact and Audience**:
    - **company_business_news**: Articles detailing initiatives and actions by companies. This includes any operational news, mergers, acquisitions, leadership changes. This is all non financial news of companies. Specific financial news of companies will come under ‘market_news’.
    - **market_news**: Articles aimed at providing updates on financial markets or insights on investment strategies, market analysis, or reviews of financial products, targeting investors or financial analysts and financial performance of companies. Targeted at active investors. 
    - **economic_policies_trends**: Articles analyzing or reporting on government policies, regulations, and their implications for the economy, businesses, or sectors, of interest to business leaders, policymakers, and informed citizens.
    - **personal_finance**: Articles offering guidance on individual financial management, such as investment for personal goals, savings, taxation, and insurance, directly targeting individual consumers and passive investors. 
    - **non_financial**: Article isn’t a financial article and talks about general politics, movies, sports or any other non financial and business news
3. **Consider the Primary Audience**:
   - Reflect on who will benefit most from the article. company_news is for industry insiders, market strategies for investors, policy impact for those interested in economic and business implications, and personal finance for the general public seeking financial advice.
Your answer must be only one of corporate_developments, market_strategies, economic_policies_trends, personal_finance, non_financial without any preamble or postamble
You are provided with 500 titles along with their article_ids in the format [{"id": "", "title": }]
answer in the format of [{"id": "", "title": "", "category": ""}]. Make sure your answer is directly the list of jsons. I want to parse it using json.loads() and it should work as expected. Also make sure to maintain the same title against the correct id
|titles|
'''

In [8]:
def mask_article_id(art_id):
    first_part = art_id[2:12]
    second_part = art_id[12:]
    masked = second_part + first_part
    return masked

In [9]:
def unmask_article_id(art_id):
    original_first_part = art_id[12:]
    original_second_part = art_id[:12]
    unmasked = '65' + original_first_part + original_second_part
    return unmasked

In [10]:
for art in feb_articles:
    art['article_id'] = str(art['_id'])
    art['masked_article_id'] = mask_article_id(art['article_id'])

In [45]:
# training_articles = np.random.choice(feb_articles, 8000)

# training_articles = list(training_articles)

# test_articles = np.random.choice([x for x in feb_articles if x not in training_articles], 2000)

# test_articles = list(test_articles)

# all_articles = list(training_articles) + list(test_articles)

In [44]:
# with open('title_category_training.json', 'w', encoding='utf-8') as f:
#     json.dump(training_articles, f)

# with open('title_category_test.json', 'w', encoding='utf-8') as f:
#     json.dump(test_articles, f)

### Creation of prompts

In [61]:
chunk_size = 100
num_chunks = int(len(all_articles)/chunk_size)

for i in range(num_chunks):
    chunk = all_articles[chunk_size*i:chunk_size*(i+1)]
    chunk_json_list = [{'id': x['article_id'], 'title': x['title']} for x in chunk]
    chunk_string = json.dumps(chunk_json_list)
    cur_chunk_propmt = prompt + chunk_string

In [62]:
cur_chunk_propmt

'You are to categorize financial and business news article titles into one of 5 distinct categories based on the article\'s primary focus and intent. This task requires understanding the context, intent behind the article, its impact, and the target audience.\n1. Evaluate Context and Intent: Consider the setting and purpose of the article. Is it reporting news, offering advice, analyzing trends, or discussing impacts of policies?\n2. **Determine Impact and Audience**:\n    - **company_business_news**: Articles detailing initiatives and actions by companies. This includes any operational news, mergers, acquisitions, leadership changes. This is all non financial news of companies. Specific financial news of companies will come under ‘market_news’.\n    - **market_news**: Articles aimed at providing updates on financial markets or insights on investment strategies, market analysis, or reviews of financial products, targeting investors or financial analysts and financial performance of com

In [63]:
from datetime import datetime
from tqdm.notebook import tqdm
import json
from openai import OpenAI

client = OpenAI(api_key='')

In [85]:
model = 'gpt-4'

In [65]:
r = client.chat.completions.create(model = 'gpt-4', messages = [{'role': 'system', 'content': prompt}, {'role': 'user', 'content': chunk_string}])

In [122]:
r.usage.completion_tokens, r.usage.prompt_tokens

(3365, 4828)

In [71]:
corrected_r = r.choices[0].message.content[:-13] + ']'

In [73]:
k = json.loads(corrected_r)

In [75]:
len(k)

66

In [78]:
chunk_dict = {x['article_id']: x for x in chunk}

In [100]:
k

[{'id': '65c32a9ac119ff6e57a4414e',
  'title': "Greaves Retail's multi-brand AutoEVMart partners with Zero21",
  'category': 'company_business_news'},
 {'id': '65cc2d1e06df7536a8f68774',
  'title': 'Barclays VP sues for $290,000, alleging discrimination',
  'category': 'company_business_news'},
 {'id': '65c605c3f4fc8477f2ed657f',
  'title': 'RBI mandates lenders to furnish Key Fact Statement: What is KFS and how will it bring transparency? Experts explain | Mint',
  'category': 'economic_policies_trends'},
 {'id': '65cd7ed5abb31841109cda5e',
  'title': 'Breaking Stigmas About STEM',
  'category': 'non_financial'},
 {'id': '65c9fa4cf4fc8477f2ed8093',
  'title': 'Uflex Standalone December 2023 Net Sales at Rs 1,648.67 crore, down 3.3% Y-o-Y',
  'category': 'market_news'},
 {'id': '65c1d2e6c119ff6e57a3f5c6',
  'title': 'mPrest Selected as DERMS Provider for EDF Renewables',
  'category': 'company_business_news'},
 {'id': '65c1d733c119ff6e57a42d90',
  'title': 'The Scotts Miracle-Gro Found

#### Validating if the mapping is correct

In [81]:
for x in k:
    i_d = x['id']
    assert chunk_dict[i_d]['title'] == x['title']

#### Trying with gpt4 turbo

In [84]:
model2 = 'gpt-4-0125-preview'

In [86]:
r_turbo = client.chat.completions.create(model = model2, messages = [{'role': 'system', 'content': prompt}, {'role': 'user', 'content': chunk_string}])

', {"id": "65c1d46ac119ff6e57a40942", "title": "Meta Reports Fourth Quarter and Full Year 2023'

In [95]:
r_turbo.choices[0].message.content[-93:] + ']'

', {"id": "65c1d46ac119ff6e57a40942", "title": "Meta Reports Fourth Quarter and Full Year 2023]'

In [99]:
k_turbo = json.loads(r_turbo.choices[0].message.content[:-93] + ']')

In [101]:
k_turbo

[{'id': '65c32a9ac119ff6e57a4414e',
  'title': "Greaves Retail's multi-brand AutoEVMart partners with Zero21",
  'category': 'company_business_news'},
 {'id': '65cc2d1e06df7536a8f68774',
  'title': 'Barclays VP sues for $290,000, alleging discrimination',
  'category': 'company_business_news'},
 {'id': '65c605c3f4fc8477f2ed657f',
  'title': 'RBI mandates lenders to furnish Key Fact Statement: What is KFS and how will it bring transparency? Experts explain | Mint',
  'category': 'economic_policies_trends'},
 {'id': '65cd7ed5abb31841109cda5e',
  'title': 'Breaking Stigmas About STEM',
  'category': 'non_financial'},
 {'id': '65c9fa4cf4fc8477f2ed8093',
  'title': 'Uflex Standalone December 2023 Net Sales at Rs 1,648.67 crore, down 3.3% Y-o-Y',
  'category': 'market_news'},
 {'id': '65c1d2e6c119ff6e57a3f5c6',
  'title': 'mPrest Selected as DERMS Provider for EDF Renewables',
  'category': 'company_business_news'},
 {'id': '65c1d733c119ff6e57a42d90',
  'title': 'The Scotts Miracle-Gro Found

In [102]:
df = pd.DataFrame(k)

In [103]:
df_turbo = pd.DataFrame(k_turbo)

In [109]:
df_turbo.rename(columns = {'category': 'cat_turbo'})

Unnamed: 0,id,title,cat_turbo
0,65c32a9ac119ff6e57a4414e,Greaves Retail's multi-brand AutoEVMart partne...,company_business_news
1,65cc2d1e06df7536a8f68774,"Barclays VP sues for $290,000, alleging discri...",company_business_news
2,65c605c3f4fc8477f2ed657f,RBI mandates lenders to furnish Key Fact State...,economic_policies_trends
3,65cd7ed5abb31841109cda5e,Breaking Stigmas About STEM,non_financial
4,65c9fa4cf4fc8477f2ed8093,Uflex Standalone December 2023 Net Sales at Rs...,market_news
...,...,...,...
75,65c595f061e5050757966e5b,Squarespace (SQSP) Stock Sinks As Market Gains...,market_news
76,65cadbbbf4fc8477f2ed881c,"Elon Musk Wanted To Sell Tesla To Apple, But C...",non_financial
77,65c211e1c119ff6e57a436dc,Zerodha AMC's LIQUIDCASE hits ₹150 crore AUM i...,personal_finance
78,65c60606f4fc8477f2ed67f1,MAXIMA GRUPĖ receives SBTi validation for its ...,company_business_news


In [110]:
df_merged = pd.merge(df, df_turbo.rename(columns = {'category': 'cat_turbo'}), how = 'outer', on = ['id', 'title'])

In [112]:
df_merged.groupby('cat_turbo')['id'].count()

cat_turbo
company_business_news       17
economic_policies_trends     4
market_news                 49
non_financial                5
personal_finance             5
Name: id, dtype: int64

In [114]:
df_merged['different'] = df_merged['category'] != df_merged['cat_turbo']

In [118]:
df_merged[df_merged.different == True].to_dict('records')

[{'id': '65cc2d5106df7536a8f68a0e',
  'title': 'Millions struggle with student loan repayments amid stubborn inflation',
  'category': 'personal_finance',
  'cat_turbo': 'economic_policies_trends',
  'different': True},
 {'id': '65cc2d0506df7536a8f6862e',
  'title': 'The Fashion and Management Missteps That Left Express Clinging to Solvency',
  'category': 'company_business_news',
  'cat_turbo': 'market_news',
  'different': True},
 {'id': '65c6e744f4fc8477f2ed6e5e',
  'title': 'How to Boost Your Portfolio with Top Oils-Energy Stocks Set to Beat Earnings',
  'category': 'market_news',
  'cat_turbo': 'personal_finance',
  'different': True},
 {'id': '65ced06fabb31841109cee87',
  'title': 'ExxonMobil Obtains Exemption from Canadian Oil and Gas Reporting Obligations',
  'category': 'economic_policies_trends',
  'cat_turbo': 'market_news',
  'different': True},
 {'id': '65c1d759c119ff6e57a42f8c',
  'title': 'AM Best Places Credit Ratings for Several Subsidiaries of The Cigna Group Under Re

- Observing that GPT4 is giving better responses. Hence going with it

In [124]:
def fetch_title_classes(chunk_str, chunk_index):
    r_cur = client.chat.completions.create(model = 'gpt-4', messages = [{'role': 'system', 'content': prompt}, {'role': 'user', 'content': chunk_str}])
    res = r_cur.choices[0].message.content
    try:
        parsed_res = json.loads(res)
    except:
        parsed_res = None
    completion_tokens = r_cur.usage.completion_tokens
    prompt_tokens = r_cur.usage.prompt_tokens
    return chunk_index, res, parsed_res, completion_tokens, prompt_tokens

In [130]:
a,b,c,d,e = fetch_title_classes(chunk_str=chunk_string,chunk_index=99)

In [134]:
e

2641

In [119]:
shortlisted_articles = list(np.random.choice(all_articles, 5000))

In [138]:
chunk_size = 50
num_chunks = int(len(shortlisted_articles)/chunk_size)
chunk_str_dict = {}
for i in range(num_chunks):
    chunk = shortlisted_articles[chunk_size*i:chunk_size*(i+1)]
    chunk_json_list = [{'id': x['article_id'], 'title': x['title']} for x in chunk]
    chunk_string = json.dumps(chunk_json_list)
    chunk_str_dict[i] = chunk_string

In [140]:
title_class_results = {}

In [142]:
import time

In [144]:
s = time.time()
with ThreadPoolExecutor(max_workers=15) as executor:
    futures = [executor.submit(fetch_title_classes, chunkstr, ind) for ind, chunkstr in chunk_str_dict.items()]
    for future in as_completed(futures):
        index, response, parsed_response, num_completion_tokens, num_prompt_tokens = future.result()
        title_class_results[index] = {'res': response, 'parsed_res': parsed_response, 'com_tokens': num_completion_tokens, 'prompt_tokens': num_prompt_tokens}
        print(f'done with index {index} in {time.time() - s} seconds')
    print(f'done in {time.time() - s} seconds')

done with index 4 in 76.01502013206482 seconds
done with index 3 in 76.37697315216064 seconds
done with index 5 in 80.05090093612671 seconds
done with index 1 in 80.75662994384766 seconds
done with index 7 in 94.20979404449463 seconds
done with index 2 in 94.9927008152008 seconds
done with index 14 in 97.59425711631775 seconds
done with index 13 in 97.93184208869934 seconds
done with index 12 in 101.55906510353088 seconds
done with index 11 in 101.64088296890259 seconds
done with index 9 in 110.98307204246521 seconds
done with index 0 in 125.1488950252533 seconds
done with index 6 in 131.43558812141418 seconds
done with index 8 in 134.0529909133911 seconds
done with index 15 in 160.98313283920288 seconds
done with index 18 in 162.55887413024902 seconds
done with index 19 in 180.1961431503296 seconds
done with index 20 in 180.27190494537354 seconds
done with index 25 in 194.939532995224 seconds
done with index 10 in 197.22592616081238 seconds
done with index 17 in 202.9684669971466 seco

NameError: name 'start_time' is not defined

In [145]:
len(title_class_results)

100

In [152]:
path = '/Users/username/Downloads/swift_thinking/emails'

In [163]:
shortlisted_parsed_results = []

In [164]:
for ind, results in title_class_results.items():
    if not results['parsed_res']:
        print(f'not valid json for {ind}')
    else:
        shortlisted_parsed_results.extend(results['parsed_res'])

In [165]:
len(shortlisted_parsed_results)

4985

In [166]:
shortlisted_parsed_results

[{'id': '65c32ac0c119ff6e57a441f5',
  'title': 'Nykredit today announces the Annual Reports for 2023 - Nykredit Realkredit A/S',
  'category': 'company_business_news'},
 {'id': '65c1d469c119ff6e57a40934',
  'title': 'Crown Realty Partners Expands Portfolio With Acquisition of Atrium Building in Mississauga',
  'category': 'company_business_news'},
 {'id': '65cd7eb7abb31841109cd8e5',
  'title': 'Cennox announces a nationwide service contract with FLO, to support reliability in EV charging infrastructure across the US',
  'category': 'company_business_news'},
 {'id': '65c989cbf4fc8477f2ed78f6',
  'title': 'NYKAA Share Price Live blog for 12 Feb 2024 | Mint',
  'category': 'market_news'},
 {'id': '65c4b45e61e5050757966385',
  'title': 'India ranks fourth in the world in Renewable Energy Installed Capacity says PM',
  'category': 'economic_policies_trends'},
 {'id': '65cadb92f4fc8477f2ed860d',
  'title': 'Roc Nation Lands A Multi-Year Deal With Versace That Will Provide Event Sponsorships 

In [168]:
shortlisted_articles_dict = {x['article_id']: x['title'] for x in shortlisted_articles}

### Hallucinations of the ids

In [172]:
wrong_article_ids = []
wrong_titles = []

In [173]:
for x in shortlisted_parsed_results:
    if x['id'] not in shortlisted_articles_dict:
        wrong_article_ids.append(x['id'])
        continue
    if x['title'] != shortlisted_articles_dict[x['id']]:
        wrong_titles.append(x['id'])

In [189]:
shortlisted_parsed_results[0]

{'id': '65c32ac0c119ff6e57a441f5',
 'title': 'Nykredit today announces the Annual Reports for 2023 - Nykredit Realkredit A/S',
 'category': 'company_business_news'}

In [174]:
len(wrong_article_ids)

5

In [175]:
len(wrong_titles)

2

In [179]:
invalid_article_ids = wrong_article_ids + wrong_titles

In [180]:
results_df = pd.DataFrame(shortlisted_parsed_results)

In [181]:
results_df = results_df[~results_df.id.isin(invalid_article_ids)]

In [184]:
results_df['test_flag'] = [1 if np.random.rand() <= 0.2 else 0 for _ in range(len(results_df)) ]

In [188]:
results_df.to_csv('titles_classes_gpt4_results.csv',index = False)

In [187]:
results_df.groupby(['test_flag', 'category'])['category'].count()

test_flag  category                
0          company_business_news       1407
           economic_policies_trends     530
           market_news                 1692
           non_financial                188
           personal_finance             146
1          company_business_news        340
           economic_policies_trends     115
           market_news                  450
           non_financial                 61
           personal_finance              44
Name: category, dtype: int64

#### Total cost

In [193]:
total_prompts = 0
total_completions = 0
for ind, results in title_class_results.items():
    total_prompts += results['prompt_tokens'] 
    total_completions += results['com_tokens'] 

In [195]:
total_cost = (0.06 * total_completions/1000) + (0.03 * total_prompts/1000)

In [196]:
total_cost

23.13195