In [1]:
parent_folder = '/Users/ravi.tej/Desktop/ML/Recommendations/arcane/'
from hydra import compose, initialize
import os

import xml.etree.ElementTree as ET

tree = ET.parse('../../../conf/application.run.xml')
root = tree.getroot()

envs_element = root.find('./configuration/envs')
for variable in envs_element.findall('env'):
    name = variable.get('name')
    value = variable.get('value')
    os.environ[name] = value

import sys
sys.path.append('/Users/ravi.tej/Desktop/ML/Recommendations/arcane/')
import pandas as pd

os.environ['PATH'] = '/Users/ravi.tej/anaconda3/envs/bertopicenv/bin:/Users/ravi.tej/anaconda3/condabin:/usr/bin:/bin:/usr/sbin:/sbin'

from src.articles.SummaryService import SummaryService

from src.llm_finetuning.ResponseCleanerService import ResponseCleanerService

In [2]:
from sql.articles.MongoDBArticle import MongoDBArticle

In [3]:
from datetime import datetime

In [4]:
def fetch_summarized_articles(limit=None):
    query = {
        "is_premium_article": False,
        "content_type": 'ARTICLE',
        "cleaned_text": {"$exists": True, "$ne": None},
        "ai_generated_info.summary.OpenHermes_WatermelonUvulaMarigold": {"$exists": True},  # Check for the existence of the special_key,
        "ai_generated_info.summary.OpenHermes_WatermelonUvulaMarigold.generated_at": {"$lt": datetime(2024, 2, 27, 9, 0, 0)}
    }
    collection = MongoDBArticle.get_collection()
    if isinstance(limit, int):
        documents = list(collection.find(query).limit(limit))
    else:
        documents = list(collection.find(query))
    for doc in documents:
        doc['article_id'] = str(doc['_id'])
        doc['source_id'] = str(doc['source_id'])
    return documents

In [5]:
summarized_docs = fetch_summarized_articles()

In [6]:
len(summarized_docs)

0

In [7]:
import re

def convert_to_dicts(input_str):
    # Regular expression to match each point
    point_pattern = re.compile(r'!(?P<emoji>.+?)! (?P<label>[^:!]+)(?:\: (?P<point>[^!]+))?')
    dicts = []

    for match in point_pattern.finditer(input_str):
        emoji = match.group('emoji').strip()
        label = match.group('label').strip()
        point_text = match.group('point').strip() if match.group('point') else ""

        # Validate emoji and ensure label and point are not empty
        if not any(ord(char) > 127 for char in emoji):
            return []
        if not label:
            return []
        if not point_text:
            return []

        dicts.append({'emoji': emoji, 'label': label, 'point': point_text})

    return dicts

input_str = "!💼! Company Performance: Nvidia's revenue soared by 265% YoY, causing a 9% surge in its shares; !📉! Market Share: Nvidia's Chinese market now contributes a mid-single digit percentage to its data center revenue; !⚠️! Export Controls: U.S. export restrictions on advanced semiconductors have forced Nvidia to halt product offerings in China, leading to a dip in its Chinese data center revenue."
dicts = convert_to_dicts(input_str)
print(dicts)

[{'emoji': '💼', 'label': 'Company Performance', 'point': "Nvidia's revenue soared by 265% YoY, causing a 9% surge in its shares;"}, {'emoji': '📉', 'label': 'Market Share', 'point': "Nvidia's Chinese market now contributes a mid-single digit percentage to its data center revenue;"}, {'emoji': '⚠️', 'label': 'Export Controls', 'point': 'U.S. export restrictions on advanced semiconductors have forced Nvidia to halt product offerings in China, leading to a dip in its Chinese data center revenue.'}]


In [8]:
import json

In [9]:
def extract_new_cleaned_summary_for_doc(doc):
    original_string = ResponseCleanerService.convert_summary_dict_to_str_format(json.loads(doc['ai_generated_info']['summary']['OpenHermes_WatermelonUvulaMarigold']['value']))
    new_summary = convert_to_dicts(original_string)
    assert type(new_summary) == list
    if new_summary:
        assert len(new_summary) >= 1
    return new_summary

In [10]:
bad_cases = []

In [11]:
new_summaries = {}

In [12]:
headlines = {str(x['_id']): x['ai_generated_info']['title']['OpenHermes_WatermelonUvulaMarigold']['value'] for x in summarized_docs}

In [13]:
for i in range(len(summarized_docs)):
    art_id = str(summarized_docs[i]['_id'])
    try:
        new_summaries[art_id] = extract_new_cleaned_summary_for_doc(summarized_docs[i])
    except:
        bad_cases.append(i)

In [14]:
new_summaries['6536802a1e5cc42b1b1441a2']

[{'emoji': '🚗',
  'label': 'Renewal Process',
  'point': 'Renewing car insurance involves evaluating your policy, comparing prices, updating details, paying the premium, and receiving a renewal confirmation. Documents required include address proof, government ID proof, vehicle registration number, copy of registration certificate, pollution test certificate, card details for online payment, and existing policy number.'},
 {'emoji': '💼',
  'label': 'Importance',
  'point': 'Renewing car insurance ensures continuous coverage, legal compliance, financial safety, access to no-claim bonus, customizable coverage options, and opportunities for discounts.'},
 {'emoji': '🔎',
  'label': 'How it Works',
  'point': 'Car insurance renewals can be completed online or offline and usually offer benefits like no-claim bonuses and discounts.'}]

In [15]:
from tqdm.notebook import tqdm

In [16]:
llm_finetune_id = 'OpenHermes_WatermelonUvulaMarigold'

In [17]:
from src.articles.ArticleAttributesService import ArticleAttributesService

In [18]:
def update_response(art_id):
    new_summary = json.dumps(new_summaries[art_id])
    ArticleAttributesService.save_summary_to_mongo(article_id=art_id,
                                                       article_summary=new_summary,
                                                       article_title=headlines[art_id],
                                                       finetune_id=llm_finetune_id)

In [20]:
from concurrent.futures import ThreadPoolExecutor, as_completed

chunk_size = 40
num_chunks = len(new_summaries)//chunk_size + 1

In [21]:
pending_articles = list(new_summaries.keys())

In [22]:
import time

In [23]:
from tqdm.notebook import tqdm

In [24]:
for i in tqdm(range(10)):
    pass

  0%|          | 0/10 [00:00<?, ?it/s]

In [25]:
for i in tqdm(range(num_chunks)):
    article_chunk = pending_articles[i*chunk_size:(i+1)*chunk_size]
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(update_response, art_id) for art_id in article_chunk]
        for future in as_completed(futures):
            res = future.result()
    print(f'completed chunk {(i + 1) * chunk_size} articles at {"{:%b %d, %Y %H:%M}".format(datetime.now())} in {int(time.time() - start_time)/60} minutes')

  0%|          | 0/735 [00:00<?, ?it/s]

completed chunk 40 articles at Feb 27, 2024 15:18 in 0.03333333333333333 minutes
completed chunk 80 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 120 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 160 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 200 articles at Feb 27, 2024 15:18 in 0.03333333333333333 minutes
completed chunk 240 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 280 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 320 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 360 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 400 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 440 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
completed chunk 480 articles at Feb 27, 2024 15:18 in 0.016666666666666666 minutes
complete

In [None]:
with ThreadPoolExecutor(max_workers=10) as executor:
    

<Response [200]>

In [51]:
len(new_summaries)

33610

In [54]:
art_id

'65d1e32fabb31841109d0bd8'

In [55]:
completed = 0
for i in new_summaries:
    if i == art_id:
        break
    else:
        completed += 1

In [1]:
from tqdm import tqdm

In [56]:
completed

1173

In [43]:
new_summary

'[{"emoji": "\\ud83d\\udcb0", "label": "Equity dominance", "point": "Individual investors hold 88% of their assets in equity schemes, as per Jan AMFI data. Institutional investors dominate liquid and debt schemes."}, {"emoji": "\\ud83d\\udcc8", "label": "Asset surge", "point": "Total assets saw a significant 29.62% increase, with individual investors experiencing a 35.92% jump. Institutions saw a 17.42% rise."}, {"emoji": "\\ud83d\\udcda", "label": "Equity fund dominance", "point": "Equity funds have emerged as dominant among total mutual fund assets, with a 56.9% share in Jan 2024."}]'

In [36]:
len(bad_cases)

0

In [None]:
summ = '[{"emoji": "\ud83d\udcbc", "label": "Business", "point": "GPT Healthcare IPO opened on 22nd February 2024 and will remain open till 26th February 2024; !\ud83d\udcc8! Finance: The company has fixed the issue price at \u20b9177 to \u20b9186 per equity share; !\ud83d\udcb9! Market Performance: After two days of bidding, the public issue has been subscribed 0.85 times; the grey market premium is currently \u20b913. !\ud83d\udca1! Analyst View: Brokerage firms view GPT Healthcare positively due to its strong positioning in underserved markets and potential growth."}]'

In [None]:
import json

In [None]:
t = json.loads(summ)

In [None]:
t[0]

{'emoji': '\ud83d\udcbc',
 'label': 'Business',
 'point': 'GPT Healthcare IPO opened on 22nd February 2024 and will remain open till 26th February 2024; !\ud83d\udcc8! Finance: The company has fixed the issue price at ₹177 to ₹186 per equity share; !\ud83d\udcb9! Market Performance: After two days of bidding, the public issue has been subscribed 0.85 times; the grey market premium is currently ₹13. !\ud83d\udca1! Analyst View: Brokerage firms view GPT Healthcare positively due to its strong positioning in underserved markets and potential growth.'}

In [None]:
string = f"!{t[0]['emoji']}! {t[0]['label']}: {t[0]['point']}"

In [None]:
string

'!\ud83d\udcbc! Business: GPT Healthcare IPO opened on 22nd February 2024 and will remain open till 26th February 2024; !\ud83d\udcc8! Finance: The company has fixed the issue price at ₹177 to ₹186 per equity share; !\ud83d\udcb9! Market Performance: After two days of bidding, the public issue has been subscribed 0.85 times; the grey market premium is currently ₹13. !\ud83d\udca1! Analyst View: Brokerage firms view GPT Healthcare positively due to its strong positioning in underserved markets and potential growth.'

In [12]:
def convert_summary_to_dicts(s):
    # Split the string into parts based on "\n" or ";" to get each point
    points = re.split(r'\n|; ', s)
    result = []

    for point in points:
        # Split each point into emoji and the rest based on "!!"
        emoji_split = point.split("!", 2)
        if len(emoji_split) < 3:
            continue  # Skip if the format does not match expected

        emoji, rest = emoji_split[1], emoji_split[2]
        # Split the rest into label and point based on the first ":"
        label_point_split = rest.split(":", 1)
        if len(label_point_split) < 2:
            continue  # Skip if the format does not match expected

        label, point = label_point_split[0].strip(), label_point_split[1].strip()
        # Append the result as a dict to the result list
        result.append({"emoji": emoji, "label": label, "point": point})

    return result

# Test the function with the original string
summary = '!\ud83d\udcbc! Business: GPT Healthcare IPO opened on 22nd February 2024 and will remain open till 26th February 2024; \n !\ud83d\udcc8! Finance: The company has fixed the issue price at ₹177 to ₹186 per equity share; \n !\ud83d\udcb9! Market Performance: After two days of bidding, the public issue has been subscribed 0.85 times \n !\ud83d\udca1! Analyst View: Brokerage firms view GPT Healthcare positively due to its strong positioning in underserved markets and potential growth.'
result = convert_summary_to_dicts(summary)
print(result)

[{'emoji': '\ud83d\udcbc', 'label': 'Business', 'point': 'GPT Healthcare IPO opened on 22nd February 2024 and will remain open till 26th February 2024'}, {'emoji': '\ud83d\udcc8', 'label': 'Finance', 'point': 'The company has fixed the issue price at ₹177 to ₹186 per equity share'}, {'emoji': '\ud83d\udcb9', 'label': 'Market Performance', 'point': 'After two days of bidding, the public issue has been subscribed 0.85 times'}, {'emoji': '\ud83d\udca1', 'label': 'Analyst View', 'point': 'Brokerage firms view GPT Healthcare positively due to its strong positioning in underserved markets and potential growth.'}]


In [13]:
k = ResponseCleanerService.convert_summary_dict_to_str_format(result)

In [14]:
k

'!\ud83d\udcbc! Business: GPT Healthcare IPO opened on 22nd February 2024 and will remain open till 26th February 2024 \n !\ud83d\udcc8! Finance: The company has fixed the issue price at ₹177 to ₹186 per equity share \n !\ud83d\udcb9! Market Performance: After two days of bidding, the public issue has been subscribed 0.85 times \n !\ud83d\udca1! Analyst View: Brokerage firms view GPT Healthcare positively due to its strong positioning in underserved markets and potential growth.'

In [16]:
from src.llm_finetuning.ResponseCleanerService import ResponseCleanerService

In [15]:
ResponseCleanerService.validate_summary_emojis(k)

True

### Current summarized docs

In [147]:
bad_cases[4]

63

In [None]:
convert_summary_to_dicts("!💊! Key Details: Asieris, Photocure's partner, is to reveal the first outcomes of the international multicenter Phase III clinical study for Cevira, a non-surgical treatment for cervical HSIL at the 2024 EUROGIN; The presentation will be an oral one at the conference; Cevira, a groundbreaking photodynamic drug-device combination product, is in development for non-surgical treatment of high-grade cervical pre-cancer lesions.")

[{'emoji': '💊',
  'label': 'Key Details',
  'point': "Asieris, Photocure's partner, is to reveal the first outcomes of the international multicenter Phase III clinical study for Cevira, a non-surgical treatment for cervical HSIL at the 2024 EUROGIN"}]

In [None]:
ResponseCleanerService.convert_summary_dict_to_str_format(json.loads(summarized_docs[16]['ai_generated_info']['summary']['OpenHermes_WatermelonUvulaMarigold']['value']))

"!💊! Key Details: Asieris, Photocure's partner, is to reveal the first outcomes of the international multicenter Phase III clinical study for Cevira, a non-surgical treatment for cervical HSIL at the 2024 EUROGIN; The presentation will be an oral one at the conference; Cevira, a groundbreaking photodynamic drug-device combination product, is in development for non-surgical treatment of high-grade cervical pre-cancer lesions."

In [None]:
len(bad_cases)

1729

In [None]:
convert_to_dicts("!💼! Company Performance: Nvidia's revenue soared by 265% YoY, causing a 9% surge in its shares; !📉! Market Share: Nvidia's Chinese market now contributes a mid-single digit percentage to its data center revenue; !⚠️! Export Controls: U.S. export restrictions on advanced semiconductors have forced Nvidia to halt product offerings in China, leading to a dip in its Chinese data center revenue.")

[{'emoji': '💼',
  'label': 'Company Performance',
  'point': "Nvidia's revenue soared by 265% YoY, causing a 9% surge in its shares; !📉! Market Share: Nvidia's Chinese market now contributes a mid-single digit percentage to its data center revenue; !⚠️! Export Controls: U.S. export restrictions on advanced semiconductors have forced Nvidia to halt product offerings in China, leading to a dip in its Chinese data center revenue."}]

In [None]:
summarized_docs[2]['ai_generated_info']['summary']['OpenHermes_WatermelonUvulaMarigold']['value']

'[{"emoji": "\\ud83d\\udcbc", "label": "Business", "point": "Interiors and More IPO allotment finalised; check status on Bigshare Services."}, {"emoji": "\\ud83d\\udcc8", "label": "Finance", "point": "IPO subscribed 11.22 times, raising \\u20b911.94 crore from anchor investors."}, {"emoji": "\\ud83d\\udcca", "label": "Market", "point": "Listing price estimated at \\u20b9250 per share after grey market premium."}]'

In [None]:
ResponseCleanerService.convert_summary_dict_to_str_format(extract_new_cleaned_summary_for_doc(summarized_docs[100]))

"!🏠! Project Update: The Greater Noida Industrial Development Authority (GNIDA) has enabled the registration of 6,500 stalled housing project flats. \n !💰! Financial Details: Developers of nearly 30 stuck projects have agreed to pay 25% of their total dues, amounting to Rs 350 crore, to become eligible for the registry. \n !🔍! Package Details: The Uttar Pradesh government's rehabilitation package offers relief from penalties and interest on the unpaid dues. Registration process is expected to be completed in 1-2 months."

In [None]:
summarized_docs

[{'_id': ObjectId('65d5f3cd77ebd656734cf398'),
  'url': 'https://www.livemint.com/industry/agriculture/farmers-protest-msp-law-diversification-farmers-will-shift-from-paddy-wheat-only-when-agri-expert-explain-11708516205860.html',
  'published_time': '2024-02-21T18:10:26+05:30',
  'last_updated_time': '2024-02-21T18:10:26+05:30',
  'source_id': '650046bd005149c49201269f',
  'image_url': 'https://www.livemint.com/lm-img/img/2024/02/21/1600x900/Farmer-leader-and-National-spokesperson-of-the-Bha_1708516336286_1708516346502.jpg',
  'authors': ['Akriti Anand'],
  'reactions': {},
  'content_type': 'ARTICLE',
  '_class': 'com.wintWealth.insight.common.models.entities.Article',
  'cleaned_text': 'Farmer\'s demand for legal guarantee for Minimum Support Price (MSP) on all 23 crops sparked debate around crop diversification and its benefits. SKM (Non-Political) leader Jagjit Singh Dallewal said the Centre\'s proposal of buying five crops at the MSP will only be meant for those who go for crop d