In [1]:
import boto3
import pandas as pd
from bertopic import BERTopic

parent_folder = '/Users/username/Desktop/ML/Recommendations/arcane/'
from hydra import compose, initialize
import os

import xml.etree.ElementTree as ET

tree = ET.parse('../../conf/application.run.xml')
root = tree.getroot()

envs_element = root.find('./configuration/envs')
for variable in envs_element.findall('env'):
    name = variable.get('name')
    value = variable.get('value')
    os.environ[name] = value

import sys
sys.path.append('/Users/username/Desktop/ML/Recommendations/arcane/')

from src._utils import load_bertopic_model_from_hf

import json
from io import StringIO
import pickle
from smart_open import open
from bertopic.backend._sentencetransformers import SentenceTransformerBackend

In [356]:
import requests

In [362]:
r = requests.post('http://Arcane-env.eba-mrsaixmg.ap-south-1.elasticbeanstalk.com/generate_new_clusters_to_s3')

In [360]:
r.text

'<html>\r\n<head><title>504 Gateway Time-out</title></head>\r\n<body>\r\n<center><h1>504 Gateway Time-out</h1></center>\r\n<hr><center>nginx</center>\r\n</body>\r\n</html>\r\n'

In [2]:
from src.articles.ArticleService import ArticleService

In [3]:
s3_bucket = 'insight-ml-models'
s3_bertopic_folder = 'bertopic'

In [451]:
run_id = 'YellowLicoriceXray'

In [5]:
def load_bertopic_model_from_s3(run_id: str, hf_embedding_model_name: str) -> BERTopic:
    filename = 'BERTopic_' + run_id
    foldername = run_id
    s3_path = f's3://{s3_bucket}/{s3_bertopic_folder}/{foldername}/{filename}'

    # Stream the model directly from S3
    with open(s3_path, 'rb',
              transport_params={'client': boto3.client('s3',
                                                       aws_access_key_id=os.environ['AWS_ACCESS_KEY'],
                                                       aws_secret_access_key=os.environ['AWS_SECRET_KEY'])}) as f:
        model = pickle.load(f)
    print('bertopic streamed successfully')

    assert isinstance(model, BERTopic), f"Failed to load model from S3. Model {filename} is not of type BERTopic."
    model.embedding_model = SentenceTransformerBackend(embedding_model=hf_embedding_model_name)
    return model

In [6]:
def load_json_from_s3(run_id: str, json_file_name: str) -> BERTopic:
    foldername = run_id
    s3_resource = boto3.resource('s3', aws_access_key_id=os.environ['AWS_ACCESS_KEY'], aws_secret_access_key=os.environ['AWS_SECRET_KEY'])
    response = s3_resource.Object(s3_bucket, f'{s3_bertopic_folder}/{foldername}/{json_file_name}.json').get()
    return json.loads(response['Body'].read().decode('utf-8'))
    # return json.dumps(response)

In [7]:
def load_csv_from_s3(run_id: str, csv_file_name: str) -> BERTopic:
    foldername = run_id
    s3_resource = boto3.resource('s3', aws_access_key_id=os.environ['AWS_ACCESS_KEY'], aws_secret_access_key=os.environ['AWS_SECRET_KEY'])
    response = s3_resource.Object(s3_bucket, f'{s3_bertopic_folder}/{foldername}/{csv_file_name}.csv').get()
    csv_string = response['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(csv_string))
    # return json.dumps(response)

In [452]:
article_story_cluster_mapping = load_json_from_s3(run_id = run_id, json_file_name='article_story_cluster_mapping')
story_cluster_mapping = load_json_from_s3(run_id = run_id, json_file_name='story_cluster_mapping')
story_embeddings = load_json_from_s3(run_id = run_id, json_file_name='story_embeddings')
df = load_csv_from_s3(run_id=run_id, csv_file_name='cluster_hierarchy')

In [453]:
clusters = list(set(story_cluster_mapping.values()))

In [454]:
outliers = {}
for article_id in article_story_cluster_mapping:
    if article_story_cluster_mapping[article_id]['storyline_id'] == -1:
        outliers.setdefault(article_story_cluster_mapping[article_id]['cluster_id'], []).append(article_id)

In [455]:
cluster_count = {}
for cluster in outliers:
    cluster_count[cluster] = len(outliers[cluster])

In [456]:
outliers_count_df = pd.DataFrame([(key, value) for key, value in cluster_count.items()], columns = ['cluster_id', 'outlier_count'])

In [457]:
df['parent_id'] = df['parent_id'].astype('int')

In [458]:
df = pd.merge(df, outliers_count_df, how = 'left', left_on = 'parent_id', right_on = 'cluster_id')

In [459]:
df['total_count'] = df['num_docs'] + df['outlier_count']
df['outlier_percent'] = df['outlier_count']/df['total_count']

In [460]:
df[df.cluster_id.notnull()].sort_values('total_count', ascending = False)

Unnamed: 0.1,Unnamed: 0,parent_id,parent_name,child_storyline_list,child_left_id,child_left_name,child_right_id,child_right_name,distance,level,num_docs,doc_list,cluster_id,outlier_count,total_count,outlier_percent
28,28,254,target rs_target_pharma_results_profit,"[8, 9, 12, 26, 29, 38, 48, 55, 57, 63, 65, 86,...",160.0,finance_bajaj finance_bajaj_finance share_hous...,251.0,target rs_target_pharma_results_dividend,1.089306,6.0,9259.0,"[308, 1108, 2227, 2881, 3390, 3692, 5023, 5492...",254.0,15199.0,24458.0,0.621433
26,26,256,mind_mind markets_innerworth mind_innerworth_c...,"[5, 24, 31, 39, 49, 52, 56, 61, 72, 76, 78, 89...",249.0,innerworth_innerworth mind_mind markets_mind_c...,214.0,job_retirement_money_work_people,1.100493,10.0,7517.0,"[83, 444, 546, 609, 3018, 3499, 3709, 3884, 41...",256.0,7897.0,15414.0,0.512326
9,9,273,insurance_rbi_tax_rates_credit,"[6, 18, 19, 25, 34, 37, 41, 47, 59, 62, 66, 82...",260.0,rbi_rates_credit_loan_mortgage,268.0,insurance_tax_gst_income tax_income,1.282598,5.0,8049.0,"[67, 127, 521, 1179, 1568, 1870, 2438, 4065, 4...",273.0,3921.0,11970.0,0.327569
142,142,0,0_2023 net_net sales_sales rs_sales,[],,,,,,4.0,11562.0,"[12, 25, 41, 42, 48, 101, 103, 123, 136, 146, ...",0.0,2.0,11564.0,0.000173
29,29,253,ipo_sebi_gmp_status_allotment,"[1, 11, 15, 22, 44, 51, 69, 74, 77, 79, 110, 1...",252.0,ipo_gmp_status_allotment_coal,182.0,sebi_zee_mint sebi_entertainment_zee entertain...,1.081899,12.0,8766.0,"[445, 856, 973, 1350, 1474, 1976, 2079, 2148, ...",253.0,1825.0,10591.0,0.172316
15,15,267,oil_china_oil prices_pm_prices,"[7, 16, 33, 35, 36, 87, 109, 117]",262.0,oil_china_oil prices_prices_opec,191.0,pm_modi_g20_pm modi_summit,1.21484,8.0,4670.0,"[56, 238, 286, 294, 364, 794, 841, 1260, 1444,...",267.0,2496.0,7166.0,0.348311
4,4,278,mutual_funds_fund_definition_mutual funds,"[13, 30, 43, 50, 53, 98, 129, 132, 137]",195.0,mutual_funds_fund_mutual funds_mutual fund,184.0,definition_example_examples_formula_trading,1.414653,3.0,3716.0,"[68, 326, 1128, 1212, 3348, 3726, 4136, 4412, ...",278.0,3403.0,7119.0,0.478017
39,39,243,air_air india_musk_tesla_byju,"[3, 14, 32, 45, 68, 70, 71, 103, 114]",238.0,air_air india_byju_indigo_flight,172.0,tesla_ev_electric_ford_ola,1.031884,12.0,5420.0,"[30, 248, 537, 2221, 3618, 3840, 3886, 5057, 5...",243.0,673.0,6093.0,0.110455
11,11,271,dollar_gold_fed_rupee_rate,"[10, 27, 58, 64, 81, 88, 95, 120, 121, 128, 139]",250.0,dollar_rupee_fed_inflation_paise,171.0,gold_silver_mint gold_gold silver_prices,1.267402,2.0,4044.0,"[20, 398, 612, 669, 840, 872, 1002, 1187, 1299...",271.0,451.0,4495.0,0.100334
24,24,258,adani_bank_hdfc_stocks watch_mint adani,"[17, 21, 42, 73, 75, 80, 101, 135, 141]",247.0,adani_stocks watch_jio_mint adani_adani ports,187.0,bank_hdfc_hdfc bank_bank target_kotak,1.112572,6.0,3694.0,"[164, 440, 482, 616, 721, 747, 1132, 1468, 150...",258.0,287.0,3981.0,0.072092


### Identifying stories which should not be part of the clusters

In [222]:
df[df.parent_id.isin([17, 41, 51, 179, 310, 345, 361])]

Unnamed: 0.1,Unnamed: 0,parent_id,parent_name,child_storyline_list,child_left_id,child_left_name,child_right_id,child_right_name,distance,level,num_docs,doc_list,cluster_id,outlier_count,total_count,outlier_percent
405,405,17,17_daily voice_voice_daily_manager,[],,,,,,18.0,539.0,"[756, 1121, 1350, 1365, 1407, 1502, 1858, 1948...",,,,
429,429,41,41_jpmorgan_republic_goldman_svb,[],,,,,,18.0,317.0,"[113, 306, 1865, 2060, 2445, 2498, 2760, 2910,...",,,,
439,439,51,51_cnbc daily_daily open_daily_cnbc,[],,,,,,18.0,289.0,"[106, 183, 298, 366, 2249, 2943, 3769, 4538, 4...",,,,
567,567,179,179_technical view_view_view nifty_technical,[],,,,,,18.0,128.0,"[938, 1027, 2134, 2320, 2483, 3238, 4333, 7776...",,,,
698,698,310,310_manual_manual nifty_nifty_strike manual,[],,,,,,18.0,69.0,"[2014, 2759, 3055, 13200, 15411, 22807, 22848,...",,,,
733,733,345,345_inclusion_jpmorgan_indian bonds_index,[],,,,,,18.0,59.0,"[12695, 13305, 21444, 21625, 22275, 28823, 293...",,,,
749,749,361,361_day mood_mid day_mood_mid,[],,,,,,17.0,54.0,"[1006, 2530, 6459, 8335, 30295, 30873, 32484, ...",,,,


In [191]:
import numpy as np

In [195]:
random_businessworld_articles = np.random.choice([x for x in article_story_cluster_mapping if article_story_cluster_mapping[x]['storyline_id'] in [77, 102, 129, 193, 209, 215, 286, 302]], 100)

In [196]:
bw_articles = {}

In [197]:
from tqdm.notebook import tqdm

In [202]:
for art in tqdm(random_businessworld_articles):
    article = ArticleService.get_article_json_from_s3_and_api(art)
    bw_articles[art] = {}
    bw_articles[art]['article'] = article
    bw_articles[art]['title'] = article['title']

  0%|          | 0/100 [00:00<?, ?it/s]

In [207]:
bw_articles['6555c0bf4b13023f9348cc9c']

{'article': {'meta_data': {'is_premium_article': False,
   'title': 'Filinvest Land expects higher earnings this year - BusinessWorld Online',
   'short_description': 'LISTED property developer Filinvest Land, Inc. (FLI) is bullish about its prospects for the rest of the year and expects profit growth for 2023 as the economy improves. “The Philippines is growing, and we are optimistic that we will grow more unless there is anything happening outside of the Philippines. Here within the country, our […]',
   'published_time': '2023-08-28T16:05:39+00:00',
   'last_updated_time': '2023-08-28T11:18:48+00:00',
   'image_url': 'https://www.bworldonline.com/wp-content/uploads/2021/11/Filinvest-Land-logo.jpg',
   'authors': ['CEDadiantiTyClea'],
   'tags': [],
   'read_time': '2 minutes',
   'is_document': True},
  'cleaned_text': 'LISTED property developer Filinvest Land, Inc. (FLI) is bullish about its prospects for the rest of the year and expects profit growth for 2023 as the economy improv

In [205]:
[(i, x['title']) for i, x in bw_articles.items()]

[('657135c953e8a6117bd31f06',
  'DoE calls for accelerated retirement, repurposing of coal-fired power plants - BusinessWorld Online'),
 ('6555bea84b13023f9348c4ba',
  'T-bill, bond yields likely to inch higher - BusinessWorld Online'),
 ('6555c1f24b13023f9348d0f0',
  'Peso may move sideways vs dollar ahead of inflation data - BusinessWorld Online'),
 ('6555c12d4b13023f9348ce74',
  'Yields on gov’t debt rise - BusinessWorld Online'),
 ('6555cc344b13023f934a63d5',
  'Rice prices soar, fanning fears of food inflation spike in Asia'),
 ('6555c0b84b13023f9348cc82',
  'Peso strengthens on dovish Fed minutes - BusinessWorld Online'),
 ('6555bec24b13023f9348c530',
  'DoE says still on track to hit renewables goal - BusinessWorld Online'),
 ('6555c8584b13023f93492638',
  'Maynilad’s modular plant starts treating water - BusinessWorld Online'),
 ('6555c47d4b13023f9348d8ba',
  'Maynilad sets aside P1.14B to upgrade water treatment plants - BusinessWorld Online'),
 ('6555c4e94b13023f9348da91',
  

In [190]:
[x for x in article_story_cluster_mapping if article_story_cluster_mapping[x]['storyline_id'] in [77, 102, 129, 193, 209, 215, 286, 302]]

['6555c35c4b13023f9348d419',
 '6555c05e4b13023f9348cb0b',
 '6555beaf4b13023f9348c4e3',
 '6555c4844b13023f9348d8cc',
 '6555c2dc4b13023f9348d30e',
 '6560dd95e1d8342798a474a3',
 '6555c0e64b13023f9348cd40',
 '6555c1374b13023f9348cea0',
 '6555c03e4b13023f9348ca8e',
 '6555c0774b13023f9348cb69',
 '6555c4664b13023f9348d863',
 '6555c9c24b13023f93492bb1',
 '6555c0c44b13023f9348ccb8',
 '6555bfd84b13023f9348c8ab',
 '6555c4fe4b13023f9348daeb',
 '6555c3504b13023f9348d3fd',
 '6555c0524b13023f9348cada',
 '6555c0094b13023f9348c981',
 '6566a9b239bfc8784efe5e2d',
 '6555c40d4b13023f9348d6d1',
 '6555c08d4b13023f9348cbd1',
 '6555bf844b13023f9348c76d',
 '6555c0654b13023f9348cb1a',
 '6555c4fe4b13023f9348daea',
 '6555c23f4b13023f9348d1b6',
 '6555c2634b13023f9348d214',
 '656558711fc4586a032ad7f9',
 '6555c4c64b13023f9348d9f1',
 '6555c4884b13023f9348d8de',
 '6555c42c4b13023f9348d76f',
 '6560cf2fe1d8342798a45498',
 '6555c0a24b13023f9348cc2d',
 '6531693c1e5cc42b1b13d749',
 '6555c07c4b13023f9348cb82',
 '65686bb0207b

In [199]:
k = ArticleService.get_article_json_from_s3_and_api('6555bfd84b13023f9348c8ab')

In [200]:
k.keys()

dict_keys(['meta_data', 'cleaned_text', 'article_images', 'articleId', 'url', 'title', 'shortDescription', 'publishedTime', 'lastUpdatedTime', 'tags', 'articleImageUrl', 'category', 'authors', 'isPremiumArticle', 'source', 'sourceId', 'sourceName', 'sourceLogo'])

In [77]:
stop_words = ['cnbc', 'businessworld']

In [164]:
# exclusion_story_ids = [1, 45, 320, 326, 347, 366, 370, 30, 33, 44, 55, 67, 105, 209, 216, 309, 74, 234]

In [208]:
exclusion_story_ids = [77, 102, 129, 193, 209, 215, 286, 302]

In [209]:
df[df.parent_id.isin(exclusion_story_ids)]

Unnamed: 0.1,Unnamed: 0,parent_id,parent_name,child_storyline_list,child_left_id,child_left_name,child_right_id,child_right_name,distance,level,num_docs,doc_list,cluster_id,outlier_count,total_count,outlier_percent
465,465,77,77_businessworld online_businessworld_online_acen,[],,,,,,8.0,240.0,"[75, 122, 2595, 3037, 3601, 3771, 4678, 4733, ...",,,,
490,490,102,102_businessworld online_businessworld_online_...,[],,,,,,8.0,194.0,"[161, 334, 922, 1128, 1138, 1520, 2010, 3108, ...",,,,
517,517,129,129_bsp_businessworld online_businessworld_online,[],,,,,,8.0,162.0,"[1325, 2980, 3506, 3767, 3896, 4064, 4862, 490...",,,,
581,581,193,193_rice_export_basmati_non basmati,[],,,,,,5.0,118.0,"[3848, 3999, 6239, 6852, 11674, 12595, 12801, ...",,,,
597,597,209,209_businessworld_businessworld online_philipp...,[],,,,,,8.0,107.0,"[1899, 1993, 2837, 4447, 4787, 6321, 6400, 673...",,,,
603,603,215,215_rice_businessworld online_businessworld_on...,[],,,,,,5.0,101.0,"[46, 63, 802, 1585, 3838, 6352, 6534, 7159, 79...",,,,
674,674,286,286_online psei_online shares_psei_businesswor...,[],,,,,,6.0,74.0,"[18, 4454, 5444, 6277, 8508, 8665, 10033, 1009...",,,,
690,690,302,302_peso_online peso_vs dollar_dollar business...,[],,,,,,5.0,70.0,"[2448, 3262, 4820, 9566, 10111, 10203, 10840, ...",,,,


In [210]:
non_india_fin_stories = [77, 102, 129, 193, 209, 215, 286, 302]

In [211]:
non_fin_stories = list(set(exclusion_story_ids) - set(non_india_fin_stories))

#### Marking documents of irrelevant stories

In [212]:
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation

In [213]:
non_india_docs = [x for x in article_story_cluster_mapping if article_story_cluster_mapping[x]['storyline_id'] in non_india_fin_stories]

In [214]:
non_fin_docs = [x for x in article_story_cluster_mapping if article_story_cluster_mapping[x]['storyline_id'] in non_fin_stories]

In [215]:
len(non_india_docs), len(non_fin_docs)

(1066, 0)

##### Non India financial news

In [216]:
with PostgresDatabaseOperation() as cursor:
    sql = """INSERT INTO manual_article_attributes (article_id, financial_news, relevant_for_india)
            VALUES (%s, %s, %s)
            """
    for doc in non_india_docs:
        cursor.execute(sql, (doc, True, False))

##### Non financial India news

In [177]:
with PostgresDatabaseOperation() as cursor:
    sql = """INSERT INTO manual_article_attributes (article_id, financial_news, relevant_for_india)
            VALUES (%s, %s, %s)
            """
    for doc in non_fin_docs:
        cursor.execute(sql, (doc, False, True))

### Saving clustering details to DB

In [461]:
with initialize(config_path="../../conf"):
    # Compose the configuration
    cfg = compose(config_name="ClustersResetService.yaml")

In [462]:
bertopic_model = load_bertopic_model_from_s3(run_id=run_id, hf_embedding_model_name='BAAI/bge-large-en-v1.5')

bertopic streamed successfully


In [463]:
import time

In [464]:
num_docs = len(bertopic_model.topics_)
random_state = 86
model_name = 'BAAI/bge-large-en-v1.5'
embedding_size = 1024

In [469]:
from sql.clustering.ClusteringSQL import ClusteringSQL

In [466]:
def save_clustering_run_details_to_db():
    run_details_dict = dict(cfg)
    run_details_dict['num_docs'] = num_docs
    run_details_dict['random_state'] = random_state
    run_details_dict['run_time'] = time.time()
    run_details_dict['run_id'] = run_id
    run_details_dict['embedding_model_name'] = model_name
    run_details_dict['embedding_size'] = embedding_size

    ClusteringSQL.insert_clustering_run_details(clustering_run_config=run_details_dict,
                                                clustering_run_id=run_id)

In [470]:
save_clustering_run_details_to_db()

In [247]:
from copy import deepcopy

In [471]:
renamed_hierarchy_df = deepcopy(df)
renamed_hierarchy_df = renamed_hierarchy_df.rename(columns={'cluster_id': 'parent_id', 'cluster_name': 'parent_name', 'storylines': 'child_storyline_list',
                                                        'left_child_id': 'child_left_id', 'left_child_name': 'child_left_name',
                                                        'right_child_id': 'child_right_id', 'right_child_name': 'child_right_name'})

In [472]:
ClusteringSQL.save_article_story_cluster_mapping(article_story_cluster_mapping=article_story_cluster_mapping, clustering_run_id=run_id)

KeyboardInterrupt: 

In [None]:
for i, row in renamed_hierarchy_df.head(10).iterrows():
    print(row['parent_id'])

In [None]:
1

In [264]:
cols = []
count = 1
for column in renamed_hierarchy_df.columns:
    if column == 'parent_id':
        cols.append(f'parent_id{count}')
        count+=1
        continue
    cols.append(column)
renamed_hierarchy_df.columns = cols

In [265]:
renamed_hierarchy_df.isnull().mean()

Unnamed: 0              0.000000
parent_id1              0.000000
parent_name             0.000000
child_storyline_list    0.000000
child_left_id           0.501348
child_left_name         0.501348
child_right_id          0.501348
child_right_name        0.501348
distance                0.501348
level                   0.001348
num_docs                0.001348
doc_list                0.001348
parent_id2              0.970350
outlier_count           0.970350
total_count             0.970350
outlier_percent         0.970350
dtype: float64

In [266]:
renamed_hierarchy_df = renamed_hierarchy_df.rename(columns = {'parent_id1': 'parent_id'})

In [272]:
renamed_hierarchy_df.dtypes

Unnamed: 0                int64
parent_id                 int64
parent_name              object
child_storyline_list     object
child_left_id           float64
child_left_name          object
child_right_id          float64
child_right_name         object
distance                float64
level                   float64
num_docs                float64
doc_list                 object
parent_id2              float64
outlier_count           float64
total_count             float64
outlier_percent         float64
dtype: object

In [270]:
import ast

In [271]:
renamed_hierarchy_df['child_storyline_list'] = renamed_hierarchy_df['child_storyline_list'].apply(lambda x: ast.literal_eval(x))

In [279]:
from datetime import datetime

In [288]:
pd.isna(renamed_hierarchy_df.iloc[432].child_left_id)

True

In [291]:
renamed_hierarchy_df.sample(30)

Unnamed: 0.1,Unnamed: 0,parent_id,parent_name,child_storyline_list,child_left_id,child_left_name,child_right_id,child_right_name,distance,level,num_docs,doc_list,parent_id2,outlier_count,total_count,outlier_percent
738,738,367,367_yatharth_yatharth hospital_hospital_hospit...,[],,,,,,10.0,51.0,"[5667, 6897, 10221, 10492, 14911, 16263, 23354...",,,,
380,380,9,9_tcs_infosys_hcl_hcl tech,[],,,,,,13.0,774.0,"[251, 364, 913, 965, 1203, 1293, 1418, 1781, 2...",,,,
662,662,291,291_alibaba_jack ma_ma_jack,[],,,,,,27.0,72.0,"[1401, 1554, 2614, 5215, 5384, 5642, 6497, 713...",,,,
253,253,487,toubro_larsen toubro_larsen_bags_order,"[31, 70]",31.0,toubro_larsen toubro_larsen_finance holdings_t...,70.0,irb_kalpataru_infra_worth rs_nbcc,0.877703,9.0,633.0,"[1, 120, 243, 344, 930, 1171, 1415, 1423, 2119...",,,,
553,553,182,182_formula_regression_variance_statistics,[],,,,,,8.0,120.0,"[64, 719, 2225, 2745, 3287, 3998, 4988, 8528, ...",,,,
40,40,700,innerworth mind_innerworth_mind markets_mind_c...,"[1, 14, 20, 23, 28, 33, 37, 44, 51, 54, 55, 61...",647.0,job_retirement_financial_work_hiring,696.0,innerworth_mind markets_innerworth mind_mind_c...,1.191498,14.0,14136.0,"[779, 2319, 3563, 7157, 19751, 20384, 20682, 2...",,,,
712,712,341,341_titagarh_titagarh rail_rail_rail systems,[],,,,,,10.0,57.0,"[620, 685, 5810, 9585, 12843, 14372, 20309, 22...",,,,
341,341,399,standalone_2023 net_net sales_sales rs_sales,"[0, 40, 43, 65, 76]",65.0,chem_chem standalone_chem consolidated_fert_ch...,385.0,standalone_2023 net_net sales_sales rs_sales,0.707392,14.0,2814.0,"[669, 948, 1057, 1689, 2047, 3350, 5700, 6104,...",,,,
212,212,528,gdp_growth_gdp growth_economy_india gdp,"[16, 247]",247.0,budget_fiscal deficit_deficit_fiscal_budget 2023,16.0,gdp_growth_gdp growth_economy_india gdp,0.91573,19.0,644.0,"[1479, 2683, 3269, 4123, 5445, 5579, 5761, 640...",,,,
296,296,444,ai_openai_chatgpt_altman_generative,"[18, 183]",183.0,openai_altman_sam altman_sam_microsoft,18.0,ai_chatgpt_generative_generative ai_openai,0.820072,12.0,661.0,"[937, 2056, 2783, 3413, 3725, 5869, 6141, 6996...",,,,


In [290]:
renamed_hierarchy_df['child_left_id'] = renamed_hierarchy_df['child_left_id'].apply(lambda x: int(x) if pd.notna(x) else None)
renamed_hierarchy_df['child_right_id'] = renamed_hierarchy_df['child_right_id'].apply(lambda x: int(x) if pd.notna(x) else None)

In [297]:
with PostgresDatabaseOperation() as cursor:
    # first truncate the data in live table
    truncate_sql = f"""
            TRUNCATE TABLE cluster_hierarchy RESTART IDENTITY
    """
    cursor.execute(truncate_sql)

    # Insert the data in live and history tables
    insert_sql = f"""
        INSERT INTO cluster_hierarchy (parent_id, parent_name, child_storyline_list, child_left_id, child_left_name, child_right_id, child_right_name, clustering_run_id, created_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    history_sql = f"""
        INSERT INTO cluster_hierarchy_history (parent_id, parent_name, child_storyline_list, child_left_id, child_left_name, child_right_id, child_right_name, clustering_run_id, created_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    current_time = datetime.now()
    for index, row in renamed_hierarchy_df.iterrows():
        if pd.isna(row['child_left_id']):
            child_left_id = None
            child_right_id = None
        else:
            child_left_id = int(row['child_left_id'])
            child_right_id = int(row['child_right_id'])
        cursor.execute(insert_sql, (row['parent_id'], row['parent_name'], row['child_storyline_list'], child_left_id, row['child_left_name'],
                                        child_right_id, row['child_right_name'], run_id, current_time))
        cursor.execute(history_sql, (row['parent_id'], row['parent_name'], row['child_storyline_list'], child_left_id, row['child_left_name'],
                                         child_right_id, row['child_right_name'], run_id, current_time))

In [277]:
renamed_hierarchy_df.child_left_id.max()

737.0

In [298]:
# ClusteringSQL.insert_cluster_hierarchy(cluster_hierarchy_df=renamed_hierarchy_df,
#                                                clustering_run_id=run_id)

In [300]:
ClusteringSQL.insert_storyline_to_cluster_mapping(story_to_cluster_mapping=story_cluster_mapping, clustering_run_id=run_id)

In [313]:
story_embeddings = load_json_from_s3(run_id = run_id, json_file_name='story_embeddings')

In [315]:
int_story_embeddings = {}

In [316]:
for story in story_embeddings:
    int_story_embeddings[int(story)] = story_embeddings[story]

In [325]:
with PostgresDatabaseOperation() as cursor:
    truncate_sql = f"""
                    TRUNCATE TABLE story_embeddings RESTART IDENTITY
                    """
    cursor.execute(truncate_sql)

    insert_sql = f"""INSERT INTO story_embeddings (story_id, story_embedding, clustering_run_id)
                    VALUES (%s, %s, %s)
                    """
    # TODO: - can't ideally assume that -1 exists. should come from clustering run - refactor
    if -1 in story_embeddings:
        total = len(int_story_embeddings) - 1
    else:
        total = len(int_story_embeddings)
    for i in range(total-1):
        story_embedding = list(int_story_embeddings[i])
        cursor.execute(insert_sql, (i, story_embedding, run_id))

### update topic preferences

In [344]:
from src.topics.TopicClusterMapping import TopicClusterMapping

In [349]:
with initialize(config_path="../../conf"):
    # Compose the configuration
    topic_cfg = compose(config_name="TopicClusterMapping.yaml")

In [350]:
tcm = TopicClusterMapping(clustering_run_id=run_id, bertopic_model=bertopic_model, cfg=topic_cfg)

In [355]:
tcm.recompute_preferences_for_all_topics()

In [363]:
import sys

In [373]:
import pickle

In [375]:
with open('bertopic_model.pkl', 'wb') as f:
    pickle.dump(bertopic_model, f)

In [369]:
sys.getsizeof(bertopic_model.probabilities_)

128

In [376]:
bertopic_model.probabilities_ = None

In [380]:
bertopic_model.topic_embeddings_ = None

In [382]:
bertopic_model.topic_representations_ = None

In [389]:
bertopic_model.topics_ = None

In [392]:
with open('hdbscan_model.pkl', 'wb') as f:
    pickle.dump(bertopic_model.hdbscan_model, f)

In [437]:
with open('umap_model.pkl', 'wb') as f:
    pickle.dump(bertopic_model.umap_model, f)

In [399]:
len(embs[0])

10

In [438]:
k = np.random.rand(1024)

In [425]:
graph = bertopic_model.umap_model.graph_

In [426]:
bertopic_model.umap_model.graph_ = None

In [436]:
bertopic_model.umap_model._raw_data = [bertopic_model.umap_model._raw_data[0]]

In [432]:
bertopic_model.umap_model.disconnection_distance

In [473]:
bertopic_model.topic_embeddings_

array([[ 3.2104e-02,  1.3843e-01, -9.1187e-02, ..., -8.8196e-02,
        -1.5710e-01,  1.4618e-02],
       [-9.1171e-03,  3.4088e-02, -2.2614e-02, ..., -1.0918e-02,
        -5.4871e-02, -1.8738e-02],
       [-8.4496e-04,  1.0614e-01, -1.8978e-03, ..., -2.9007e-02,
        -1.1804e-01, -3.4790e-02],
       ...,
       [-1.1055e-02,  4.6570e-02, -1.7258e-02, ..., -3.8666e-02,
        -2.5864e-02,  7.0632e-05],
       [-4.3671e-02,  1.4014e-01, -4.4983e-02, ..., -5.7098e-02,
        -2.2754e-01, -1.7609e-02],
       [ 4.7821e-02,  7.8735e-02, -3.3630e-02, ..., -5.5969e-02,
        -3.4454e-02, -3.3173e-02]], dtype=float16)