In [1]:
# Presumably the root of the local `arcane` checkout.
# NOTE(review): machine-specific absolute path — adjust before running on another machine.
parent_folder = '/Users/ravi.tej/Desktop/ML/Recommendations/arcane/'
from hydra import compose, initialize
import os

import xml.etree.ElementTree as ET

# Export the variables declared under `configuration/envs` of the run
# configuration XML into this process's environment. The path is relative
# to the notebook's working directory.
tree = ET.parse('../../conf/application.run.xml')
root = tree.getroot()

envs_element = root.find('./configuration/envs')
if envs_element is None:
    # Fail with a clear message instead of an AttributeError on `None.findall`.
    raise ValueError("no 'configuration/envs' element found in '../../conf/application.run.xml'")

for variable in envs_element.findall('env'):
    name = variable.get('name')
    value = variable.get('value')
    if name is None or value is None:
        # os.environ only accepts strings; skip incomplete entries.
        # Only the name is printed because values may be secrets.
        print(f'skipping <env> entry with missing name or value: name={name!r}')
        continue
    os.environ[name] = value

import sys
# Reuse `parent_folder` rather than repeating the literal, and skip the
# append when the entry is already present so re-running the cell does not
# keep growing sys.path.
if parent_folder not in sys.path:
    sys.path.append(parent_folder)

from src._utils import load_bertopic_model_from_hf

In [2]:
import json

In [3]:
import numpy as np
import pandas as pd

#### Reading Mongo Data

In [49]:
# Load the exported article records from the local data directory.
data_folder = "../../data/"
import json
# Decode explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(data_folder + 'insight_db_articles.json', encoding='utf-8') as f:
    insight_articles = json.load(f)

In [50]:
# Index the non-premium articles by their id string.
valid_articles = {}

for article in insight_articles:
    # `== False` is deliberate: records without the flag (or with a null
    # flag) must not be treated as non-premium.
    if article.get('is_premium_article') == False:  # noqa: E712
        article_id = article['_id']['$oid']
        # Store a shallow copy with `source_id` flattened to its id string.
        # The loaded record itself is left untouched, so this cell can be
        # re-run without failing on an already-flattened `source_id`.
        valid_articles[article_id] = {**article, 'source_id': article['source_id']['$oid']}

In [79]:
# Imported in this cell so it also runs on a fresh kernel; the notebook's
# other import of this class lives in a later cell.
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation

# Fetch the distinct article ids present in `llm_article_attributes`.
with PostgresDatabaseOperation() as cursor:
    sql = 'SELECT DISTINCT article_id FROM llm_article_attributes'
    cursor.execute(sql)
    results = cursor.fetchall()

In [82]:
# Draw up to 500 rows from the query result. The sample size is capped at
# the number of available rows (sampling more without replacement raises),
# and the seed is fixed so a re-run selects the same articles.
article_frame = pd.DataFrame(results)
random_articles = article_frame.sample(n=min(500, len(article_frame)), random_state=42)

In [86]:
# Persist the sample to `sample_articles.csv` in the current working directory.
random_articles.to_csv('sample_articles.csv')

In [87]:
!pwd

/Users/ravi.tej/Desktop/ML/Recommendations/arcane/notebooks/articles


#### Getting all article embeddings

In [130]:
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation

In [131]:
# Fetch the distinct article ids that have a row in `embeddings`.
sql = 'SELECT DISTINCT article_id from embeddings'
with PostgresDatabaseOperation() as cursor:
    cursor.execute(sql)
    results = cursor.fetchall()

In [132]:
# Each fetched row holds the selected column at position 0; keep just that value.
db_article_ids = [row[0] for row in results]

#### Finding duplicate articles

In [35]:
# Ids present in `db_article_ids` but absent from `valid_article_ids`.
# NOTE(review): `valid_article_ids` is not assigned in any visible cell —
# presumably the keys of `valid_articles`; confirm before re-running, since
# this list is what gets sent to the deletion endpoint.
duplicate_articles = set(db_article_ids) - set(valid_article_ids)

# Converted to a list so it can be sliced into chunks.
duplicate_articles = list(duplicate_articles)

In [33]:
# Number of ids sent per deletion request.
chunk_size = 1000
# Integer ceiling division: a partially filled last chunk still counts.
num_chunks = (len(duplicate_articles) + chunk_size - 1) // chunk_size
num_chunks

59

#### Deleting the data

In [38]:
import requests

In [41]:
import time

In [43]:
# Collects the response object of every deletion request.
res = []

In [44]:
# Send the ids in `duplicate_articles` to the deletion endpoint, one chunk
# of `chunk_size` ids per request, as a comma-separated string.
# NOTE(review): this is destructive — double-check `duplicate_articles`
# before running.
start_time = time.time()
for i in range(num_chunks):
    cur_chunk = duplicate_articles[i*chunk_size:(i+1)*chunk_size]
    chunk_string = ','.join(cur_chunk)
    r = requests.post(
        'http://Arcane-env.eba-mrsaixmg.ap-south-1.elasticbeanstalk.com/delete_article_ids_from_ml_db',
        json={'articleIds': chunk_string},
        # Without a timeout `requests` waits indefinitely on an unresponsive
        # server; a generous limit turns a silent hang into a visible error.
        timeout=600,
    )
    res.append(r)
    if not r.ok:
        # Surface rejected chunks immediately instead of leaving them
        # buried in `res`.
        print(f'chunk {i+1} failed with HTTP status {r.status_code}')
    if (i+1)%5==0:
        print(f'done with {i+1} in {int(time.time() - start_time)} seconds')

done with 5 in 9 seconds
done with 10 in 20 seconds
done with 15 in 32 seconds
done with 20 in 94 seconds
done with 25 in 128 seconds
done with 30 in 146 seconds
done with 35 in 162 seconds
done with 40 in 198 seconds
done with 45 in 219 seconds
done with 50 in 242 seconds
done with 55 in 259 seconds


#### Creating missing embeddings

In [94]:
# Ids in `valid_article_ids` that do not appear in `db_article_ids`.
# NOTE(review): `valid_article_ids` is not assigned in any visible cell — confirm its origin.
missing_articles = set(valid_article_ids) - set(db_article_ids)

In [95]:
# Materialise the set as a list so it can be sliced into batches.
missing_articles = list(missing_articles)

In [96]:
# Number of ids still without an embedding.
len(missing_articles)

382

In [68]:
def add_missing_article_to_db(article_id, timeout=300):
    """Ask the ML service to create the embedding for one article.

    Args:
        article_id: Id sent to the `create_embedding` endpoint.
        timeout: Seconds passed to `requests.post`; without a limit a stalled
            request would block its worker thread indefinitely.

    Returns:
        `article_id` when the service answers with HTTP 200, otherwise `None`
        (non-200 status, timeout or any other request error).
    """
    try:
        r = requests.post(
            'http://Arcane-env.eba-mrsaixmg.ap-south-1.elasticbeanstalk.com/create_embedding',
            json={'articleId': article_id},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Report and signal failure through the return value so one bad
        # request does not abort a whole batch of concurrent calls.
        print(f'create_embedding request failed for {article_id}: {exc}')
        return None
    if r.status_code == 200:
        return article_id
    return None

In [18]:
from datetime import datetime

from concurrent.futures import ThreadPoolExecutor, as_completed

In [70]:
# Ids whose request succeeded are collected here.
completed_article_ids = []

In [71]:
from tqdm.notebook import tqdm

In [98]:
# Create embeddings for `missing_articles` in batches, issuing up to 10
# concurrent requests inside each batch.
embedding_batch_size = 1000
# Ceiling division: `len(...)//1000 + 1` would schedule an extra, empty
# batch whenever the count is an exact multiple of the batch size (or zero).
num_chunks = -(-len(missing_articles) // embedding_batch_size)
for i in range(num_chunks):
    article_chunk = missing_articles[i*embedding_batch_size:(i+1)*embedding_batch_size]
    print(f'starting creation at {datetime.now():%b %d, %Y %H:%M}')
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(add_missing_article_to_db, art_id) for art_id in article_chunk]

        for future in as_completed(futures):
            try:
                response = future.result()
            except requests.RequestException as exc:
                # A single failed request must not abort the rest of the batch.
                print(f'request failed: {exc}')
                continue
            if response:  # the article id on success, None otherwise
                completed_article_ids.append(response)
    # Report the number of ids actually submitted so far, not a multiple of
    # the batch size.
    processed = min((i + 1) * embedding_batch_size, len(missing_articles))
    print(f'completed chunk {processed} articles at {datetime.now():%b %d, %Y %H:%M} in {int(time.time() - start_time)/60} minutes')

starting creation at Dec 03, 2023 10:17
completed chunk 1000 articles at Dec 03, 2023 10:21 in 3.6666666666666665 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 2000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 3000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 4000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 5000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 6000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 7000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 8000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at Dec 03, 2023 10:21
completed chunk 9000 articles at Dec 03, 2023 10:21 in 0.0 minutes
starting creation at D

#### Missing from cluster_mapping

In [105]:
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation

In [106]:
# Fetch the distinct article ids that have a row in `article_to_cluster_mapping`.
sql = 'SELECT DISTINCT article_id from article_to_cluster_mapping'
with PostgresDatabaseOperation() as cursor:
    cursor.execute(sql)
    results = cursor.fetchall()

In [107]:
# Keep only the first (and only selected) column of every fetched row.
cluster_db_article_ids = [row[0] for row in results]

In [108]:
# Fetch the distinct article ids that have a row in `embeddings`.
sql = 'SELECT DISTINCT article_id from embeddings'
with PostgresDatabaseOperation() as cursor:
    cursor.execute(sql)
    results = cursor.fetchall()

In [109]:
# Keep only the first (and only selected) column of every fetched row.
embeddings_db_article_ids = [row[0] for row in results]

In [110]:
# Ids that have an embedding but no cluster mapping, as a list so they can
# be sliced into batches.
missing_articles = list(set(embeddings_db_article_ids) - set(cluster_db_article_ids))

In [111]:
# Number of ids that have an embedding but no cluster mapping.
len(missing_articles)

11

In [97]:
def assign_missing_article_to_cluster(article_id, timeout=300):
    """Ask the ML service to assign one article to a cluster.

    Args:
        article_id: Id sent to the `assign_cluster` endpoint.
        timeout: Seconds passed to `requests.post`; without a limit a stalled
            request would block its worker thread indefinitely.

    Returns:
        `article_id` when the service answers with HTTP 200, otherwise `None`
        (non-200 status, timeout or any other request error).
    """
    try:
        r = requests.post(
            'http://Arcane-env.eba-mrsaixmg.ap-south-1.elasticbeanstalk.com/assign_cluster',
            json={'articleId': article_id},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Report and signal failure through the return value so one bad
        # request does not abort a whole batch of concurrent calls.
        print(f'assign_cluster request failed for {article_id}: {exc}')
        return None
    if r.status_code == 200:
        return article_id
    return None

In [98]:
# Ceiling division: a plain `len(...)//200 + 1` would schedule one extra,
# empty chunk whenever the count is an exact multiple of 200 (or zero).
num_chunks = -(-len(missing_articles) // 200)

In [99]:
from datetime import datetime
import time
import requests

In [100]:
# Start from an empty list of completed ids.
completed_article_ids = []

In [101]:
# Assign clusters for `missing_articles` in batches of 200, issuing up to 10
# concurrent requests inside each batch. Stepping through the list by batch
# size never produces an empty trailing batch.
cluster_batch_size = 200
for i, batch_start in enumerate(range(0, len(missing_articles), cluster_batch_size)):
    article_chunk = missing_articles[batch_start:batch_start + cluster_batch_size]
    print(f'starting creation at {datetime.now():%b %d, %Y %H:%M}')
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(assign_missing_article_to_cluster, art_id) for art_id in article_chunk]
        for future in as_completed(futures):
            try:
                response = future.result()
            except requests.RequestException as exc:
                # A single failed request must not abort the rest of the batch.
                print(f'request failed: {exc}')
                continue
            if response:  # the article id on success, None otherwise
                # Record successes so failed ids can be worked out afterwards.
                completed_article_ids.append(response)
    # Report the number of ids actually submitted so far, not a multiple of
    # the batch size.
    processed = min((i + 1) * cluster_batch_size, len(missing_articles))
    print(f'completed chunk {processed} articles at {datetime.now():%b %d, %Y %H:%M} in {int(time.time() - start_time)/60} minutes')

starting creation at Dec 10, 2023 10:22
completed chunk 200 articles at Dec 10, 2023 10:42 in 19.95 minutes
starting creation at Dec 10, 2023 10:42
completed chunk 400 articles at Dec 10, 2023 11:02 in 20.05 minutes
starting creation at Dec 10, 2023 11:02
completed chunk 600 articles at Dec 10, 2023 11:22 in 20.05 minutes
starting creation at Dec 10, 2023 11:22
completed chunk 800 articles at Dec 10, 2023 11:42 in 20.05 minutes
starting creation at Dec 10, 2023 11:42
completed chunk 1000 articles at Dec 10, 2023 12:02 in 20.066666666666666 minutes
starting creation at Dec 10, 2023 12:02
completed chunk 1200 articles at Dec 10, 2023 12:22 in 20.066666666666666 minutes
starting creation at Dec 10, 2023 12:22
completed chunk 1400 articles at Dec 10, 2023 12:32 in 9.966666666666667 minutes
starting creation at Dec 10, 2023 12:32
completed chunk 1600 articles at Dec 10, 2023 12:36 in 3.3333333333333335 minutes
starting creation at Dec 10, 2023 12:36
completed chunk 1800 articles at Dec 10, 

In [102]:
# r = requests.post('http://Arcane-env.eba-mrsaixmg.ap-south-1.elasticbeanstalk.com/update_candidates')