In [1]:
parent_folder = '/Users/username/Desktop/ML/Recommendations/arcane/'
from hydra import compose, initialize
import os

import xml.etree.ElementTree as ET

tree = ET.parse('../../conf/application.run.xml')
root = tree.getroot()

envs_element = root.find('./configuration/envs')
for variable in envs_element.findall('env'):
    name = variable.get('name')
    value = variable.get('value')
    os.environ[name] = value

import sys
sys.path.append('/Users/username/Desktop/ML/Recommendations/arcane/')

from src._utils import load_bertopic_model_from_hf

### Getting candidate embeddings

In [19]:
import requests
import json
import numpy as np
import pandas as pd

In [5]:
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation

In [23]:
with PostgresDatabaseOperation() as cursor:
    sql = '''SELECT DISTINCT c.article_id, e.embedding 
    FROM candidate_articles c INNER JOIN embeddings e
    ON c.article_id = e.article_id
    '''
    cursor.execute(sql)
    results = cursor.fetchall()

In [24]:
df = pd.DataFrame(results)

In [26]:
embeddings = {}

In [27]:
for res in results:
    embeddings[res[0]] = [np.float16(x) for x in json.loads(res[1])]

In [36]:
# SSH tunnel configuration
SSH_HOST=''
SSH_USER='ec2-user'
SSH_KEY_PATH=''
LOCAL_BIND_PORT=3010

# MongoDB server configuration
MONGO_HOST='insight-beta-article-db2.chmaiagjipqy.ap-south-1.docdb.amazonaws.com'
MONGO_PORT=27017
MONGO_USERNAME=''
MONGO_PASSWORD=''


MONGO_DB_NAME='insight_db'
MONGO_COLLECTION_NAME='articles'

# test environment
ENVIRONMENT='local'

In [37]:
DB_USERNAME = MONGO_USERNAME
DB_PASSWORD=MONGO_PASSWORD

In [38]:

import os
from pymongo import MongoClient
from sshtunnel import SSHTunnelForwarder

# # SSH tunnel configuration
# SSH_HOST = os.environ.get('SSH_HOST')
# SSH_USER = os.environ.get('SSH_USER')
# SSH_KEY_PATH = os.environ.get('SSH_KEY_PATH')
# LOCAL_BIND_PORT = int(os.environ.get('LOCAL_BIND_PORT', 3010))


# # MongoDB server configuration
# MONGO_HOST = os.environ.get('MONGO_HOST')
# MONGO_PORT = int(os.environ.get('MONGO_PORT', 27017))
# DB_USERNAME = os.environ.get('MONGO_USERNAME')
# DB_PASSWORD = os.environ.get('MONGO_PASSWORD')

# db parameters dict
DB_PARAMS = {
    "host": '127.0.0.1',
    "port": LOCAL_BIND_PORT,
    "username": DB_USERNAME,
    "password": DB_PASSWORD,
}


class MongoDatabaseConnection:
    _instance = None
    _client = None
    _tunnel = None

    def __new__(cls):
        if cls._instance is None:
            try:
                cls._instance = super(MongoDatabaseConnection, cls).__new__(cls)

                if os.environ.get('ENVIRONMENT') == 'local':
                    # initiate mongo client via ssh tunneling
                    cls._instance._client = cls._connect_to_mongodb_using_ssh()
                else:
                    # initiate mongo client
                    cls._instance._client = cls._connect_to_mongodb()
            except Exception as e:
                raise Exception(f"Failed to connect to MongoDB: {e}")

        return cls._instance

    @classmethod
    def get_client(cls):
        return cls._instance._client

    @classmethod
    def close_connection(cls):
        if cls._instance._client:
            cls._instance._client.close()
            cls._instance._client = None

        if cls._tunnel:
            cls._tunnel.stop()
            cls._tunnel = None

    @classmethod
    def _establish_tunnel(cls):
        try:
            tunnel = SSHTunnelForwarder(
                (SSH_HOST, 22),
                ssh_username=SSH_USER,
                ssh_pkey=SSH_KEY_PATH,
                remote_bind_address=(MONGO_HOST, MONGO_PORT),
                local_bind_address=('127.0.0.1', LOCAL_BIND_PORT)
            )

            return tunnel
        except Exception as e:
            raise Exception(f"Failed to establish SSH tunnel: {e}")

    @classmethod
    def _connect_to_mongodb(cls):
        try:
            return MongoClient(**DB_PARAMS)
        except ConnectionError as mongo_error:
            raise Exception(f"Failed to connect to MongoDB Connection: {mongo_error}")

    @classmethod
    def _connect_to_mongodb_using_ssh(cls):
        try:
            # start the tunnel
            cls._tunnel = cls._establish_tunnel()
            cls._tunnel.start()

            return MongoClient(
                directConnection=True,
                **DB_PARAMS
            )
        except ConnectionError as mongo_error:
            raise Exception(f"Failed to connect to MongoDB Connection: {mongo_error}")

In [39]:
from bson import ObjectId

In [40]:
class MongoDBArticle:

    @staticmethod
    def get_db():
        mongo_conn = None
        try:
            # setup mongo connection
            mongo_conn = MongoDatabaseConnection()
            mongo_client = mongo_conn.get_client()
            db = mongo_client[MONGO_DB_NAME]
            return db
        except Exception as e:
            if mongo_conn:
                mongo_conn.close_connection()
            raise Exception(f"Error in connecting to db: {e}")

    @staticmethod
    def get_collection(collection_name=None):
        try:
            db = MongoDBArticle.get_db()
            if collection_name is None:
                return db[MONGO_COLLECTION_NAME]
            else:
                return db[collection_name]
        except Exception as e:
            raise Exception(f"Error in _get_collection: {e}")

    @staticmethod
    def fetch_documents_by_ids(string_ids, max_published_date):
        try:
            object_ids = [ObjectId(string_id) for string_id in string_ids]
        except Exception as e:
            return f"Invalid ID format: {e}"

        collection = MongoDBArticle.get_collection()
        documents =  list(collection.find({"_id": {"$in": object_ids}, "published_time": {"$lt": max_published_date}}))
        doc_dict = {}
        for doc in documents:
            article_id = str(doc.pop('_id'))
            doc_dict[article_id] = doc
            doc_dict[article_id]['article_id'] = article_id
            doc_dict[article_id]['source_id'] = str(doc['source_id'])
            doc_dict[article_id]['is_premium_article'] = False
        return doc_dict
    
    @staticmethod
    def fetch_all_documents():
        query = {"is_premium_article": False}
        # projection = {"_id": 1}; , projection

        collection = MongoDBArticle.get_collection()
        documents = list(collection.find(query))
        doc_dict = {}
        for doc in documents:
            article_id = str(doc.pop('_id'))
            doc_dict[article_id] = doc
            doc_dict[article_id]['article_id'] = article_id
        return doc_dict
    
    @staticmethod
    def fetch_all_llm_responses():
        query = {"response": {"$exists": True}}
        # projection = {"_id": 1}; , projection

        collection = MongoDBArticle.get_collection()
        documents = list(collection.find(query))
        doc_dict = {}
        for doc in documents:
            _id = str(doc.pop('_id'))
            doc_dict[article_id] = doc['article_id']
            doc_dict[article_id] = doc
            doc_dict[article_id]['_id'] = _id
            try:
                binary_data = base64.b64decode(doc['response'])
                parsed_response = json.loads(json.loads(binary_data)[0]['generated_text'])
            except:
                parsed_response = {}
                parsed_response['top_categories'] = ''
            doc_dict['parsed_response'] = parsed_response
        return doc_dict

In [33]:
candidate_ids = list(embeddings.keys())

In [42]:
candidate_docs = MongoDBArticle.fetch_documents_by_ids(candidate_ids,max_published_date='2025-01-01')

In [71]:
candidate_articles = {k: Article.from_dict(v).full_content for k, v in candidate_docs.items()}

candidate_articles = {k: v for k, v in candidate_articles.items() if len(v.split(' ')) > 50}

In [74]:
candidate_article_ids = list(candidate_articles.keys())

In [139]:
candidate_article_objs = [Article.from_dict(v) for k, v in candidate_docs.items() if k in candidate_article_ids]

In [75]:
candidate_articles = [candidate_articles[k] for k in candidate_article_ids]

In [95]:
candidate_embeddings = [embeddings[k] for k in candidate_article_ids]

In [76]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

In [138]:
custom_umap_model = UMAP(n_neighbors=32,
                                 n_components=8,
                                 metric='cosine',
                                 random_state=86)
custom_hdbscan_model = HDBSCAN(min_cluster_size=2,
                               min_samples=2,
                               metric='manhattan',
                               prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english",
                                   max_df=0.7,
                                   ngram_range=(1, 2))
bertopic_model = BERTopic(umap_model=custom_umap_model,
                          hdbscan_model=custom_hdbscan_model,
                          vectorizer_model=vectorizer_model,
                          embedding_model=SentenceTransformer('BAAI/bge-large-en-v1.5'),
                          verbose=True,
                          calculate_probabilities=True)

In [96]:
import time

In [98]:
start_time = time.time()
bertopic_model.fit_transform(documents=candidate_articles, embeddings=np.array(candidate_embeddings))
print(f'fitting done in {time.time() - start_time} seconds')
bertopic_model.generate_topic_labels(nr_words=5, separator=", ")
print(f'labels generated in {time.time() - start_time} seconds')

2024-01-31 13:20:12,542 - BERTopic - Reduced dimensionality
2024-01-31 13:32:38,811 - BERTopic - Clustered reduced embeddings


fitting done in 800.805055141449 seconds
labels generated in 800.8072772026062 seconds


In [99]:
from src.clustering.ClusterHierarchyService import ClusterHierarchyService

In [105]:
with initialize(config_path="../../conf"):
    # Compose the configuration
    cfg = compose(config_name="ClustersResetService.yaml")

In [106]:
cluster_hierarchy = ClusterHierarchyService(bertopic_model=bertopic_model, documents=candidate_articles, cfg=cfg)

100%|██████████| 955/955 [00:24<00:00, 38.23it/s]


In [107]:
cluster_hierarchy.setup()

In [112]:
def assign_articles_to_clusters():
    article_story_mapping = bertopic_model.topics_
    article_story_probs = bertopic_model.probabilities_

    article_story_cluster_mapping = {}
    for article_idx, story_id in enumerate(article_story_mapping):
        article_dict = assign_article_to_cluster(article_story_id=story_id,
                                                 article_story_probs=np.array(article_story_probs[article_idx]),
                                                 story_cluster_map=cluster_hierarchy.story_cluster_map)

        article_id = candidate_article_ids[article_idx]
        article_story_cluster_mapping[article_id] = article_dict

    return article_story_cluster_mapping

In [113]:
from src.clustering._utils import assign_article_to_cluster

In [114]:
article_story_cluster_map = assign_articles_to_clusters()

In [117]:
df = cluster_hierarchy.hierarchy_df

In [137]:
len([x for x in article_story_cluster_map if article_story_cluster_map[x]['storyline_id'] == -1])

5226

In [133]:
df[df.left_child_id.isnull()].num_docs.describe([0.8,0.9,0.95])

count    956.000000
mean       9.712343
std       15.311218
min        3.000000
50%        5.000000
80%       11.000000
90%       19.000000
95%       29.000000
max      228.000000
Name: num_docs, dtype: float64

In [134]:
df[df.left_child_id.isnull()].tail(20)

Unnamed: 0,cluster_id,cluster_name,storylines,left_child_id,left_child_name,right_child_id,right_child_name,distance,level,num_docs,doc_list
1892,936,936_johnson_jnj_johnson johnson_wolk,[],,,,,,18.0,3.0,"[2564, 9191, 14156]"
1893,937,937_brexit_eu_britains_trade deal,[],,,,,,18.0,3.0,"[2557, 10933, 14218]"
1894,938,938_jio_jio financial_blackrock_principle appr...,[],,,,,,15.0,3.0,"[7288, 8226, 10750]"
1895,939,939_home health_hospital_health care_care,[],,,,,,16.0,3.0,"[7339, 8723, 10945]"
1896,940,940_macfadyen_braun_tom_greg,[],,,,,,25.0,3.0,"[7356, 8553, 9861]"
1897,941,941_bail_court_statutory bail_25 cases,[],,,,,,14.0,3.0,"[7381, 7775, 14384]"
1898,942,942_steel_chandrasekaran_steel prices_tata sons,[],,,,,,22.0,3.0,"[7406, 14182, 14266]"
1899,943,943_enset_heat_borrell_ethiopia,[],,,,,,11.0,3.0,"[2462, 2680, 5962]"
1900,944,944_crypto_cryptocurrency_investing crypto_giv...,[],,,,,,12.0,3.0,"[2413, 3024, 3425]"
1901,945,945_rule_new rule_workers_business groups,[],,,,,,19.0,3.0,"[7484, 10548, 12643]"


In [120]:
article_story_cluster_map

{'651dc434a662d76276b7b11f': {'storyline_id': 175,
  'storyline_prob': 1.0,
  'story_cluster_id': 1910,
  'max_agg_cluster_id': 1910,
  'agg_cluster_prob': 1.0,
  'cluster_id': 1910},
 '651dc49da662d76276b7b21f': {'storyline_id': -1,
  'storyline_prob': 0.7328297297692198,
  'story_cluster_id': -1,
  'max_agg_cluster_id': 1910,
  'agg_cluster_prob': 0.2671702702307802,
  'cluster_id': 1910},
 '651dc587a662d76276b7b454': {'storyline_id': -1,
  'storyline_prob': 0.021289691106756492,
  'story_cluster_id': -1,
  'max_agg_cluster_id': 1910,
  'agg_cluster_prob': 0.9787103088932435,
  'cluster_id': 1910},
 '651dc5bea662d76276b7b4d8': {'storyline_id': -1,
  'storyline_prob': 0.5872315089507824,
  'story_cluster_id': -1,
  'max_agg_cluster_id': 1910,
  'agg_cluster_prob': 0.4127684910492177,
  'cluster_id': 1910},
 '651dc60aa662d76276b7b596': {'storyline_id': 762,
  'storyline_prob': 1.0,
  'story_cluster_id': 1910,
  'max_agg_cluster_id': 1910,
  'agg_cluster_prob': 1.0,
  'cluster_id': 1910

In [130]:
gpt_articles = [candidate_article_ids[x] for x in df[df.left_child_id.isnull()].loc[959].doc_list]

In [144]:
candidate_article_objs[0]

Article(article_id='651dc434a662d76276b7b11f', title=' India has emerged as voice of Global South : G20 Sherpa Amitabh Kant underscores India narrative', is_premium_article=False, published_time='2023-09-04T13:46:00+05:30', source_id='6512cdcad01a9c8e86263e05', url='https://economictimes.indiatimes.com/news/india/india-has-emerged-as-voice-of-global-south-g20-sherpa-amitabh-kant-underscores-india-narrative/articleshow/103351034.cms', cleaned_text="India's G20 Sherpa Amitabh Kant has underscored the  unique Indian narrative,  India has set during its presidency of the G20 Summit, saying India  truly emerged as the voice of the Global South .In a short video message, shared on X, by the official handle of India's G20 Presidency, Kant said,  So India has built a unique Indian narrative during its G20 presidency.  India has truly emerged as the voice of the Global South during its G20 presidency and this is the first time that four emerging markets, one after another Indonesia, India, Braz

In [143]:
candidate_article_objs[0].published_time

'2023-09-04T13:46:00+05:30'

In [131]:
[candidate_articles[x] for x in df[df.left_child_id.isnull()].loc[959].doc_list]

['headline: AI cant substitute human intelligence in adjudication: Delhi HC.  content: Artificial intelligence (AI) can substitute neither the human intelligence nor the humane element in the adjudicatory process the Delhi High Court has held and said ChatGPT cant be the basis of adjudication of legal or factual issues in a court of law. Justice Prathiba M Singh stated that the accuracy and reliability of AI generated data is still in the grey area and at best such a tool can be utilised for a preliminary understanding or for preliminary research.  The courts observations came while dealing with a lawsuit by luxury brand Christian Louboutin against a partnership firm involved in the manufacture and sale of shoes allegedly in violation of its trademark.  The counsel for the plaintiff submitted that  Red Sole Shoe  was its registered trademark in India and placed before court responses by ChatGPT with respect to its  reputation .   The said tool (ChatGPT) cannot be the basis of adjudicat

In [145]:
df = pd.DataFrame([(1),(2),(10)])

In [149]:
df[1] = df[0].apply(lambda x: 5 if x < 2 or x > 2 else x)

In [150]:
df

Unnamed: 0,0,1
0,1,5
1,2,2
2,10,5


In [151]:
import requests

In [154]:
r = requests.post('http://Arcane-env.eba-mrsaixmg.ap-south-1.elasticbeanstalk.com/get_recent_published_document_ids',data = {'num_days':1})

In [155]:
r.text

'<html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n<hr><center>nginx</center>\r\n</body>\r\n</html>\r\n'

In [81]:
!CT_METAL=1 pip install ctransformers --no-binary ctransformers

Collecting ctransformers
  Downloading ctransformers-0.2.27.tar.gz (376 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.1/376.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting py-cpuinfo<10.0.0,>=9.0.0 (from ctransformers)
  Using cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Building wheels for collected packages: ctransformers
  Building wheel for ctransformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for ctransformers: filename=ctransformers-0.2.27-cp311-cp311-macosx_14_0_arm64.whl size=513129 sha256=b715da3a2d33c4cd508ee4073235d4319870b0065d34b8a0b937f1402db2b47f
  Stored in directory: /Users/username/Library/Caches/pip/wheels/97/4a/52/527e0ba9715318161f0f7a69f5e60d53b4a1afcd62e61d3347
Successfully built ctransformers
Installing collected packages: 

In [83]:
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline

In [92]:
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
    model_file="openhermes-2.5-mistral-7b.Q5_K_M.gguf",
    model_type="mistral",
    gpu_layers=50,
    hf=True
)
tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", trust_remote_code=True)

# Pipeline
generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=300,
    repetition_penalty=1.1
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [94]:
generator('tell me a joke')

[{'generated_text': 'tell me a joke.\n\nI’m not sure I have any good ones, but here goes: Why don’t scientists trust atoms? Because they make up everything!\n\nWhat is your favorite thing about being an author?\n\nMy favorite thing about being an author is the ability to create new worlds and characters that can inspire and entertain readers. It’s also incredibly rewarding to hear from fans who have connected with my stories in some way.\n\nWhat is your least favorite part of writing?\n\nMy least favorite part of writing is probably the self-doubt and frustration that can come with the process. It can be tough to stay motivated and confident when you’re facing rejection or struggling with writer’s block.\n\nWhat advice would you give to someone who wants to become an author?\n\nMy advice for aspiring authors would be to read widely, write consistently, and seek out feedback and criticism. Don’t be afraid to fail or make mistakes – every successful writer has faced setbacks along the wa