### Objective

- Identify what is the best distance metric to find closest story for a given article - by comparing manhattan distance vs cosine similarity

Hypothesis is that cosine similarity might be poorer since the magnitude isn't considered

Findings: TLDR: - Cosine similarity wins

In [1]:
import boto3
import pandas as pd
from bertopic import BERTopic

parent_folder = '/Users/ravi.tej/Desktop/ML/Recommendations/arcane/'
from hydra import compose, initialize
import os

import xml.etree.ElementTree as ET

tree = ET.parse('../../conf/application.run.xml')
root = tree.getroot()

envs_element = root.find('./configuration/envs')
for variable in envs_element.findall('env'):
    name = variable.get('name')
    value = variable.get('value')
    os.environ[name] = value

import sys
sys.path.append('/Users/ravi.tej/Desktop/ML/Recommendations/arcane/')

from src._utils import load_bertopic_model_from_hf

import json
from io import StringIO
import pickle
from smart_open import open
from bertopic.backend._sentencetransformers import SentenceTransformerBackend

In [2]:
def load_bertopic_model_from_s3(run_id: str, hf_embedding_model_name: str) -> BERTopic:
    filename = 'BERTopic_' + run_id
    foldername = run_id
    s3_path = f's3://{s3_bucket}/{s3_bertopic_folder}/{foldername}/{filename}'

    # Stream the model directly from S3
    with open(s3_path, 'rb',
              transport_params={'client': boto3.client('s3',
                                                       aws_access_key_id=os.environ['AWS_ACCESS_KEY'],
                                                       aws_secret_access_key=os.environ['AWS_SECRET_KEY'])}) as f:
        model = pickle.load(f)
    print('bertopic streamed successfully')

    assert isinstance(model, BERTopic), f"Failed to load model from S3. Model {filename} is not of type BERTopic."
    model.embedding_model = SentenceTransformerBackend(embedding_model=hf_embedding_model_name)
    return model

In [3]:
def load_json_from_s3(run_id: str, json_file_name: str) -> BERTopic:
    foldername = run_id
    s3_resource = boto3.resource('s3', aws_access_key_id=os.environ['AWS_ACCESS_KEY'], aws_secret_access_key=os.environ['AWS_SECRET_KEY'])
    response = s3_resource.Object(s3_bucket, f'{s3_bertopic_folder}/{foldername}/{json_file_name}.json').get()
    return json.loads(response['Body'].read().decode('utf-8'))
    # return json.dumps(response)

In [4]:
def load_csv_from_s3(run_id: str, csv_file_name: str) -> BERTopic:
    foldername = run_id
    s3_resource = boto3.resource('s3', aws_access_key_id=os.environ['AWS_ACCESS_KEY'], aws_secret_access_key=os.environ['AWS_SECRET_KEY'])
    response = s3_resource.Object(s3_bucket, f'{s3_bertopic_folder}/{foldername}/{csv_file_name}.csv').get()
    csv_string = response['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(csv_string))
    # return json.dumps(response)

In [8]:
run_id = 'TomatoEmeraldCandle'
s3_bucket = 'insight-ml-models'
s3_bertopic_folder = 'bertopic'

In [9]:
article_story_cluster_mapping = load_json_from_s3(run_id = run_id, json_file_name='article_story_cluster_mapping')
story_cluster_mapping = load_json_from_s3(run_id = run_id, json_file_name='story_cluster_mapping')
story_embeddings = load_json_from_s3(run_id = run_id, json_file_name='story_embeddings')
df = load_csv_from_s3(run_id=run_id, csv_file_name='cluster_hierarchy')

In [10]:
clusters = list(set(story_cluster_mapping.values()))

In [12]:
outliers = {}
for article_id in article_story_cluster_mapping:
    if article_story_cluster_mapping[article_id]['storyline_id'] == -1:
        outliers.setdefault(article_story_cluster_mapping[article_id]['cluster_id'], []).append(article_id)

In [13]:
cluster_count = {}
for cluster in outliers:
    cluster_count[cluster] = len(outliers[cluster])

In [14]:
outliers_count_df = pd.DataFrame([(key, value) for key, value in cluster_count.items()], columns = ['cluster_id', 'outlier_count'])

In [15]:
df['parent_id'] = df['parent_id'].astype('int')

In [16]:
df = pd.merge(df, outliers_count_df, how = 'left', left_on = 'parent_id', right_on = 'cluster_id')

In [17]:
df['total_count'] = df['num_docs'] + df['outlier_count']
df['outlier_percent'] = df['outlier_count']/df['total_count']

In [20]:
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances

In [24]:
import numpy as np

In [130]:
story_names = {int(x['parent_id']): x['parent_name'] for i, x in df[df.child_left_id.isnull()].iterrows()}

### Testing article to story distances

In [83]:
import requests

In [88]:
r = requests.post('http://Arcane-env.eba-mrsaixmg.ap-south-1.elasticbeanstalk.com/get_recent_published_document_ids', json = {'num_days': 1})
recent_articles = json.loads(r.text)['articleIds']

In [157]:
sample_articles = np.random.choice(recent_articles, 30)

In [156]:
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation
from src.articles.ArticleService import ArticleService

#### Getting embeddings

In [92]:
def get_embedding(article_id):
    with PostgresDatabaseOperation() as cursor:
        sql = 'SELECT embedding FROM embeddings WHERE article_id = %s'
        cursor.execute(sql, (article_id,))
        results = cursor.fetchall()
    return results[0][0]

In [158]:
sample_article_embeddings = {x: get_embedding(x) for x in sample_articles}

In [159]:
sample_article_embeddings.keys()

dict_keys(['65791e521678087a992964b2', '657956f85b3cac4e9dcc5be3', '65791e551678087a992964d0', '65791e5b1678087a99296542', '65780583b8b4d445277abb81', '65783d471678087a992963e5', '65791e4f1678087a99296490', '65791e4b1678087a9929646b', '65791ed41678087a992965ec', '657956f85b3cac4e9dcc5be5', '65791e5b1678087a99296537', '65783dbb1678087a9929643d', '65791e4f1678087a99296497', '6578050cb8b4d445277abb32', '65791e521678087a992964ae', '65791ed81678087a99296602', '65791e4d1678087a99296481', '657956855b3cac4e9dcc5b87', '6579568c5b3cac4e9dcc5bb5', '65791e5b1678087a9929653f', '65791e571678087a99296500', '6579568b5b3cac4e9dcc5baa', '65791e461678087a9929645c', '65791e521678087a992964c1', '65791e561678087a992964f4', '65780508b8b4d445277abb12', '65783d4d1678087a992963f4', '65791e551678087a992964d4'])

In [160]:
sample_article_embeddings = {x: [float(x) for x in val[0]] for x, val in sample_article_embeddings.items()}

#### Getting article data

In [161]:
sample_articles_data = {x: ArticleService.get_article_json_from_s3_and_api(x) for x in sample_articles}

#### Calculating distances to stories

In [162]:
story_wise_distances = {x: {} for x in sample_articles}

In [163]:
# calculating the cosine similarity and manhattan distance for all stories against each article except the outlier story
for x in story_wise_distances:
    story_wise_distances[x]['manhattan'] = manhattan_distances(np.array([sample_article_embeddings[x]]), list(story_embeddings.values())[1:])
    story_wise_distances[x]['cosine'] = cosine_similarity(np.array([sample_article_embeddings[x]]), list(story_embeddings.values())[1:])

In [165]:
def get_article_closest_stories(article_id):
    article_data = {'title': sample_articles_data[article_id]['title']}
    article_data['top_manhattan'] = [story_names[x] for x in np.argsort(story_wise_distances[article_id]['manhattan'])[0][:3]]
    article_data['top_cosine'] = [story_names[x] for x in np.argsort(story_wise_distances[article_id]['cosine'])[0][::-1][:3]]
    return article_data

In [166]:
closest_stories = {x: get_article_closest_stories(x) for x in sample_articles}

In [167]:
closest_stories

{'65791e521678087a992964b2': {'title': 'Sbi Life Share Price Live blog for 13 Dec 2023 | Mint',
  'top_manhattan': ['75_dec 2023_dec_05 dec_blog 05',
   '87_moneycontrol selects_selects stories_selects_moneycontrol',
   '217_day morning_started day_scan big_morning scan'],
  'top_cosine': ['75_dec 2023_dec_05 dec_blog 05',
   '234_mint sbi_sbi_sbi life_sbi closed',
   '2_live blog_blog_price live_2023 mint']},
 '657956f85b3cac4e9dcc5be3': {'title': "IMF board clears first review of Bangladesh's $4.7 billion bailout - BusinessWorld Online",
  'top_manhattan': ['87_moneycontrol selects_selects stories_selects_moneycontrol',
   '160_exam_result_result 2023_class 10',
   '217_day morning_started day_scan big_morning scan'],
  'top_cosine': ['16_gdp_growth_gdp growth_economy',
   '90_bank england_england_ecb_central bank',
   '110_cop28_climate_fossil_fossil fuel']},
 '65791e551678087a992964d0': {'title': "GTA Online's Chop Shop Update: Check out new vehicles",
  'top_manhattan': ['180_auto

In [181]:
def get_approximate_cluster_details_for_article(article_embeddings):
    # calculating cosine similarity for all stories except outlier
    cluster_details = {}
    sim_matrix = cosine_similarity([article_embeddings], list(story_embeddings.values())[1:])
    cluster_details['storyline_id'] = np.argmax(sim_matrix)  # doing -1 for the outlier
    cluster_details['cluster_id'] = story_cluster_mapping[str(cluster_details['storyline_id'])]
    return cluster_details

In [179]:
from sql.clustering.ClusteringSQL import ClusteringSQL

In [173]:
sample_articles[0]

'65791e521678087a992964b2'

In [178]:
story_cluster_mapping

{'11': 736,
 '24': 736,
 '32': 736,
 '45': 736,
 '58': 736,
 '64': 736,
 '69': 736,
 '80': 736,
 '90': 736,
 '96': 736,
 '113': 736,
 '114': 736,
 '117': 736,
 '131': 736,
 '132': 736,
 '143': 736,
 '153': 736,
 '177': 736,
 '181': 736,
 '196': 736,
 '211': 736,
 '223': 736,
 '224': 736,
 '233': 736,
 '248': 736,
 '254': 736,
 '271': 736,
 '276': 736,
 '295': 736,
 '315': 736,
 '333': 736,
 '346': 736,
 '354': 736,
 '357': 736,
 '369': 736,
 '13': 729,
 '22': 729,
 '39': 729,
 '47': 729,
 '66': 729,
 '92': 729,
 '93': 729,
 '111': 729,
 '182': 729,
 '195': 729,
 '221': 729,
 '228': 729,
 '284': 729,
 '328': 729,
 '2': 727,
 '3': 727,
 '6': 727,
 '38': 727,
 '41': 727,
 '50': 727,
 '56': 727,
 '60': 727,
 '72': 727,
 '75': 727,
 '81': 727,
 '101': 727,
 '121': 727,
 '125': 727,
 '126': 727,
 '134': 727,
 '136': 727,
 '158': 727,
 '159': 727,
 '178': 727,
 '179': 727,
 '185': 727,
 '189': 727,
 '191': 727,
 '207': 727,
 '216': 727,
 '232': 727,
 '234': 727,
 '235': 727,
 '238': 727,
 '26

In [184]:
def save_cluster_details_for_article_id_to_db(article_id, article_dict: dict, clustering_run_id: str):

    with PostgresDatabaseOperation() as cursor:
        # first truncate the data in live table
        insert_sql = f"""
                        INSERT INTO article_to_cluster_mapping 
                         (article_id, storyline_id, cluster_id, clustering_run_id, storyline_prob, story_cluster_id, max_agg_cluster_id, agg_cluster_prob)
                         VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    """
        cursor.execute(insert_sql, (
            article_id,
            int(article_dict['storyline_id']),
            int(article_dict['cluster_id']),
            clustering_run_id,
            article_dict.get('storyline_prob', None),
            article_dict.get('story_cluster_id', None),
            article_dict.get('max_agg_cluster_id', None),
            article_dict.get('agg_cluster_prob', None)
        ))
    return

In [177]:
np.argmax(cosine_similarity([sample_article_embeddings['65791e521678087a992964b2']], list(story_embeddings.values())[1:]))

75

In [185]:
cluster_details = get_approximate_cluster_details_for_article(sample_article_embeddings['65791e521678087a992964b2'])

In [186]:
run_id

'TomatoEmeraldCandle'

In [188]:
from src._utils import load_bertopic_model_from_hf

In [190]:
hf_model = load_bertopic_model_from_hf(run_id = run_id)

topics.json:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

topic_embeddings.safetensors:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

ctfidf_config.json:   0%|          | 0.00/16.0M [00:00<?, ?B/s]

ctfidf.safetensors:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

In [193]:
hf_model.find_topics('sales', top_n=30)

([79,
  285,
  -1,
  208,
  76,
  0,
  142,
  43,
  83,
  40,
  288,
  149,
  215,
  88,
  278,
  222,
  65,
  267,
  12,
  118,
  180,
  157,
  163,
  259,
  120,
  344,
  147,
  329,
  3,
  209],
 [0.68069476,
  0.66753304,
  0.66737235,
  0.6551521,
  0.65170205,
  0.6503667,
  0.6493062,
  0.64658105,
  0.645731,
  0.6434041,
  0.64321506,
  0.64306605,
  0.6429286,
  0.6418499,
  0.63972217,
  0.6391914,
  0.63761747,
  0.63719004,
  0.63717055,
  0.6369741,
  0.63653916,
  0.6363202,
  0.63521266,
  0.6328662,
  0.6315385,
  0.6314287,
  0.6300843,
  0.627578,
  0.62686884,
  0.62647474])

In [187]:
save_cluster_details_for_article_id_to_db(article_id='65791e521678087a992964b2',article_dict=cluster_details, clustering_run_id=run_id)

### Conclusion

- By observing the results, it is clear that cosine similarity is a much better measure of finding the closest story