## Defining sagemaker roles

In [1]:
import boto3
iam = boto3.client('iam')
response = iam.list_roles()
sagemaker_roles = [role for role in response['Roles'] if 'SageMaker' in role['RoleName']]
sagemaker_roles[0]['RoleName']

'AmazonSageMaker-ExecutionRole-20231030T210397'

In [2]:
import sagemaker
import boto3
sess = sagemaker.Session()
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20231030T210397')['Role']['Arn']
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml


Couldn't call 'get_role' to get Role ARN from role name username to get Role path.


sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::005418323977:role/service-role/AmazonSageMaker-ExecutionRole-20231030T210397
sagemaker bucket: sagemaker-ap-south-1-005418323977
sagemaker session region: ap-south-1


In [3]:
import pandas as pd
import numpy as np
import json

import re
from transformers import AutoTokenizer
from random import randint
import sys
sys.path.append("../utils")
from pack_dataset import pack_dataset
from datasets import Dataset

import requests

## Data Prep

### Loading the Dataset

### Defining the model

In [4]:
model_id = "teknium/OpenHermes-2.5-Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### GPT responses EDA

In [5]:
df = pd.read_csv('gpt-4-responses-1-dec-with-headers.csv')

In [6]:
df = df.drop(columns = {'0', '1', 'has_text', 'json_generated'})

In [7]:
df.prompt.nunique()

1

In [8]:
df.head(3)

Unnamed: 0,prompt,full_content,model,attributes,prompt_tokens,completion_tokens,article_id
0,\nYou are the chief editor for a leading India...,Amazon workers walk out over lack of trust in ...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Ar...",2304,458,6555c8194b13023f9349092e
1,\nYou are the chief editor for a leading India...,Laid off by Qualcomm Indian techie on H1B visa...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",1638,470,653169231e5cc42b1b13d2be
2,\nYou are the chief editor for a leading India...,Best Places to Exchange Currency in Los Angele...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",2488,484,6555cc364b13023f934a655c


In [9]:
import ast
res = []
for i in range(len(df)):
    try:
        res.append(json.loads(df.iloc[i]['attributes'])['article_validity_duration'])
    except:
        res.append(ast.literal_eval(json.loads(df.iloc[i]['attributes']))['article_validity_duration'])

In [10]:
df['validity'] = res

In [11]:
df.groupby('validity')['validity'].count()

validity
-1    150
1     195
3     175
7     297
14    110
30    303
90      2
-1     37
1      39
14     32
3      37
30     78
7      60
90      2
Name: validity, dtype: int64

In [15]:
# train_set_df = df.sample(frac = 0.85)
# train_set_df.to_csv('train-set-gpt-4-responses-1-dec.csv', index = False)
# test_set_df = df[~df.article_id.isin(train_set.article_id.unique())]
# test_set_df.to_csv('test-set-gpt-4-responses-1-dec.csv', index = False)

In [12]:
train_set_df = pd.read_csv('train-set-gpt-4-responses-1-dec.csv')

In [13]:
train_set_df

Unnamed: 0,prompt,full_content,model,attributes,prompt_tokens,completion_tokens,article_id,validity
0,\nYou are the chief editor for a leading India...,Tata Motors cars to get costlier from today. D...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",1486,517,65316c4d1e5cc42b1b13e030,3
1,\nYou are the chief editor for a leading India...,Dont have a PAN or Aadhaar number You cannot c...,gpt-4-0613,"{""analysis_of_financial_or_business_news"": ""Th...",1580,500,653169051e5cc42b1b13c9ce,30
2,\nYou are the chief editor for a leading India...,Air services from Kanpur to Delhi will begin s...,gpt-4-0613,"{""analysis_of_financial_or_business_news"": ""Ar...",1545,513,651e12bca662d76276b878ec,30
3,\nYou are the chief editor for a leading India...,Cholamandalam Finance April 2023 NCD Public Is...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",2963,618,652fd3151e5cc42b1b13a507,30
4,\nYou are the chief editor for a leading India...,Survey Shows Real Estate Is Americans Favorite...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",1361,529,6555cc9e4b13023f934aaf30,30
...,...,...,...,...,...,...,...,...
1284,\nYou are the chief editor for a leading India...,Why Would a Company Perform a Reverse Stock Sp...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",2260,484,6555c81b4b13023f93490a5f,-1
1285,\nYou are the chief editor for a leading India...,How To Open Fixed Deposit Account In Post Offi...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",2663,500,652fc74a1e5cc42b1b139f64,30
1286,\nYou are the chief editor for a leading India...,Pizza Hut to continue aggressive expansion spr...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",1950,530,651de972a662d76276b81416,30
1287,\nYou are the chief editor for a leading India...,Key considerations to know in real estate inve...,gpt-4-0613,"{""analysis_is_financial_or_business_news"": ""Th...",2025,475,6555c0904b13023f9348cbde,30


In [14]:
train_set_df['completion_tokens'].describe()

count    1289.000000
mean      506.780450
std        47.484215
min       327.000000
25%       475.000000
50%       504.000000
75%       537.000000
max       675.000000
Name: completion_tokens, dtype: float64

In [16]:
# structuring the format
train_set = {}
for row in train_set_df.itertuples():
    res = row.attributes
    train_set[row.article_id] = {'content': row.full_content, 'response': {}}
    train_set[row.article_id]['attributes'] = json.loads(res)
    # train_set[art_id]['response']['summaries'] = res[1]

In [17]:
assert train_set_df.article_id.nunique() == len(train_set.keys())

In [18]:
def correct_validity_duration(val):
    val = int(val)
    valid_days = [-1, 1, 3, 7, 14, 30]
    if val in valid_days:
        return val
    else:
        for i in valid_days:
            if val > i:
                valid_value = i
        return valid_value

In [19]:
from copy import deepcopy

In [37]:
expected_dict_structure = expected_dict_structure = {"analysis_of_financial_or_business_news": "",
"financial_or_business_news": "",
"analysis_of_relevant_for_india": "",
"relevant_for_india": "",
"analysis_of_article_validity_duration": "",
"article_validity_duration": "",
"analysis_of_popularity": "",
"popularity": "",
"analysis_of_article_type": "",
"article_type": "",
"analysis_of_article_sentiment": "",
"article_sentiment": "",
"headline_suggestion": "",
"first_attempt_summary": "",
"improved_summary": "",
"final_summary": "",
"top_categories": ""}
expected_keys = sorted(list(expected_dict_structure.keys()))

### Cleaning the responses from GPT

In [38]:
malformed = []
cleaned_train_set = {}
for art_id in train_set:
    cleaned_train_set[art_id] = {}
    try:
        attributes = train_set[art_id]['attributes']
        if 'analysis_is_financial_or_business_news' in attributes:
            attributes['analysis_of_financial_or_business_news'] = attributes['analysis_is_financial_or_business_news']
            attributes.pop('analysis_is_financial_or_business_news')
        if 'is_financial_or_business_news' in attributes:
            attributes['financial_or_business_news'] = attributes['is_financial_or_business_news']
            attributes.pop('is_financial_or_business_news')
        keys = sorted(list(attributes.keys()))
        if not keys == expected_keys:
            malformed.append(art_id)
            cleaned_train_set.pop(art_id)
            continue
        cleaned_train_set[art_id] = {'content': train_set[art_id]['content'], 'attributes': deepcopy(attributes)}
        cleaned_train_set[art_id]['attributes']['financial_or_business_news'] = True if bool(attributes['financial_or_business_news']) == 1 else False if bool(attributes['financial_or_business_news']) == 0 else None
        cleaned_train_set[art_id]['attributes']['relevant_for_india'] = True if bool(attributes['relevant_for_india']) == 1 else False if bool(attributes['relevant_for_india']) == 0 else None
        cleaned_train_set[art_id]['attributes']['article_validity_duration'] = correct_validity_duration(attributes['article_validity_duration'])
    except:
        malformed.append(art_id)
        cleaned_train_set.pop(art_id)
        continue

In [28]:
art_id

'651e04b2a662d76276b85800'

In [34]:
set(expected_keys) - set(sorted(list(train_set['651e04b2a662d76276b85800']['attributes'].keys())))

{'analysis_is_financial_or_business_news', 'is_financial_or_business_news'}

In [39]:
set([cleaned_train_set[art_id]['attributes']['financial_or_business_news'] for art_id in cleaned_train_set])

{False, True}

In [40]:
set([cleaned_train_set[art_id]['attributes']['relevant_for_india'] for art_id in cleaned_train_set])

{False, True}

In [41]:
set([cleaned_train_set[art_id]['attributes']['article_validity_duration'] for art_id in cleaned_train_set])

{-1, 1, 3, 7, 14, 30}

In [42]:
set([cleaned_train_set[art_id]['attributes']['article_sentiment'] for art_id in cleaned_train_set])

{'NA', 'bear', 'bearish', 'bull'}

In [43]:
set([cleaned_train_set[art_id]['attributes']['article_type'] for art_id in cleaned_train_set])

{'Fact', 'analysis', 'educational', 'fact', 'factual', 'opinion', 'sponsored'}

In [44]:
set([cleaned_train_set[art_id]['attributes']['popularity'] for art_id in cleaned_train_set])

{'Breaking_news', 'breaking_news', 'moderately_popular', 'niche'}

### casting incorrect values

In [45]:
for art_id in cleaned_train_set:
    attributes = cleaned_train_set[art_id]['attributes']
    for key in ['article_sentiment', 'article_type', 'popularity']:
        attributes[key] = attributes[key].lower()
        if attributes['article_sentiment'] == 'bearish':
            attributes['article_sentiment'] = 'bear'
        if attributes['article_type'] in ('fact', 'Fact', 'factual'):
            attributes['article_type'] = 'fact'
        if attributes['popularity'] in ('Breaking_news'):
            attributes['popularity'] = 'breaking_news'

In [46]:
set([cleaned_train_set[art_id]['attributes']['article_sentiment'] for art_id in cleaned_train_set])

{'bear', 'bull', 'na'}

In [47]:
set([cleaned_train_set[art_id]['attributes']['article_type'] for art_id in cleaned_train_set])

{'analysis', 'educational', 'fact', 'opinion', 'sponsored'}

In [48]:
set([cleaned_train_set[art_id]['attributes']['popularity'] for art_id in cleaned_train_set])

{'breaking_news', 'moderately_popular', 'niche'}

In [49]:
len(cleaned_train_set)

1246

In [50]:
len(train_set)

1287

In [51]:
len(malformed)

41

### Properly ordering the train dict

In [52]:
cleaned_train_set_copy = deepcopy(cleaned_train_set)

In [53]:
expected_dict_structure = {"analysis_of_financial_or_business_news": "",
"financial_or_business_news": "",
"analysis_of_relevant_for_india": "",
"relevant_for_india": "",
"analysis_of_article_validity_duration": "",
"article_validity_duration": "",
"analysis_of_popularity": "",
"popularity": "",
"analysis_of_article_type": "",
"article_type": "",
"analysis_of_article_sentiment": "",
"article_sentiment": "",
"headline_suggestion": "",
"first_attempt_summary": "",
"improved_summary": "",
"final_summary": "",
"top_categories": ""}

In [54]:
expected_dict_structure.keys()

dict_keys(['analysis_of_financial_or_business_news', 'financial_or_business_news', 'analysis_of_relevant_for_india', 'relevant_for_india', 'analysis_of_article_validity_duration', 'article_validity_duration', 'analysis_of_popularity', 'popularity', 'analysis_of_article_type', 'article_type', 'analysis_of_article_sentiment', 'article_sentiment', 'headline_suggestion', 'first_attempt_summary', 'improved_summary', 'final_summary', 'top_categories'])

In [55]:
cleaned_train_set = {article_id: {'content': cleaned_train_set_copy[article_id]['content'], 
                                  'attributes': {k: cleaned_train_set_copy[article_id]['attributes'][k] for k in expected_dict_structure}} for article_id in cleaned_train_set_copy.keys()}

In [56]:
list(cleaned_train_set.keys())[:10]

['65316c4d1e5cc42b1b13e030',
 '653169051e5cc42b1b13c9ce',
 '651e12bca662d76276b878ec',
 '652fd3151e5cc42b1b13a507',
 '6555cc9e4b13023f934aaf30',
 '6555bed64b13023f9348c585',
 '653169401e5cc42b1b13d7aa',
 '651dfa41a662d76276b83da7',
 '65316e9b1e5cc42b1b13e5b9',
 '653169151e5cc42b1b13ce9e']

In [57]:
malformed_top_categories = [article_id for article_id in cleaned_train_set if isinstance(cleaned_train_set[article_id]['attributes']['top_categories'], list)]

In [59]:
cleaned_train_set = {k: cleaned_train_set[k] for k in cleaned_train_set.keys() if k not in malformed_top_categories}

### Article Truncation logic

In [60]:
def calculate_tokens(text, encoder):  # Placeholder for your actual token calculation function
    # Your implementation will go here.
    return len(encoder.encode(text))  # Example: counting characters as tokens

def truncate_text_to_token_limit(text,encoder, token_limit):
    # First, check if the whole text is under the token limit
    if calculate_tokens(text, encoder) <= token_limit:
        return text  # The entire text is within the limit

    def is_under_limit(index):
        # Use the provided function to calculate tokens for the substring
        return calculate_tokens(text[:index], encoder) <= token_limit

    left, right = 0, len(text)
    valid_limit = 0  # This will hold the index of the last valid token position

    # Binary search to find the token limit
    while left <= right:
        mid = (left + right) // 2  # Find the midpoint
        if is_under_limit(mid):
            # If the midpoint is under the limit, store it as a valid limit
            valid_limit = mid
            left = mid + 1  # Move the left boundary to the right
        else:
            right = mid - 1  # Move the right boundary to the left

    # Find the last space before the valid_limit to ensure we're at a word boundary
    space_index = text.rfind(' ', 0, valid_limit)
    if space_index == -1:
        # If there's no space, we've hit the start of the text
        return text[:valid_limit]  # Return up to the valid limit even if mid-word

    # Return the text up to the last word within the token limit
    return text[:space_index]

### Prompt

In [178]:
system_prompt =  '''
You are the chief editor for a leading Indian financial and business news website. You evaluate critical attributes of articles to gate keep content quality. For many attributes, you will first provide a brief analysis of 15 to 30 words, followed by assessment.

1. analysis_of_financial_or_business_news (short text) : <analyse if article pertains to finance/business or not. government policies directly impacting indian corporations or investors are ok, but not if aren't>
2. financial_or_business_news (True or False) : <True or False based on previous attribute>
3. analysis_of_relevant_for_india (short text) : <analyse if article is relevant for indians. for example international articles about 401k or small foreign companies won't be relevant for india. however changes to fed interest rates or nasdaq or important news of large multinational corporations will be relevant>
4. relevant_for_india (True or False) : <True or False based on previous attribute>
5. analysis_of_article_validity_duration (short text) : <analyse relevance duration. Be stingy: Stock fluctuations, 1 day; significant policy changes - few days or a week; educational content with references to any regulations is 30 unless there are none - in which case is timeless. International news in India has shorter lifespan. breaking news are usually not timeless; quarterly analysis or results are usually valid for a 3 days, yearly analysis or results for a weeks and a much longer one for a month.>
6. article_validity_duration (one of 1, 3, 7, 14, 30, -1) : <calculate number of days based on previous attribute. -1: timeless. 1: article is relevant only for that day. 3: for a couple of days. 7: for a week. 14: for a couple of weeks. 30: for a month>
7. analysis_of_popularity (short text) : <analyse likely popularity of article - if its for niche audience, moderate_popularity or should be part of breaking_news section, depending on number of people who will be impacted by the news and the scale of the event. foreign entities known in india but not very popular will be mostly niche or rarely moderately popular. articles targeted to very specific business or pratices will be niche. infotainment business and financial articles with some drama are likely to be more popular. articles with a list of rules without compelling story-telling will be for niche audience>
8. popularity (one of niche, moderately_popular, breaking_news) : <based on previous attribute>
9. analysis_of_article_type (short text) : <analyse if the article is majorly factual, is an opinion piece, analysis, educational or likely sponsored. factual articles pass on information on events. opinion pieces have inferences or predictions either from the author or from statements without data. analysis pieces have substantial data to justify their inferences or predictions. if an article is overly zealous on certain stock and seems like an ad, then it is sponsored>
10. article_type (one of fact, opinion, analysis, educational, sponsored) : <based on previous attribute>
11. analysis_of_article_sentiment (short_text): <analyse if the sentiment of the article is bullish, bearish or NA. balanced is NA>
12. article_sentiment (one of bull, bear, na): <based on previous attribute>
13. headline_suggestion (short text) : <Write a headline based on the content of the article>
14. first_attempt_summary (text of 60 words) : <Generate concise, entity-dense summary. The summary should become highly dense but easily understood without the Article. limit it to no more than 60 words>
15. improved_summary (text of 60 words): <Identify contents of the article which are missing from the previous summary but are important part of the article>
16. final_summary (text of 60 words): <The finalised summary which is a mixture of first_attempt_summary and improved_summary. Don't critique the summary. This summary should be very concise but also all the important information of the article and yet concise.>
17. top_categories (5 semi colon seperated categories): <Hierarchy of 5 categories to which this article belongs. Start with a generic category and make it progressively specific. Select only 5 and seperate them by (;). Don't use either single or double quotes at any cost to avoid json.loads() failure>

your response should be a json structure with all the 17 above keys without missing any key. It is very important that the response is directly readable with json.loads(). no preamble or postamble. respond in the exact following structure:

{
"analysis_of_financial_or_business_news": "",
"financial_or_business_news": "",
"analysis_of_relevant_for_india": "",
"relevant_for_india": "",
"analysis_of_article_validity_duration": "",
"article_validity_duration": "",
"analysis_of_popularity": "",
"popularity": "",
"analysis_of_article_type": "",
"article_type": "",
"analysis_of_article_sentiment": "",
"article_sentiment": "",
"headline_suggestion": "",
"first_attempt_summary": "",
"improved_summary": "",
"final_summary": "",
"top_categories": ""
}
'''

### Article Size distribution

In [62]:
article_sizes = {k: len(tokenizer.encode(cleaned_train_set[k]['content'])) for k in cleaned_train_set}

In [63]:
article_sizes

{'65316c4d1e5cc42b1b13e030': 401,
 '653169051e5cc42b1b13c9ce': 485,
 '651e12bca662d76276b878ec': 475,
 '652fd3151e5cc42b1b13a507': 2323,
 '6555cc9e4b13023f934aaf30': 251,
 '6555bed64b13023f9348c585': 1994,
 '653169401e5cc42b1b13d7aa': 506,
 '651dfa41a662d76276b83da7': 476,
 '65316e9b1e5cc42b1b13e5b9': 1488,
 '653169151e5cc42b1b13ce9e': 516,
 '651e0776a662d76276b85eac': 512,
 '652ebbdb1e5cc42b1b139b67': 837,
 '6555ccd54b13023f934adbf9': 374,
 '655c20fc4b13023f934af5cc': 440,
 '651dcd68a662d76276b7c9b2': 587,
 '653680111e5cc42b1b143f48': 1194,
 '65316f731e5cc42b1b1400aa': 1593,
 '65316f411e5cc42b1b13f5ed': 518,
 '651e0a35a662d76276b864e1': 1008,
 '6555ccc64b13023f934acf58': 184,
 '6555c84a4b13023f934924d6': 497,
 '6555cc6c4b13023f934a8d15': 544,
 '651dfeeaa662d76276b84929': 323,
 '6555c8144b13023f934905df': 459,
 '6531687f1e5cc42b1b13add0': 604,
 '651de5aaa662d76276b80a31': 434,
 '651e18d3a662d76276b88714': 649,
 '65316f6c1e5cc42b1b13ff04': 1602,
 '651dd58fa662d76276b7e02c': 272,
 '6555c

#### Finding the token limit for articles

In [75]:
message_template = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": f" '|article_start|\n'{''}'\n|article_end|'\n"}]

OUTPUT_TOKEN_LIMIT = 1024 # set based on the distribution of completion tokens from gpt4
INSTRUCTION_TOKENS = len(tokenizer.apply_chat_template(message_template, add_generation_prompt=True))
BUFFER_TOKENS = 10
ARTICLE_TOKEN_LIMIT = 4096 - OUTPUT_TOKEN_LIMIT - INSTRUCTION_TOKENS - BUFFER_TOKENS
ARTICLE_TOKEN_LIMIT

1733

In [67]:
train_df = pd.DataFrame.from_dict(cleaned_train_set).T

In [68]:
train_df['attributes'].sample(10)

65316a3a1e5cc42b1b13dba4    {'analysis_of_financial_or_business_news': 'Th...
6555cb8b4b13023f9349ecdd    {'analysis_of_financial_or_business_news': 'Th...
653168771e5cc42b1b13acc1    {'analysis_of_financial_or_business_news': 'Th...
651dfeb6a662d76276b848ae    {'analysis_of_financial_or_business_news': 'Th...
655c20fc4b13023f934af5cc    {'analysis_of_financial_or_business_news': 'Th...
6555c16b4b13023f9348cf85    {'analysis_of_financial_or_business_news': 'Th...
6555c7fb4b13023f9348f68b    {'analysis_of_financial_or_business_news': 'Th...
651df698a662d76276b83604    {'analysis_of_financial_or_business_news': 'Th...
6555c7fd4b13023f9348f71d    {'analysis_of_financial_or_business_news': 'Th...
651dc7a3a662d76276b7b9f1    {'analysis_of_financial_or_business_news': 'Th...
Name: attributes, dtype: object

In [70]:
dataset = Dataset.from_pandas(train_df)

### Setting up prompt in ChatML Format

In [78]:
def format_text_response_as_prompt(train_row):
    truncated_content = truncate_text_to_token_limit(text=train_row['content'], encoder=tokenizer, token_limit=ARTICLE_TOKEN_LIMIT)
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{truncated_content}\n"}]
    context_prompt = tokenizer.decode(tokenizer.apply_chat_template(messages, add_generation_prompt=True))
    prompt = context_prompt + json.dumps(train_row['attributes'])
    prompt = re.sub(r'\n+','\n',prompt)
    return prompt

In [79]:
format_text_response_as_prompt(train_df.iloc[0])

'<|im_start|> system\nYou are the chief editor for a leading Indian financial and business news website. You evaluate critical attributes of articles to gate keep content quality. For many attributes, you will first provide a brief analysis of 15 to 30 words, followed by assessment.\n1. analysis_of_financial_or_business_news (short text) : <analyse if article pertains to finance/business or not. government policies directly impacting indian corporations or investors are ok, but not if aren\'t>\n2. financial_or_business_news (True or False) : <True or False based on previous attribute>\n3. analysis_of_relevant_for_india (short text) : <analyse if article is relevant for indians. for example international articles about 401k or small foreign companies won\'t be relevant for india. however changes to fed interest rates or nasdaq or important news of large multinational corporations will be relevant>\n4. relevant_for_india (True or False) : <True or False based on previous attribute>\n5. a

### Create chunks

In [82]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [80]:
# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_text_response_as_prompt(sample)}{tokenizer.eos_token}"
    return sample

dataset = dataset.map(template_dataset)
# tokenize dataset
dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)
)
# chunk dataset
lm_dataset = pack_dataset(dataset, chunk_length=4096) # We use 4096 as the maximum length for packing
print(f"Total number of samples: {len(lm_dataset)}")

Map:   0%|          | 0/1241 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/1241 [00:00<?, ? examples/s]

Chunking dataset into chunks of 4096 tokens.


Map:   0%|          | 0/1241 [00:00<?, ? examples/s]

Total number of samples: 821
Total number of samples: 821


In [None]:
tokenizer.decode(lm_dataset[0]['input_ids'])

'<s><|im_start|>  system\nYou are the chief editor for a leading Indian financial and business news website. You evaluate critical attributes of articles to gate keep content quality. For many attributes, you will first provide a brief analysis of 15 to 30 words, followed by assessment.\n1. analysis_of_financial_or_business_news (short text) : <analyse if article pertains to finance/business or not. government policies directly impacting indian corporations or investors are ok, but not if aren\'t>\n2. financial_or_business_news (True or False) : <True or False based on previous attribute>\n3. analysis_of_relevant_for_india (short text) : <analyse if article is relevant for indians. for example international articles about 401k or small foreign companies won\'t be relevant for india. however changes to fed interest rates or nasdaq or important news of large multinational corporations will be relevant>\n4. relevant_for_india (True or False) : <True or False based on previous attribute>\n

In [93]:
with open('words.txt','r') as f:
    words = f.read()

In [96]:
words = words.split('\n')

In [101]:
finetune_id = ''.join(np.random.choice(words, 3))

In [103]:
from datetime import datetime

In [111]:
datetime.strftime(datetime.today(),'%Y-%m-%d')

'2023-12-03'

### Saving training data to s3

In [120]:
finetune_dataset_config = {'finetune_id': finetune_id,
                          'date': datetime.strftime(datetime.today(),'%Y-%m-%d'),
                          'num_datapoints': len(train_set),
                            'data_source': 'gpt4',
                          'prompt': system_prompt}

In [115]:
finetune_dataset_config['finetune_id']

'WatermelonSapphireZipline'

In [123]:
# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/fine_tuning_datasets/{finetune_dataset_config["date"]}-{finetune_dataset_config["finetune_id"]}'
lm_dataset.save_to_disk(training_input_path)

print("uploaded data to:")
print(f"training dataset to: {training_input_path}")

Saving the dataset (0/1 shards):   0%|          | 0/821 [00:00<?, ? examples/s]

uploaded data to:
training dataset to: s3://sagemaker-ap-south-1-005418323977/fine_tuning_datasets/2023-12-03-WatermelonSapphireZipline


In [124]:
lm_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 821
})

### Hyperparamters

In [132]:
from huggingface_hub import HfFolder


# hyperparameters, which are passed into the training job
hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'num_train_epochs': 1,                            # number of training epochs
  'per_device_train_batch_size': 3,                 # batch size for training
  'gradient_accumulation_steps': 2,                 # Number of updates steps to accumulate
  'gradient_checkpointing': True,                   # save memory but slower backward pass
  'bf16': True,                                     # use bfloat16 precision
  'tf32': True,                                     # use tf32 precision
  'learning_rate': 2e-4,                            # learning rate
  'max_grad_norm': 0.3,                             # Maximum norm (for gradient clipping)
  'warmup_ratio': 0.03,                             # warmup ratio
  "lr_scheduler_type":"cosine_with_restarts",                   # learning rate scheduler
  'save_strategy': "epoch",                         # save strategy for checkpoints
  "logging_steps": 10,                              # log every x steps
  'merge_adapters': True,                           # wether to merge LoRA into the model (needs more memory)
  'use_flash_attn': True,                           # Whether to use Flash Attention
  'output_dir': '/tmp/run',                         # output directory, where to save assets during training
                                                    # could be used for checkpointing. The final trained
                                                    # model will always be saved to s3 at the end of training
}

if HfFolder.get_token() is not None:
    hyperparameters['hf_token'] = HfFolder.get_token() # huggingface token to access gated models, e.g. llama 2

In [133]:
from sagemaker.huggingface import HuggingFace

# define Training Job Name
job_name = f'huggingface-qlora-{hyperparameters["model_id"].replace("/","-").replace(".","-")}-{finetune_dataset_config["finetune_id"]}'

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'run_qlora.py',    # train script
    source_dir           = '../utils/',      # directory which includes all the files needed for training
    instance_type        = 'ml.g5.4xlarge',   # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    max_run              = 6*60*60,        # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.28',            # the transformers version used in the training job
    pytorch_version      = '2.0',             # the pytorch_version version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      =  hyperparameters,  # the hyperparameters passed to the training job
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
    disable_output_compression = True         # not compress output to save training time and cost
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml


### Training job

In [134]:
# define a data input dictonary with our uploaded s3 uris
data = {'training': training_input_path}
# starting the train job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-qlora-teknium-OpenHermes-2--2023-12-03-07-38-05-843


Using provided s3_resource
2023-12-03 07:38:07 Starting - Starting the training job...
2023-12-03 07:38:21 Starting - Preparing the instances for training......
2023-12-03 07:39:28 Downloading - Downloading input data...
2023-12-03 07:39:48 Training - Downloading the training image.....................
2023-12-03 07:43:39 Training - Training image download completed. Training in progress.......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-12-03 07:44:38,999 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-12-03 07:44:39,017 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-12-03 07:44:39,026 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-12-03 07:44:39,028 sagemaker_pytorch_container.training INFO     Invoking user training script.

### Deployment

In [135]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.1.0",
  session=sess,
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

INFO:sagemaker.image_uris:Defaulting to only available Python version: py39
INFO:sagemaker.image_uris:Defaulting to only supported image scope: gpu.


llm image uri: 763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04


In [136]:
llm_image

'763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'

In [137]:
role

'arn:aws:iam::005418323977:role/service-role/AmazonSageMaker-ExecutionRole-20231030T210397'

#### Download model to local

In [335]:
import boto3
import os

# Initialize a boto3 S3 client
s3 = boto3.client('s3')

In [337]:
llm_model.name

'huggingface-pytorch-tgi-inference-2023-12-03-10-21-58-396'

In [343]:
import boto3
import os

# Initialize a boto3 S3 client
s3 = boto3.client('s3')

# S3 bucket and folder details
bucket_name = 'sagemaker-ap-south-1-005418323977'
s3_folder = 's3://sagemaker-ap-south-1-005418323977/huggingface-qlora-teknium-OpenHermes-2--2023-12-03-07-38-05-843'

# Local directory to save files
local_folder = './OpenHermes_WatermelonSapphireZipline/'

# List objects within the specified S3 folder
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

# Download each file in the folder
for obj in objects.get('Contents', []):
    s3_file_path = obj['Key']
    local_file_path = os.path.join(local_folder, s3_file_path[len(s3_folder):])
    os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
    s3.download_file(bucket_name, s3_file_path, local_file_path)
    print(f'Downloaded {s3_file_path} to {local_file_path}')
# Remember to replace 'your-bucket-name', 'your-folder-name/', and 'path/to/local/folder/' with your actual bucket name, S3 folder, and local folder path.


In [64]:
import transformers

In [65]:
from transformers import AutoModel, AutoConfig
from huggingface_hub import HfFolder

In [66]:
import transformers

In [67]:
transformers.__version__

'4.35.2'

In [12]:
!pip install --upgrade git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /private/var/folders/d4/cgyr_gnj7nn2wy_hq40gkq8c0000gq/T/pip-req-build-fvqwwsub
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /private/var/folders/d4/cgyr_gnj7nn2wy_hq40gkq8c0000gq/T/pip-req-build-fvqwwsub
  Resolved https://github.com/huggingface/transformers to commit 3bc50d81e6c70d63e59d635106bac6a561b47681
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.36.0.dev0-py3-none-any.whl size=8048947 sha256=38109c4d4a4b05baf5d143f483a397e952fdded8c2a4b4dc9c36e75e3483b40d
  Stored in directory: /private/var/folders/d4/cgyr_gnj7nn2

In [4]:
!pip install -U transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [68]:
import torch

### Upload to HF

In [69]:
model = AutoModel.from_pretrained('./hermes_finetuned_model/', local_files_only = True, torch_dtype = torch.float16)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [70]:
model.push_to_hub('WintWealth/partial_finetuned_open_hermes_2.5', token = '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', private=True)

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.28G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/WintWealth/partial_finetuned_open_hermes_2.5/commit/e48ca5f4ad711b31dfa6ea52d45a0e548e5a9adb', commit_message='Upload model', commit_description='', oid='e48ca5f4ad711b31dfa6ea52d45a0e548e5a9adb', pr_url=None, pr_revision=None, pr_num=None)

### Deploy

In [286]:
4096 - 768

3328

In [341]:
model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

In [342]:
model_s3_path

's3://sagemaker-ap-south-1-005418323977/huggingface-qlora-teknium-OpenHermes-2--2023-12-03-07-38-05-843/output/model/'

In [349]:
import json
from sagemaker.huggingface import HuggingFaceModel

# s3 path where the model will be uploaded
# if you try to deploy the model to a different time add the s3 path here
# model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]
model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# sagemaker config
instance_type = "ml.g5.xlarge"
number_of_gpu = 1
health_check_timeout = 300

# Define Model and Endpoint configuration parameter
config = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(3072), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(4096), # Max length of the generation (including input text)
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  model_data={'S3DataSource':{'S3Uri': model_s3_path,'S3DataType': 'S3Prefix','CompressionType': 'None'}},
  env=config
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml


In [350]:

# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  initial_instance_count=6,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

INFO:sagemaker:Creating model with name: huggingface-pytorch-tgi-inference-2023-12-06-06-56-59-809
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-tgi-inference-2023-12-06-06-57-00-520
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-tgi-inference-2023-12-06-06-57-00-520


------------!

In [351]:
endpoint_name = llm.endpoint_name

In [352]:
endpoint_name

'huggingface-pytorch-tgi-inference-2023-12-06-06-57-00-520'

In [175]:
def format_article_for_prompt(article_text):
    truncated_content = truncate_text_to_token_limit(text=article_text, encoder=tokenizer, token_limit=ARTICLE_TOKEN_LIMIT)
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{truncated_content}\n"}]
    context_prompt = tokenizer.decode(tokenizer.apply_chat_template(messages, add_generation_prompt=True))
    prompt = re.sub(r'\n+','\n',context_prompt)
    return prompt

#### Testing for a few articles

In [182]:
parent_folder = '/Users/username/Desktop/ML/Recommendations/arcane/'
from hydra import compose, initialize
import os

import xml.etree.ElementTree as ET

tree = ET.parse('../../../conf/application.run.xml')
root = tree.getroot()

envs_element = root.find('./configuration/envs')
for variable in envs_element.findall('env'):
    name = variable.get('name')
    value = variable.get('value')
    os.environ[name] = value

import sys
sys.path.append('/Users/username/Desktop/ML/Recommendations/arcane/')

from src._utils import load_bertopic_model_from_hf

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [183]:
from src.articles.ArticleService import ArticleService

In [187]:
art = ArticleService.get_article_json_from_s3_and_api('651e1c48a662d76276b88eee')

art['title'] + art['cleaned_text']

In [309]:
import boto3
from botocore.config import Config

# Configure the max_pool_connections
my_config = Config(
    region_name = 'ap-south-1',
    retries = {
        'max_attempts': 10,
        'mode': 'standard'
    },
    max_pool_connections = 40  # Increase the pool size
)

# Create a SageMaker Runtime client with the custom configuration
sess1 = boto3.session.Session()
smr1 = sess1.client("sagemaker-runtime", config=my_config)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [188]:
smr = sess.boto_session.client("sagemaker-runtime")

In [189]:
parameters = {
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.8,
    "max_new_tokens": 1024,
    "repetition_penalty": 1.03,
    "stop": ["###", "</s>", tokenizer.eos_token],
}

In [190]:
import re

In [191]:
art = ArticleService.get_article_json_from_s3_and_api('651e1c48a662d76276b88eee')
content = art['title'] + art['cleaned_text']

In [192]:
content

"Chelsea vs Real Madrid: Live streaming, where to watch Champions League matchChelsea will play Real Madrid in a crucial Champions League quarterfinal second-leg on April 18 at Stamford Bridge. The first-leg in Madrid saw the Blues suffer a 2-0 defeat, as Karim Benzema and Marco Asensio scored to give the defending champions a comfortable win at Santiago Bernabeu.For Chelsea, the return leg marks a pivotal moment in their season, with a Champions League exit ending their hopes of silverware. However, the team under Frank Lampard's management has struggled lately, with three consecutive defeats, making their task even more daunting.Fortunately, fans worldwide will have the chance to stream the game online. The Sporting News has provided all the necessary details on how to do so.Chelsea vs Real Madrid Live StreamChampions League streaming coverage will be offered by beIN Sports in Singapore, Hong Kong, and Malaysia, while JioTV and Sony LIV will be the streaming options available in Indi

In [193]:
request = {"inputs": format_article_for_prompt(content), "parameters": parameters, "stream": False}

In [194]:
resp = smr.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(request),
    ContentType="application/json",
)

In [195]:
k = resp['Body'].read()

In [196]:
json.loads(json.loads(k)[0]['generated_text'])

{'analysis_of_article_sentiment': 'The sentiment is neutral as it presents a factual report of the upcoming match.',
 'analysis_of_article_type': 'The article is factual, providing information about the upcoming Champions League game between Chelsea and Real Madrid.',
 'analysis_of_article_validity_duration': "The article contains information about an upcoming game, so it's valid until the event happens. After that, it's irrelevant.",
 'analysis_of_financial_or_business_news': 'The article is not directly related to finance or business, but it discusses a high-profile football match and provides information about streaming options.',
 'analysis_of_popularity': 'The article would be popular among football fans and sports enthusiasts, especially those interested in the Champions League.',
 'analysis_of_relevant_for_india': 'This is a Champions League football match between Chelsea and Real Madrid, which might be of interest to Indian football fans. It will be broadcast on Sony LIV.',
 'a

In [275]:
finetune_id

'WatermelonSapphireZipline'

In [273]:
endpoint_name

'huggingface-pytorch-tgi-inference-2023-12-03-10-21-59-120'

In [197]:
test_articles = ['651de546a662d76276b80931',
 '651e2409a662d76276b8a181',
 '651dfe88a662d76276b84838',
 '651e0d14a662d76276b86b88',
 '651e2286a662d76276b89de1',
 '656719c9207bb033b910ea11',
 '651df8e1a662d76276b83a66',
 '651dca8aa662d76276b7c1f8',
 '651e0f43a662d76276b870aa',
 '6566a94f39bfc8784efe5b7c',
 '651e1c48a662d76276b88eee',
 '651dc511a662d76276b7b33e',
 '6555c17a4b13023f9348cfc5',
 '651de912a662d76276b81322',
 '6555c1f84b13023f9348d104',
 '65683309207bb033b910ed9e',
 '656557db1fc4586a032ad63e',
 '651e093aa662d76276b862ab',
 '651dd514a662d76276b7df16',
 '651e14c8a662d76276b87d8a']

In [276]:
new_endpoint_name = 'OpenHermes-Finetune-WatermelonSapphireZipline'

In [277]:
y, z = get_attributes_from_llm('6555c17a4b13023f9348cfc5')

In [310]:
def get_attributes_from_llm(article_id):
    try:
        article = ArticleService.get_article_json_from_s3_and_api(article_id)
    except:
        return article_id, ''
    full_content = article['title'] + article['cleaned_text']
    full_input = format_article_for_prompt(full_content)
    request = {"inputs": full_input, "parameters": parameters, "stream": False}
    try:
        resp = smr1.invoke_endpoint(
            EndpointName=new_endpoint_name,
            Body=json.dumps(request),
            ContentType="application/json",
            )
        k = resp['Body'].read()
        try:
            attributes = json.loads(json.loads(k)[0]['generated_text'])
            return article_id, attributes
        except:
            return article_id, ''
    except:
        print(article_id, 'some error')
        article_id, ''

In [None]:
Binary.createFromBase64('W3siZ2VuZXJhdGVkX3RleHQiOiJ7XCJhbmFseXNpc19vZl9hcnRpY2xlX3NlbnRpbWVudFwiOiBcIlRoZSBzZW50aW1lbnQgaXMg…', 0)

In [200]:
import time

In [202]:
from concurrent.futures import ThreadPoolExecutor, as_completed

In [203]:
start_time = time.time()
responses = {}
valid_attributes = {}
invalid_response_ids = []

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(get_attributes_from_llm, art_id) for art_id in test_articles]
    for future in as_completed(futures):
        art_id, response = future.result()
        responses[art_id] = response
            # invalid_response_ids.append(art_id)

print(f'Processed {len(test_articles)} IDs in {time.time() - start_time} seconds')

Processed 20 IDs in 93.18964195251465 seconds


### Running the results for all the articles

In [328]:
from sql.PostgresDatabaseOperation import PostgresDatabaseOperation

with PostgresDatabaseOperation() as cursor:
    sql = 'SELECT DISTINCT article_id FROM embeddings'
    cursor.execute(sql)
    results = cursor.fetchall()

all_article_ids = [x[0] for x in results]

In [329]:
len(all_article_ids)

118008

In [330]:
chunk_size = 100

In [331]:
num_chunks = (len(all_article_ids)//chunk_size) + 1

In [266]:
pd.DataFrame(columns = ['article_id', 'attributes']).to_csv(f'all_article_attributes_{finetune_id}.csv',index = False)

In [270]:
i

17

In [272]:
len('OpenHermes_WatermelonSapphireZipline')

36

In [271]:
finetune_id

'WatermelonSapphireZipline'

In [314]:
from datetime import datetime

In [321]:
i

107

In [333]:
np.random.choice(all_article_ids)

'652fd0311e5cc42b1b13a398'

In [334]:
get_attributes_from_llm('652fd0311e5cc42b1b13a398')

('652fd0311e5cc42b1b13a398', '')

In [None]:
for i in range(30, num_chunks):
    cur_article_ids = all_article_ids[i * chunk_size: (i+1)*chunk_size]
    start_time = time.time()
    completed = 0
    with ThreadPoolExecutor(max_workers=40) as executor:
        futures = [executor.submit(get_attributes_from_llm, art_id) for art_id in cur_article_ids]
        for future in as_completed(futures):
            if future.result():
                art_id, response = future.result()
                if response:
                    pd.DataFrame([(art_id, json.dumps(response))]).to_csv('all_article_attributes_WatermelonSapphireZipline.csv',mode='a',index=False,header = False)
                    completed += 1
            # invalid_response_ids.append(art_id)
    print(f'Processed {completed} IDs in {time.time() - start_time} seconds in chunk number {i} at {datetime.now()}')

In [None]:
def get_attributes_for_article_ids(article_ids, max_retries=0, retry_count=0):
    start_time = time.time()
    responses = {}
    valid_attributes = {}
    invalid_response_ids = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(get_attributes_from_llm, art_id) for art_id in article_ids]
        print(f'completed in {time.time() - start_time} seconds')
        for future in as_completed(futures):
            art_id, response = future.result()
            if response is None:  # Timeout or other errors
                invalid_response_ids.append(art_id)
                continue

            responses[art_id] = response
            try:
                valid_attributes[art_id] = json.loads(response['llm_attributes'])
            except:
                return response
                # invalid_response_ids.append(art_id)

    print(f'Attempt {retry_count + 1}: Processed {len(article_ids)} IDs in {time.time() - start_time} seconds')

    if invalid_response_ids and retry_count < max_retries:
        print(f'Retrying for {len(invalid_response_ids)} invalid IDs')
        valid_attributes_retry = get_attributes_for_article_ids(invalid_response_ids, max_retries, retry_count + 1)
        valid_attributes.update(valid_attributes_retry)

    return valid_attributes

In [146]:
txt = '''
The article highlights the challenges faced by the Indian stock market in light of global economic issues. Despite India's strong economy and positive corporate earnings growth, foreign institutional investors (FIIs) are selling their shares due to the global economic deceleration. The contraction in the global economy, led by Europe's recession and China's decelerating economy, is affecting the Indian market. The recent fall in the stock market is attributed to global factors, not intrinsic structural issues within India. The article concludes by stating that despite the challenges, the Indian stock market has maintained a high valuation, which is leading to consolidation in prices and valuations
'''

### Delete endpoint