In [None]:
!pip install newscatcherapi-python-sdk==6.0.2 -q
%pip install langchain==0.1.11

In [3]:
import pandas as pd
from pprint import pprint
from newscatcherapi_client import Newscatcher, ApiException
import json
import os
import sys
import botocore
import boto3
from langchain.llms.bedrock import Bedrock
import re


def llm_news_generator():

    newscatcher = Newscatcher(
        api_key="INSERT API KEY",
    )

    try:
        get_response = newscatcher.search.get(
            q="flu",
            search_in="title_content",
            clustering_enabled=True,
            from_='7 day ago',
        )
    except ApiException as e:
        print("Exception when calling AuthorsApi.get: %s\n" % e)
        if e.status == 422:
            pprint(e.body["detail"])
        pprint(e.headers)
        pprint(e.status)
        pprint(e.reason)
        pprint(e.round_trip_time)

    data = []

    def remove_duplicates(s):
        return ' '.join(sorted(set(s.split())))

    for j in range(len(get_response.clusters)):
        
        test_data = get_response.clusters[j]

        all_cluster_text = ""
        country = ""

        for i in range(len(test_data.articles)):

            description = test_data.articles[i]['description']
            if description is not None:
                all_cluster_text += description
            else:
                all_cluster_text += test_data.articles[i]['title']

            country_link =  test_data.articles[i]['country']
            if country_link is not None:
                country += " " + country_link
            else:
                country += ' None'

        country = remove_duplicates(country.strip())

        # Append to data list
        data.append({'Country': country.strip(), 'ClusterText': all_cluster_text.strip()})

    # Create a DataFrame from the data list
    df = pd.DataFrame(data)

    # Bedrock
    boto3_bedrock = boto3.client('bedrock')
    bedrock_runtime = boto3.client('bedrock-runtime')

    prompt = """

    Human: Given the description of illness/diseases provided , please read it and analyse the contents.

    Please extract the following information from the email:
    - Whether the Illness or diseases is an outbreak or not, return it inside <outbreak></outbreak> XML tags usinfg TRUE or FALSE, TRUE is it is an outbreak, FALSE if it is not. Return Unsure, of not sure.
    - TThe number of cases reported from the description in HUMANS <cases></cases> XML tags as a number.
    - The name of the illness from the description <name></name> XML tags.
    - The sentiment on whether the article is negative or neutral <sentiment></sentiment> XML tags.
    - The regions where the incident occured, by states/cities <region></region> XML tags.
    - TThe number of deaths reported from the description in HUMANS <deaths></deaths> XML tags as a number.
    - The description of the disease/illness in english <description></description> XML tags.


    If a particular bit of information is not present, return an empty string.
    Make sure that each question can be understoon by itself, incorporate context if requred.
    Each returned question should be concise, remove extra information if possible.
    The dontext will be given between <context></context> XML tags.

    <context>
    {data_test}
    </context>

    Return the outbreak inside <outbreak></outbreak> XML tags.
    Return the number of cases in HUMANS inside <cases></cases> XML tags.
    Return the name of the illness or diseases inside <name></name> XML tags.
    Return the sentiment of the context inside <sentiment></sentiment> XML tags.
    Return the name of the region inside <region></region> XML tags.
    Return the number of deaths in HUMANS inside <deaths></deaths> XML tags.
    Return the description of the context inside <description></description> XML tags.


    Assistant:"""

    df = df[df['ClusterText']!=""].reset_index(drop=True)

    data_llm = []

    # - create the Anthropic Model
    llm = Bedrock(
        model_id="anthropic.claude-v2",
        client=bedrock_runtime,
        model_kwargs={
            "max_tokens_to_sample": 2000,
            "temperature": 0, # Using 0 to get reproducible results
            "stop_sequences": ["\n\nHuman:"]
        }
    )

    for i in range(len(df)):    
        query = prompt.format(data_test=df['ClusterText'][i])
        result = llm(query).strip()
        

        # Define regular expressions to extract data
        patterns = {
            'outbreak': r'<outbreak>(.*?)</outbreak>',
            'cases': r'<cases>(.*?)</cases>',
            'name': r'<name>(.*?)</name>',
            'sentiment': r'<sentiment>(.*?)</sentiment>',
            'region': r'<region>(.*?)</region>',
            'description': r'<description>(.*?)</description>',
            'deaths': r'<deaths>(.*?)</deaths>'
        }

        # Extract data using regular expressions
        data_dict = {key: re.findall(pattern, result, re.DOTALL)[0] for key, pattern in patterns.items()}

        # Create a DataFrame from the dictionary
        df1 = pd.DataFrame([data_dict])

        data_llm.append(data_dict)

    df1 = pd.DataFrame(data_llm)

    #df1 = pd.DataFrame(data_llm)
    df_final = pd.concat([df,df1], axis=1)

    return(df_final)



In [6]:
# Extract Flu Outbreaks
output = llm_news_generator()
output.to_csv('news_output.csv')
outbreaks = output[output['outbreak']=='TRUE']
outbreaks = outbreaks.drop_duplicates(subset='name').reset_index()

In [39]:
# Entity extraction, sentiment analysis etr on outbreaks
df_outbreak_final = pd.DataFrame()
for j in range(len(outbreaks)):

    newscatcher = Newscatcher(
            api_key="INSERT API KEY",
        )
    try:
        # [Get] Search By Author Request
        get_response = newscatcher.search.get(
            q=outbreaks.name[j],
            search_in="title_content",
            #clustering_enabled=True,
            from_='14 day ago',
        )
        #print(get_response)
    except ApiException as e:
        print("Exception when calling AuthorsApi.get: %s\n" % e)
        #pprint(e.body)
        if e.status == 422:
            pprint(e.body["detail"])
        pprint(e.headers)
        pprint(e.status)
        pprint(e.reason)
        pprint(e.round_trip_time)

        prompt_detail = """

        Human: Given the description of illness/diseases provided, please read it and analyse the contents.

        Please extract the following information from the email:
        - TThe number of cases reported from the description in HUMANS <cases></cases> XML tags as a number.
        - The sentiment on whether the article is positive, negative or neutral <sentiment></sentiment> XML tags.
        - The regions where the incident occured, by states/cities <region></region> XML tags.
        - TThe number of deaths reported from the description in HUMANS <deaths></deaths> XML tags as a number.
        - The description of the disease/illness in english <description></description> XML tags.
        - A very brief description of the impacts caused by the disease such as closures, bans or similar items <impacts></impacts> XML tags.
        - A very brief description of the impact on business caused by the disease <impacts_business></impacts_business> XML tags.


        If a particular bit of information is not present, return an empty string.
        Make sure that each question can be understoon by itself, incorporate context if requred.
        Each returned question should be concise, remove extra information if possible.
        The dontext will be given between <context></context> XML tags.

        <context>
        {data_test}
        </context>

        Return the number of cases in HUMANS inside <cases></cases> XML tags.
        Return the sentiment of the context inside <sentiment></sentiment> XML tags.
        Return the name of the region inside <region></region> XML tags.
        Return the number of deaths in HUMANS inside <deaths></deaths> XML tags.
        Return the description of the context inside <description></description> XML tags.
        Return the impacts of the disease/illness inside <impacts></impacts> XML tags.
        Return the impact on trade/infrastructure/business caused by the disease inside <impacts_business></impacts_business> XML tags.

        Assistant:"""

        outbreak_1 = pd.DataFrame(get_response.articles)
        
        boto3_bedrock = boto3.client('bedrock')
        bedrock_runtime = boto3.client('bedrock-runtime')
        # - create the Anthropic Model
        llm = Bedrock(
            model_id="anthropic.claude-v2",
            client=bedrock_runtime,
            model_kwargs={
                #"max_tokens_to_sample": 200000,
                "temperature": 0, # Using 0 to get reproducible results
                "stop_sequences": ["\n\nHuman:"]
            }
        )

        data_llm_oubreak = []

        for i in range(len(outbreak_1)):    
            
            query = prompt_detail.format(data_test=outbreak_1['content'][i])
            result = llm(query).strip()
            

            # Define regular expressions to extract data
            patterns = {
                'cases': r'<cases>(.*?)</cases>',
                'sentiment': r'<sentiment>(.*?)</sentiment>',
                'region': r'<region>(.*?)</region>',
                'description': r'<description>(.*?)</description>',
                'deaths': r'<deaths>(.*?)</deaths>',
                'impacts': r'<impacts>(.*?)</impacts>',
                'impacts_business': r'<impacts_business>(.*?)</impacts_business>'

            }

            data_dict = {}

            # Extract data using regular expressions
            for key, pattern in patterns.items():
                match = re.findall(pattern, result, re.DOTALL)
                if match:
                    data_dict[key] = match[0]
                else:
                    data_dict[key] = None

            # Create a DataFrame from the dictionary
            df2 = pd.DataFrame([data_dict])

            data_llm_oubreak.append(data_dict)

        df2 = pd.DataFrame(data_llm_oubreak)
        
        df4 = pd.DataFrame(data_llm_oubreak)

        df_outbreak1 = pd.concat([outbreak_1,df4], axis=1)

        df_outbreak1 = df_outbreak1.dropna(subset='sentiment')

        impacts = df_outbreak1['impacts'][0]
        impacts_business = df_outbreak1[df_outbreak1['impacts_business']!=''].reset_index()['impacts_business'][0]
        top_article = df_outbreak1['title'][0]
        negative_sentiment = round(len(df_outbreak1[df_outbreak1['sentiment'].str.lower() == 'negative'])/len(df_outbreak1),2)*100
        outbreak_name = outbreaks.name[j]

        df_outbreak_final_dict = {'outbreak_name': [outbreak_name],'impacts': [impacts], 'impacts_business': [impacts_business], 'top_article': [top_article], 'negative_sentiment': [negative_sentiment]}

    outbreak_output = pd.concat([df_outbreak_final, pd.DataFrame(df_outbreak_final_dict)], ignore_index=True)
    #Save
    outbreak_output.to_csv('llm_news_detail.csv')

In [None]:
%pip install fuzzywuzzy
from fuzzywuzzy import fuzz

In [None]:
# Function to compute similarity between descriptions
def compute_similarity(desc1, desc2):
    return fuzz.ratio(desc1.lower(), desc2.lower())

# Grouping descriptions based on similarity
grouped_descriptions = {}

for desc1 in df_outbreak1['description']:
    found_group = False
    for group in grouped_descriptions:
        for desc2 in group:
            if compute_similarity(desc1, desc2) > 80:  # You can adjust this threshold
                grouped_descriptions[group].append(desc1)
                found_group = True
                break
        if found_group:
            break
    if not found_group:
        grouped_descriptions[tuple([desc1])] = [desc1]

# Create a DataFrame from grouped descriptions
grouped_data = pd.DataFrame([(desc_group[0], ', '.join(desc_group), len(desc_group)) for desc_group in grouped_descriptions.values()], columns=['representative_description', 'grouped_descriptions', 'count'])

# Calculate the percentage of negative sentiments for each group
negative_percentages = []

for group in grouped_data['grouped_descriptions']:
    sentiments = [df_outbreak1.loc[df_outbreak1['description'] == desc, 'sentiment'].iloc[0] for desc in group.split(', ')]
    negative_count = sentiments.count('negative')
    total_count = len(sentiments)
    negative_percentage = (negative_count / total_count) * 100
    negative_percentages.append(negative_percentage)

grouped_data['negative_percentage'] = negative_percentages

In [19]:
grouped_data

Unnamed: 0,representative_description,grouped_descriptions,count
0,description,"description, description",2


In [61]:
extracted_outbreak1 = df_outbreak1[['cases', 'sentiment', 'region', 'description', 'deaths', 'impact','impact_business', 'published_date']]

In [64]:
extracted_outbreak1.head(20)

Unnamed: 0,cases,sentiment,region,description,description.1,deaths,impact,impact_business,published_date
0,945,negative,"Rajasthan, Udaipur, Jaipur, Bikaner, Bhilwara,...","Among the 12 swine flu deaths, four cases were...","Swine flu outbreak in Rajasthan, India with 94...",12,,,2024-04-15 13:49:20
1,3,negative,"Nashik, India",The sudden rise in swine flu cases has sparked...,The article describes a concerning rise in swi...,1,,,2024-04-16 10:37:12
2,945,Negative,Rajasthan,Despite the significant rise in swine flu case...,The email describes an outbreak of swine flu i...,12,,,2024-04-14 07:03:00
3,945,Negative,"Rajasthan, Jaipur, Udaipur, Bhilwara, Bikaner,...",Swine flu caused fatalities in Rajasthan. Jaip...,The article describes an outbreak of swine flu...,12,,,2024-04-14 04:01:00
4,0,neutral,United States,U.S. swine herd has not been implicated in the...,The article discusses emerging and reemerging ...,0,,,2024-04-11 05:00:00
5,3,negative,"Nashik, Maharashtra",Nashik News : स्वाईन फ्ल्यूचा प्रादुर्भाव जवळप...,The article describes cases of swine flu and C...,1,,,2024-04-16 05:08:45
6,39-56 million,Neutral,"North America, Europe, Asia, Rest of World (ROW)",/PRNewswire/ -- The global influenza vaccine m...,The article discusses the global influenza vac...,"400,000-730,000",,,2024-04-16 14:24:00
7,No cases mentioned,Negative,"Kabupaten Sikka, Nusa Tenggara Timur","Pemkab Sikka, NTT telusuri laporan ternak babi...","The local government in Sikka Regency, East Nu...",No deaths mentioned,,,2024-04-15 10:00:53
8,No cases mentioned,Negative,"Minnesota, Texas, Michigan, Kansas, New Mexico...",But the rash of recent infections among livest...,The article discusses an outbreak of avian inf...,No deaths mentioned,,,2024-04-05 21:34:37
9,Unknown,Negative,"United States, Texas, Antarctica, Vietnam, Chile",This Flu Focus includes the challenges of avia...,The email discusses outbreaks of avian influen...,Unknown,,,2024-04-11 16:45:36


In [186]:
# RAG

In [None]:
%pip install --upgrade pip
%pip install boto3==1.33.2 --force-reinstall --quiet
%pip install botocore==1.33.2 --force-reinstall --quiet
%pip install -U opensearch-py==2.3.1
%pip install -U boto3==1.33.2
%pip install -U retrying==1.3.4

In [None]:
# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [200]:
import json
import os
import boto3
from botocore.exceptions import ClientError
import pprint
from utility import create_bedrock_execution_role, create_oss_policy_attach_bedrock_execution_role, create_policies_in_oss, interactive_sleep
import random
from retrying import retry
suffix = random.randrange(200, 900)

sts_client = boto3.client('sts')
boto3_session = boto3.session.Session()
region_name = boto3_session.region_name
bedrock_agent_client = boto3_session.client('bedrock-agent', region_name=region_name)
service = 'aoss'
s3_client = boto3.client('s3')
account_id = sts_client.get_caller_identity()["Account"]
s3_suffix = f"{region_name}-{account_id}"
bucket_name = 'btr-data-1' # replace it with your bucket name.
pp = pprint.PrettyPrinter(indent=2)

bucket_name = 'btr-data-1' # replace it with your bucket name.
# Check if bucket exists, and if not create S3 bucket for knowledge base data source
try:
    s3_client.head_bucket(Bucket=bucket_name)
    print(f'Bucket {bucket_name} Exists')
except ClientError as e:
    print(f'Creating bucket {bucket_name}')
    s3bucket = s3_client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={ 'LocationConstraint': region_name }
    )
import boto3
import time
vector_store_name = f'bedrock-sample-rag-{suffix}'
index_name = f"bedrock-sample-rag-index-{suffix}"
aoss_client = boto3_session.client('opensearchserverless')
bedrock_kb_execution_role = create_bedrock_execution_role(bucket_name='btr-data-1')
bedrock_kb_execution_role_arn = bedrock_kb_execution_role['Role']['Arn']

In [205]:
# Create security, network and data access policies within OSS
encryption_policy, network_policy, access_policy = create_policies_in_oss(vector_store_name=vector_store_name,
                       aoss_client=aoss_client,
                       bedrock_kb_execution_role_arn=bedrock_kb_execution_role_arn)
collection = aoss_client.create_collection(name=vector_store_name,type='VECTORSEARCH')

In [None]:
# Get the OpenSearch serverless collection URL
collection_id = collection['createCollectionDetail']['id']
host = collection_id + '.' + region_name + '.aoss.amazonaws.com'
print(host)

In [None]:
# Wait for collection creation
# This can take couple of minutes to finish
response = aoss_client.batch_get_collection(names=[vector_store_name])
# Periodically check collection status
while (response['collectionDetails'][0]['status']) == 'CREATING':
    print('Creating collection...')
    interactive_sleep(5)
    response = aoss_client.batch_get_collection(names=[vector_store_name])
print('\nCollection successfully created:')
pp.pprint(response["collectionDetails"])

In [None]:
# Create opensearch serverless access policy and attach it to Bedrock execution role
try:
    create_oss_policy_attach_bedrock_execution_role(collection_id=collection_id,
                                                    bedrock_kb_execution_role=bedrock_kb_execution_role)
    # It can take up to a minute for data access rules to be enforced
    interactive_sleep(60)
except Exception as e:
    print("Policy already exists")
    pp.pprint(e)

In [211]:
# Create the vector index in Opensearch serverless, with the knn_vector field index mapping, specifying the dimension size, name and engine.
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, RequestError
credentials = boto3.Session().get_credentials()
awsauth = auth = AWSV4SignerAuth(credentials, region_name, service)

index_name = f"bedrock-sample-index-{suffix}"
body_json = {
   "settings": {
      "index.knn": "true",
       "number_of_shards": 1,
       "knn.algo_param.ef_search": 512,
       "number_of_replicas": 0,
   },
   "mappings": {
      "properties": {
         "vector": {
            "type": "knn_vector",
            "dimension": 1536,
             "method": {
                 "name": "hnsw",
                 "engine": "faiss",
                 "space_type": "l2"
             },
         },
         "text": {
            "type": "text"
         },
         "text-metadata": {
            "type": "text"         }
      }
   }
}

# Build the OpenSearch client
oss_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    timeout=300
)

In [None]:
# Create index
try:
    response = oss_client.indices.create(index=index_name, body=json.dumps(body_json))
    print('\nCreating index:')
    pp.pprint(response)

    # index creation can take up to a minute
    interactive_sleep(60)
except RequestError as e:
    # you can delete the index if its already exists
    # oss_client.indices.delete(index=index_name)
    print(f'Error while trying to create the index, with error {e.error}\nyou may unmark the delete above to delete, and recreate the index')
    

In [217]:
opensearchServerlessConfiguration = {
            "collectionArn": collection["createCollectionDetail"]['arn'],
            "vectorIndexName": index_name,
            "fieldMapping": {
                "vectorField": "vector",
                "textField": "text",
                "metadataField": "text-metadata"
            }
        }

# Ingest strategy - How to ingest data from the data source
chunkingStrategyConfiguration = {
    "chunkingStrategy": "FIXED_SIZE",
    "fixedSizeChunkingConfiguration": {
        "maxTokens": 512,
        "overlapPercentage": 20
    }
}

# The data source to ingest documents from, into the OpenSearch serverless knowledge base index
s3Configuration = {
    "bucketArn": f"arn:aws:s3:::{bucket_name}",
    # "inclusionPrefixes":["*.*"] # you can use this if you want to create a KB using data within s3 prefixes.
}

# The embedding model used by Bedrock to embed ingested documents, and realtime prompts
embeddingModelArn = f"arn:aws:bedrock:{region_name}::foundation-model/amazon.titan-embed-text-v1"

name = f"bedrock-sample-knowledge-base-{suffix}"
description = "RAG"
roleArn = bedrock_kb_execution_role_arn

In [218]:
# Create a KnowledgeBase
from retrying import retry

@retry(wait_random_min=1000, wait_random_max=2000,stop_max_attempt_number=7)
def create_knowledge_base_func():
    create_kb_response = bedrock_agent_client.create_knowledge_base(
        name = name,
        description = description,
        roleArn = roleArn,
        knowledgeBaseConfiguration = {
            "type": "VECTOR",
            "vectorKnowledgeBaseConfiguration": {
                "embeddingModelArn": embeddingModelArn
            }
        },
        storageConfiguration = {
            "type": "OPENSEARCH_SERVERLESS",
            "opensearchServerlessConfiguration":opensearchServerlessConfiguration
        }
    )
    return create_kb_response["knowledgeBase"]

In [219]:
try:
    kb = create_knowledge_base_func()
except Exception as err:
    print(f"{err=}, {type(err)=}")

In [221]:
# Get KnowledgeBase 
get_kb_response = bedrock_agent_client.get_knowledge_base(knowledgeBaseId = kb['knowledgeBaseId'])

In [None]:
# Create a DataSource in KnowledgeBase 
create_ds_response = bedrock_agent_client.create_data_source(
    name = name,
    description = description,
    knowledgeBaseId = kb['knowledgeBaseId'],
    dataSourceConfiguration = {
        "type": "S3",
        "s3Configuration":s3Configuration
    },
    vectorIngestionConfiguration = {
        "chunkingConfiguration": chunkingStrategyConfiguration
    }
)
ds = create_ds_response["dataSource"]
pp.pprint(ds)

In [None]:
# Get DataSource 
bedrock_agent_client.get_data_source(knowledgeBaseId = kb['knowledgeBaseId'], dataSourceId = ds["dataSourceId"])

In [224]:
# Start an ingestion job
start_job_response = bedrock_agent_client.start_ingestion_job(knowledgeBaseId = kb['knowledgeBaseId'], dataSourceId = ds["dataSourceId"])

In [None]:
job = start_job_response["ingestionJob"]
pp.pprint(job)

In [None]:
# Get job 
while(job['status']!='COMPLETE' ):
  get_job_response = bedrock_agent_client.get_ingestion_job(
      knowledgeBaseId = kb['knowledgeBaseId'],
        dataSourceId = ds["dataSourceId"],
        ingestionJobId = job["ingestionJobId"]
  )
  job = get_job_response["ingestionJob"]
pp.pprint(job)
interactive_sleep(40)

In [None]:
# Print the knowledge base Id in bedrock, that corresponds to the Opensearch index in the collection we created before, we will use it for the invocation later
kb_id = kb["knowledgeBaseId"]
pp.pprint(kb_id)

# Keep the kb_id for invocation later in the invoke request
%store kb_id

In [232]:
# Try out KB using RetrieveAndGenerate API
bedrock_agent_runtime_client = boto3.client("bedrock-agent-runtime", region_name=region_name)
# Lets see how different Anthropic models responds to the input text we provide
claude_model_ids = [ ["Claude 3 Sonnet", "anthropic.claude-3-sonnet-20240229-v1:0"] ]

In [233]:
def ask_bedrock_llm_with_knowledge_base(query: str, model_arn: str, kb_id: str) -> str:
    response = bedrock_agent_runtime_client.retrieve_and_generate(
        input={
            'text': query
        },
        retrieveAndGenerateConfiguration={
            'type': 'KNOWLEDGE_BASE',
            'knowledgeBaseConfiguration': {
                'knowledgeBaseId': kb_id,
                'modelArn': model_arn
            }
        },
    )

    generated_text = response['output']['text']
    return generated_text

In [255]:
prompt2 = """

Human: Summarise the following content to 500 characters. The final output should only have 500 characters

<context>
{data_test}
</context>

Assistant:"""

query2 = prompt2.format(data_test=df['ClusterText'][0])
result2 = llm(query2).strip()

In [None]:
query = "The following text contains desciption on an outbreak in the current_outbreak_tag. <current_outbreak>" + result2 + "</current_outbreak. Use the data extracted \
    on " + df_final.name[0] + " to understand whether the current_outbreak_tag outbreak is as bad as the outbreaks reported before"

for model_id in claude_model_ids:
    model_arn = f'arn:aws:bedrock:{region_name}::foundation-model/{model_id[1]}'
    generated_text = ask_bedrock_llm_with_knowledge_base(query, model_arn, kb_id)
    
    print(f"Generated using Amazon Bedrock and {model_id[0]}:")
    pp.pprint(generated_text)
    print()