In [None]:
import json
import os
from openai import OpenAI

In [None]:
# Models: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo

In [None]:
# API client used by get_response below. The key is taken from the
# OPENAI_API_KEY environment variable (None is passed when it is not set).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
prompt_1 = """You are a skilled analyst tasked with analyzing articles. Please analyze a given text and return annotations in JSON format on the likelihood (between 0 and 1) of certain topics / threats being detected in the text. Also annotate the indices (start and end) of where the specific topics occurred within and relative to the text. Topics are predetermined, keywords are not. Return up to 5 keywords.

Indices should indicate a range of characters where the topic is detected. The first character in the text is at index 0. The last character in the text is at index n-1, where n is the number of characters in the text. The indices should be in the format of a list of lists, where each sublist contains two integers: the start index and the end index. For example, [[5, 302], [720, 1302], ...]

Return only the formatted JSON object with the data from text below "-----". The keys in the topics field are predetermined and should not be altered (no keys changed, added or deleted). Each topic should contain a "topicLikelihood" and a list of "indices", if any parts of the text fit that topic. Return empty if none.
Output should be just the raw JSON object, no print statements, formatting or other output.

Example response (JSON object):
```json
{
	"keywords": ["technology", "crypto", "gpu"],
	"topics": {
		"supplyChainRisk": { // demonstrates a risk of supply chain issues
			"topicLikelihood": 0.234,
			"indices": [[5, 302], [720, 1302], ...]
		},
		"economicRisk": { // risk to economy
			...
		},
		"materialsRisk": { // risk of material shortages or large price changes
			...
		}
	}
}
```

Please check the validity of the returned JSON object and its contents before submitting. Thank you for your assistance:^)!

-----"""

prompt_2 = """You are a skilled analyst tasked with analyzing articles. Please analyze a given text and return annotations in JSON format on the likelihood (between 0 and 1) of certain topics / threats being detected in the text. Also annotate the indices (start and end) of where the specific topics occurred within and relative to the text. Topics are predetermined, keywords are not. Return up to 5 keywords.

Indices should indicate a range of characters where the topic is detected. The first character in the text is at index 0. The last character in the text is at index n-1, where n is the number of characters in the text. The indices should be in the format of a list of lists, where each sublist contains two integers: the start index and the end index. For example, [[5, 302], [720, 1302], ...]

Return only the formatted JSON object with the data from text below "-----". The keys in the topics field are predetermined and should not be altered (no keys changed, added or deleted). Each topic should contain a "topicLikelihood" and a list of "indices", if any parts of the text fit that topic. Return empty if none.
Output should be just the raw JSON object, no print statements, formatting or other output.

Example response (JSON object):
```json
{
	"keywords": ["technology", "crypto", "gpu"],
	"topics": {
		"supplyChainRisk": { // demonstrates a risk of supply chain issues
			"topicLikelihood": 0.234,
			"indices": [[5, 302], [720, 1302], ...]
		},
		"economicRisk": { // risk to economy
			...
		},
		"materialsRisk": { // risk of material shortages or large price changes
			...
		}
	}
}
```

Please check the validity of the returned JSON object and its contents before submitting. Thank you for your assistance:^)!

Start by creating a JSON object which lists the text sections which fit a specific topic, then print the JSON object. Also create a JSON object the indices of said sections, then print the JSON object. Finally, create a JSON object which lists the likelihood of a specific topic being present in the text, then print the JSON object.

-----"""

# two-shot example (the prompt embeds two worked Q/A examples)
prompt_3 = """# Task Description

You are a skilled analyst tasked with discovering potential threats within article text. Please analyze a given text and return results in JSON format. The JSON object can contain up to 5 keywords. The topics are fixed to "supplyChainThreat" (potential supply chain threats and issues), "economicThreat" (threats to the economy as a whole), and "ecologicalThreat" (threats to the environment, natural disasters, ...). Each topic should contain a "topicLikelihood" and a list of "indices", if any parts of the text fit that topic. Return empty list if none.

Example response (JSON object):
```json
{
	"keywords": ["technology", "crypto", "gpu"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0.934,
			"explanation": "Recent increases in cryptocurrency mining profitability have led to a surge in demand for GPUs, causing a shortage in supply and a significant increase in prices.",
			"indices": [[5, 302], [720, 1302], [2501, 2629]]
		},
		"economicThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"ecologicalThreat": {
			"topicLikelihood": 0.234,
			"explanation": "The increased demand for GPUs has led to a surge in production, which has a potential to cause environmental damage.",
			"indices": [[202, 720]]
		}
	}
}
```

Please check the validity of the returned JSON object and its contents before submitting. Thank you for your assistance:^)!

# Examples

## Example 1

Q: "The recent surge in demand for GPUs has led to a shortage in supply. This has caused a significant increase in prices and has a potential to disrupt the market."

A: 
```json
{
	"keywords": ["supply shortage", "price increases", "market disruption"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0.934,
			"explanation": "Higher demand for GPUs has led to a shortage in supply",
			"indices": [[0, 68]]
		},
		"economicThreat": {
			"topicLikelihood": 0.828,
			"explanation": "Price increases can disrupt the market",
			"indices": [[87, 159]]
		},
		"ecologicalThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		}
	}
}
```

## Example 2

Q: "Planning for a Summer vacation? Here are top 10 destinations to visit in 2024! While some of these may be a bit random and remote, they are definitely worth the trip."

A: 
```json
{
	"keywords": ["summer vacation", "advertisement"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"economicThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"ecologicalThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		}
	}
}
```

# Prompt

Q: """

# same as prompt_3 but with an explanation of how character indices are counted
prompt_4 = """# Task Description

You are a skilled analyst tasked with discovering potential threats within article text. Please analyze a given text and return results in JSON format. The JSON object can contain up to 5 keywords. The topics are fixed to "supplyChainThreat" (potential supply chain threats and issues), "economicThreat" (threats to the economy as a whole), and "ecologicalThreat" (threats to the environment, natural disasters, ...). Each topic should contain a "topicLikelihood" and a list of "indices", if any parts of the text fit that topic. Return empty list if none.

Counting indices occurs as follows:
"Counting example" -> "C" (0), "o" (1), "u" (2), "n" (3), "t" (4), "i" (5), "n" (6), "g" (7), " " (8), "e" (9), "x" (10), "a" (11), "m" (12), "p" (13), "l" (14), "e" (15)
Indices of "Counting": [0, 7]
Indices of "example": [9, 15]

When you find occurence of a topic (e.g. one that could be classified as a "supplyChainThreat"), you should return the indices of the first and last character of the word in the text. If the word occurs multiple times, return all occurences that match the topic. If the topic does not occur, return an empty list.

Example response (JSON object):
```json
{
	"keywords": ["technology", "crypto", "gpu"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0.934,
			"explanation": "Recent increases in cryptocurrency mining profitability have led to a surge in demand for GPUs, causing a shortage in supply and a significant increase in prices.",
			"indices": [[5, 302], [720, 1302], [2501, 2629]]
		},
		"economicThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"ecologicalThreat": {
			"topicLikelihood": 0.234,
			"explanation": "The increased demand for GPUs has led to a surge in production, which has a potential to cause environmental damage.",
			"indices": [[202, 720]]
		}
	}
}
```

Please check the validity of the returned JSON object and its contents before submitting. Thank you for your assistance:^)!

# Examples

## Example 1

Q: "The recent surge in demand for GPUs has led to a shortage in supply. This has caused a significant increase in prices and has a potential to disrupt the market."

A: 
```json
{
	"keywords": ["supply shortage", "price increases", "market disruption"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0.934,
			"explanation": "Higher demand for GPUs has led to a shortage in supply",
			"indices": [[0, 68]]
		},
		"economicThreat": {
			"topicLikelihood": 0.828,
			"explanation": "Price increases can disrupt the market",
			"indices": [[87, 159]]
		},
		"ecologicalThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		}
	}
}
```

## Example 2

Q: "Planning for a Summer vacation? Here are top 10 destinations to visit in 2024! While some of these may be a bit random and remote, they are definitely worth the trip."

A: 
```json
{
	"keywords": ["summer vacation", "advertisement"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"economicThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"ecologicalThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		}
	}
}
```

# Prompt

Q: """

# similar to prompt_4 but broken down into user and assistant messages
prompt_5 = """# Task Description

You are a skilled analyst tasked with discovering potential threats within article text. Please analyze a given text and return results in JSON format. The JSON object can contain up to 5 keywords. The topics are fixed to "supplyChainThreat" (potential supply chain threats and issues), "economicThreat" (threats to the economy as a whole), and "ecologicalThreat" (threats to the environment, natural disasters, ...). Each topic should contain a "topicLikelihood" and a list of "indices", if any parts of the text fit that topic. Return empty list if none.

Counting indices occurs as follows:
"Counting example" -> "C" (0), "o" (1), "u" (2), "n" (3), "t" (4), "i" (5), "n" (6), "g" (7), " " (8), "e" (9), "x" (10), "a" (11), "m" (12), "p" (13), "l" (14), "e" (15)
Indices of "Counting": [0, 7]
Indices of "example": [9, 15]

When you find occurrence of a topic (e.g. one that could be classified as a "supplyChainThreat"), you should return the indices of the first and last character of the word in the text. If the word occurs multiple times, return all occurrences that match the topic. If the topic does not occur, return an empty list. Topics can also overlap.

Example response (JSON object):
```json
{
	"keywords": ["technology", "crypto", "gpu"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0.934,
			"explanation": "Recent increases in cryptocurrency mining profitability have led to a surge in demand for GPUs, causing a shortage in supply and a significant increase in prices.",
			"indices": [[5, 302], [720, 1302], [2501, 2629]]
		},
		"economicThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"ecologicalThreat": {
			"topicLikelihood": 0.234,
			"explanation": "The increased demand for GPUs has led to a surge in production, which has a potential to cause environmental damage.",
			"indices": [[202, 720]]
		}
	}
}
```

Please check the validity of the returned JSON object and its contents before submitting. Thank you for your assistance:^)!
"""

prompt_5_user_1 = "The recent surge in demand for GPUs has led to a shortage in supply. This has caused a significant increase in prices and has a potential to disrupt the market."

prompt_5_assistant_1 = """```json
{
	"keywords": ["supply shortage", "price increases", "market disruption"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0.934,
			"explanation": "Higher demand for GPUs has led to a shortage in supply",
			"indices": [[0, 68]]
		},
		"economicThreat": {
			"topicLikelihood": 0.828,
			"explanation": "Price increases can disrupt the market",
			"indices": [[87, 159]]
		},
		"ecologicalThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		}
	}
}
```"""

prompt_5_user_2 = "Planning for a Summer vacation? Here are top 10 destinations to visit in 2024! While some of these may be a bit random and remote, they are definitely worth the trip."

prompt_5_assistant_2 = """```json
{
	"keywords": ["summer vacation", "advertisement"],
	"topics": {
		"supplyChainThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"economicThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		},
		"ecologicalThreat": {
			"topicLikelihood": 0,
			"explanation": "",
			"indices": []
		}
	}
}
```"""

# Returns substrings rather than indices - more accurate
prompt_6 = """Can you please list all substrings occurring within the below text, which could be categorized as a potential supply chain threat? Do not describe them, just list the substrings contained within a text. If the text is "Last Friday night was very eventful. A ship got stuck in a canal which could delay shipments of goods", then the list should include a single entry "A ship got stuck in a canal which could delay shipments of goods". Do similar analysis for economic threats and ecological threats (like natural disasters and such). Up to 5 keywords can be returned. If no occurrences are found return an empty list for said topic. The same substrings can belong to multiple topics (can overlap). The return format should be JSON.

Example JSON response:
```json
{
	"keywords": ["ship stuck", "shipment delay"],
	"topics": {
		"supplyChainThreat": {
			"likelihood": 0.932,
			"explanation": "Stuck ship could delay shipments.",
			"occurrences": ["A ship got stuck in a canal which could delay shipments of goods"]
		},
		"economicThreat": {
			"likelihood": 0.269,
			"explanation": "",
			"occurrences": []
		},
		"ecologicalThreat": {
			"likelihood": 0.469,
			"explanation": "A ship got stuck which could cause ecological damage if it spills oil.",
			"occurrences": ["A ship got stuck in a canal"]
		}
	}
}
```

Text:

"""

prompt_7 = """Can you please list all substrings occurring within the below text, which could be categorized as a potential supply chain threat? Do not describe them, just list the substrings contained within a text. If the text is "Last Friday night was very eventful. A ship got stuck in a canal which could delay shipments of goods", then the list should include a single entry "A ship got stuck in a canal which could delay shipments of goods". Do similar analysis for economic threats and ecological threats (like natural disasters and such). Up to 5 keywords can be returned. If no occurrences are found return an empty list for said topic. The same substrings can belong to multiple topics (can overlap). Also give an estimate of likelihood for each topic being present in the text and a short explanation if topic is present (otherwise set explanation to ""). Topics can have any number of substring occurrences, spanning the entire text. The return format should be JSON.

Example JSON response:
```json
{
	"keywords": ["ship stuck", "shipment delay"],
	"topics": {
		"supplyChainThreat": {
			"likelihood": 0.932,
			"explanation": "Stuck ship could delay shipments.",
			"occurrences": ["A ship got stuck in a canal which could delay shipments of goods"]
		},
		"economicThreat": {
			"likelihood": 0.269,
			"explanation": "",
			"occurrences": []
		},
		"ecologicalThreat": {
			"likelihood": 0.469,
			"explanation": "A ship got stuck which could cause ecological damage if it spills oil.",
			"occurrences": ["A ship got stuck in a canal"]
		}
	}
}
```

Text:
"""

prompt_8 = """Can you please list all substrings occurring within the below text, which could be categorized as a potential supply chain threat? Do not describe them, just list all the substrings contained within the text. If the text is "Last Friday night was very eventful. A ship got stuck in a canal which could delay shipments of goods", then the list should include a single entry "A ship got stuck in a canal which could delay shipments of goods". Do similar analysis for economic threats and ecological threats (like natural disasters and such). If no occurrences are found return an empty list for said topic. The same substrings can belong to multiple topics (can overlap). Topics can have any number of substring occurrences, spanning the entire text. Also return a reasonable number a keywords. Also give an estimate of likelihood for each topic being present in the text and a short explanation if topic is present (otherwise set explanation to ""). The return format should be JSON.

Example JSON response:
```json
{
	"keywords": ["ship stuck", "shipment delay"],
	"topics": {
		"supplyChainThreat": {
			"likelihood": 0.932,
			"explanation": "Stuck ship could delay shipments.",
			"occurrences": ["A ship got stuck in a canal which could delay shipments of goods"]
		},
		"economicThreat": {
			"likelihood": 0.269,
			"explanation": "",
			"occurrences": []
		},
		"ecologicalThreat": {
			"likelihood": 0.469,
			"explanation": "A ship got stuck which could cause ecological damage if it spills oil.",
			"occurrences": ["A ship got stuck in a canal"]
		}
	}
}
```

Text:
"""

prompt_9 = """Can you please list all substrings occurring within the below text, which could be categorized as a potential supply chain threat? Do not describe them, just list all the substrings contained within the text. If the text is "Last Friday night was very eventful. A ship got stuck in a canal which could delay shipments of goods", then the list should include a single entry "A ship got stuck in a canal which could delay shipments of goods". Do similar analysis for economic threats and ecological threats (like natural disasters, extreme weather events, and such). If no occurrences are found, return an empty list for said topic. Parts of substrings can belong to multiple topics (can overlap). Topics can have any number of substring occurrences, spanning the entire text, not just the first part. Also return a reasonable number a keywords. Also give an estimated likelihood for each topic being present in the text and a short explanation if topic is present (otherwise set explanation to ""). The return format should be JSON.

Example JSON response:
```json
{
	"keywords": ["ship stuck", "shipment delay"],
	"topics": {
		"supplyChainThreat": {
			"likelihood": 0.932,
			"explanation": "Stuck ship could delay shipments.",
			"occurrences": ["A ship got stuck in a canal which could delay shipments of goods"]
		},
		"economicThreat": {
			"likelihood": 0.269,
			"explanation": "",
			"occurrences": []
		},
		"ecologicalThreat": {
			"likelihood": 0.469,
			"explanation": "A ship got stuck which could cause ecological damage if it spills oil.",
			"occurrences": ["A ship got stuck in a canal"]
		}
	}
}
```

Text:
"""

article_1 = """Title: Navigating the Global Semiconductor Shortage: Supply Chain Challenges and Solutions

As the world becomes increasingly reliant on technology, the global semiconductor shortage presents a critical challenge for supply chains across various industries. This shortage, stemming from a combination of factors including increased demand for electronics, supply chain disruptions, and geopolitical tensions, has led to significant repercussions for businesses worldwide.

One of the primary issues arising from the semiconductor shortage is the disruption of production schedules. Companies reliant on semiconductors for their products, such as automotive manufacturers and consumer electronics companies, are facing delays in production and delivery. This not only impacts their bottom line but also affects consumer access to essential goods and services.

Moreover, the semiconductor shortage highlights vulnerabilities in global supply chains. Many companies rely on a limited number of suppliers, often located in regions prone to geopolitical tensions or natural disasters. This concentration of suppliers amplifies the impact of disruptions, as seen with the semiconductor shortage.

To address these challenges, companies are exploring various strategies to diversify their supply chains. This includes investing in local manufacturing facilities, collaborating with multiple suppliers, and adopting advanced technologies like artificial intelligence to optimize inventory management and production processes.

In conclusion, the global semiconductor shortage underscores the importance of resilient and flexible supply chains. By proactively addressing these challenges and implementing strategic solutions, businesses can mitigate the impact of future disruptions and ensure a more stable supply of essential components."""

article_2 = """Title: The Rise of Telemedicine: Transforming Healthcare Delivery in the Digital Age

Telemedicine, once a niche concept, has rapidly emerged as a cornerstone of modern healthcare delivery, revolutionizing patient care and access to medical services. With advancements in technology and changing patient preferences, telemedicine is reshaping the healthcare landscape, offering numerous benefits for both patients and providers.

One of the key advantages of telemedicine is its ability to enhance accessibility to healthcare services, particularly in underserved or remote areas. Patients no longer need to travel long distances to see a healthcare provider; instead, they can consult with a physician remotely via video conferencing or telehealth platforms. This not only saves time and money but also improves healthcare outcomes by facilitating timely interventions and follow-ups.

Moreover, telemedicine has proven to be a valuable tool in improving patient engagement and adherence to treatment plans. Through remote monitoring devices and teleconsultations, patients can actively participate in their healthcare journey, leading to better management of chronic conditions and overall wellness.

Furthermore, telemedicine has played a crucial role in healthcare delivery during the COVID-19 pandemic, enabling continuity of care while minimizing the risk of viral transmission. Healthcare providers quickly adopted telemedicine solutions to offer virtual consultations, triage patients, and deliver mental health services remotely, ensuring that patients received necessary care despite lockdowns and social distancing measures.

As telemedicine continues to evolve, it presents new opportunities for innovation and collaboration within the healthcare industry. From artificial intelligence-driven diagnostics to virtual reality-enabled therapies, the possibilities for leveraging technology to improve patient care are endless.

In conclusion, telemedicine represents a transformative shift in healthcare delivery, offering enhanced accessibility, patient engagement, and innovation. As technology continues to advance, telemedicine will play an increasingly vital role in shaping the future of healthcare, providing patients with convenient and effective ways to access quality medical services."""




In [None]:
# completion = client.chat.completions.create(
# 	messages=[
# 		{
# 				"role": "user",
# 				"content": "Say this is a test",
# 		}
# 	],
# 	model="gpt-3.5-turbo",
# )
# response = completion.choices[0].message.content
# print(f"Response:\n'{response}'")

In [None]:
def get_response(messages: list[dict[str, str]], model: str = "gpt-3.5-turbo", temperature: float = 0.2, top_p: float = 0.1) -> str:
	'''
		Sends the chat messages to the module-level client and returns the message content of the first choice.
		The defaults (temperature 0.2, top_p 0.1) are chosen to get more consistent results between runs.
	'''
	request_options = {
		"model": model,
		"temperature": temperature,
		"top_p": top_p,
	}
	completion = client.chat.completions.create(messages=messages, **request_options) # type: ignore
	first_choice = completion.choices[0]
	return first_choice.message.content # type: ignore

def get_messages(prompt: str, article: str) -> list[dict[str, str]]:
	'''
		Builds a two-message chat payload: prompt as the "system" message, article as the "user" message
	'''
	system_message = {"role": "system", "content": prompt}
	user_message = {"role": "user", "content": article}
	return [system_message, user_message]

# messages = get_messages(prompt_1, article_1)
# response = get_response(messages)
# print(f"Response:\n'{response}'")

In [None]:
# response_json = json.loads(response)
# print(f"Response JSON:\n{json.dumps(response_json, indent=2)}")

In [None]:
def print_results(article_text: str, result_json: dict):
	'''
		Prints the keywords, then for every topic its likelihood and the text of each indexed span.
		Each entry of a topic's "indices" is read as [start, end] with an inclusive end offset into article_text.
	'''
	print(f"Keywords: {result_json['keywords']}")
	topics = result_json['topics']
	for topic_name in topics:
		details = topics[topic_name]
		print(f"Topic: {topic_name}")
		print(f"Likelihood: {details['topicLikelihood']}")
		spans = details['indices']
		print(f"Indices: ({len(spans)}):")
		for span in spans:
			first, last = span[0], span[1]
			# the stored end offset is inclusive, so extend the slice by one
			print(f"- '{article_text[first:last + 1]}'")

# print_results(article_1, response_json)

In [None]:
# Run configuration: pick the article, the conversation, the model and the
# sampling settings, then send a single request and show the raw reply.

chosen_article = article_1
# chosen_article = article_2

# Alternative conversations (commented out):
# messages = get_messages(prompt_4, chosen_article)
# messages = [
# 	{"role": "system", "content": prompt_5},
# 	{"role": "user", "content": prompt_5_user_1},
# 	{"role": "assistant", "content": prompt_5_assistant_1},
# 	{"role": "user", "content": prompt_5_user_2},
# 	{"role": "assistant", "content": prompt_5_assistant_2},
# 	{"role": "user", "content": chosen_article},
# ]
# messages = get_messages(prompt_6, chosen_article)
messages = get_messages(prompt_8, chosen_article)

# Alternative models (commented out):
# model = "gpt-3.5-turbo"
# model = "gpt-4"
model = "gpt-4-1106-preview"

temperature, top_p = 0.2, 0.1

response = get_response(messages, model=model, temperature=temperature, top_p=top_p)
print(f"Response:\n'{response}'")

In [None]:
chat_gpt_response = """{
	"keywords": ["semiconductor shortage", "supply chain disruptions", "geopolitical tensions", "natural disasters", "production delays"],
	"topics": {
		"supplyChainThreat": {
			"likelihood": 0.893,
			"explanation": "Semiconductor shortage causing disruptions in global supply chains.",
			"occurrences": ["global semiconductor shortage presents a critical challenge for supply chains", "One of the primary issues arising from the semiconductor shortage is the disruption of production schedules", "the semiconductor shortage highlights vulnerabilities in global supply chains", "This concentration of suppliers amplifies the impact of disruptions, as seen with the semiconductor shortage"]
		},
		"economicThreat": {
			"likelihood": 0.732,
			"explanation": "Production delays affecting businesses worldwide.",
			"occurrences": ["This shortage, stemming from a combination of factors including increased demand for electronics, supply chain disruptions, and geopolitical tensions, has led to significant repercussions for businesses worldwide", "Companies reliant on semiconductors for their products, such as automotive manufacturers and consumer electronics companies, are facing delays in production and delivery"]
		},
		"ecologicalThreat": {
			"likelihood": 0.127,
			"explanation": "",
			"occurrences": []
		}
	}
}
"""

chat_gpt_response = """{
	"keywords": ["semiconductor shortage", "supply chain disruptions", "production delays", "geopolitical tensions", "natural disasters"],
	"topics": {
		"supplyChainThreat": {
			"likelihood": 0.932,
			"explanation": "Semiconductor shortage causing disruptions in production schedules.",
			"occurrences": ["global semiconductor shortage presents a critical challenge for supply chains", "disruption of production schedules", "companies reliant on semiconductors for their products, such as automotive manufacturers and consumer electronics companies, are facing delays in production and delivery", "vulnerabilities in global supply chains", "companies rely on a limited number of suppliers, often located in regions prone to geopolitical tensions or natural disasters"]
		},
		"economicThreat": {
			"likelihood": 0.269,
			"explanation": "Semiconductor shortage leading to delays impacting businesses worldwide.",
			"occurrences": ["Companies reliant on semiconductors for their products, such as automotive manufacturers and consumer electronics companies, are facing delays in production and delivery"]
		},
		"ecologicalThreat": {
			"likelihood": 0.469,
			"explanation": "Natural disasters could exacerbate supply chain vulnerabilities.",
			"occurrences": []
		}
	}
}
"""

chat_gpt_response_json = json.loads(chat_gpt_response)

In [None]:
def get_indices_from_occurrences(article_text: str, result_json: dict) -> dict:
	'''
		Returns a dictionary with the same structure as result_json, but with indices instead of occurrences

		Every occurrence string is searched case-insensitively in article_text (first match only) and
		stored as an inclusive [start, end] pair of character offsets into the original text.
		Per topic, "likelihood" is copied to "topicLikelihood" and "explanation" is copied unchanged.
		Occurrences that are empty or cannot be found are reported with print and skipped.
	'''
	import re # local import so the function does not depend on the notebook's import cell

	indices_json = {"keywords": result_json["keywords"], "topics": {}}
	for topic, topic_data in result_json["topics"].items():
		topic_indices = []
		indices_json["topics"][topic] = {
			"topicLikelihood": topic_data["likelihood"],
			"explanation": topic_data["explanation"],
			"indices": topic_indices,
		}
		for occurrence in topic_data["occurrences"]:
			if not occurrence:
				# an empty string "matches" at offset 0 and would yield the bogus span [0, -1]
				print(f"Skipping empty occurrence for topic '{topic}'")
				continue
			# Search the original text rather than a lower()-ed copy: lower() can change the
			# length of the string (e.g. "İ" becomes two code points), which would shift all
			# following offsets relative to article_text.
			match = re.search(re.escape(occurrence), article_text, flags=re.IGNORECASE)
			if match is None:
				print(f"Could not find occurrence '{occurrence}' in article text for topic '{topic}'")
				continue
			# end offset is inclusive
			topic_indices.append([match.start(), match.end() - 1])
	return indices_json

# response_json = get_indices_from_occurrences(chosen_article, chat_gpt_response_json)
# The reply may be wrapped in a Markdown code fence; the fence markers are
# removed before the text is parsed as JSON.
response_json = get_indices_from_occurrences(
	chosen_article,
	json.loads(response.replace("```json", "").replace("```", "").strip()),
)
print(f"Response JSON:\n{json.dumps(response_json, indent=2)}")

In [None]:
# Here you can override response_json with a previously recorded response

# response_json = {
#   "keywords": [
#     "global semiconductor shortage",
#     "supply chain disruptions",
#     "geopolitical tensions",
#     "disruption of production schedules",
#     "concentration of suppliers"
#   ],
#   "topics": {
#     "supplyChainThreat": {
#       "topicLikelihood": 0.95,
#       "explanation": "The text discusses various factors contributing to supply chain challenges, including the semiconductor shortage, disruptions, and supplier concentration.",
#       "indices": [
#         [153, 229],
#         [356, 379],
#         [546, 579],
#         [1086, 1147]
#       ]
#     },
#     "economicThreat": {
#       "topicLikelihood": 0.85,
#       "explanation": "The semiconductor shortage and subsequent production delays have significant repercussions for businesses, potentially affecting the broader economy.",
#       "indices": [
#         [420, 469],
#         [546, 579],
#         [767, 791]
#       ]
#     },
#     "ecologicalThreat": {
#       "topicLikelihood": 0.1,
#       "explanation": "",
#       "indices": []
#     }
#   }
# }

In [None]:
# response_json = json.loads(response.replace("```json", "").replace("```", "").strip())
# Print the keywords and, per topic, the article text selected by each index pair.
print_results(chosen_article, response_json)

In [None]:
# Presumably the file path for the HTML preview; the code that uses it is not visible here — confirm.
# path_html_preview = "data/analysis/LLM/test.html"
path_html_preview = "src/py/analysis/LLM/test.html"

def generate_highlighted_html(article_text: str, result_json: dict) -> str:
	'''
		Generates a standalone HTML preview of an article with its topic annotations highlighted.

		The page contains the keywords, the article text with one <mark> color per topic
		(overlapping topics are drawn as nested, semi-transparent marks), a color legend,
		and a per-topic list of the annotated excerpts.

		Args:
			article_text: The article exactly as it was annotated; indices refer to its characters.
			result_json: Annotation dict with a "keywords" list and a "topics" dict. Each topic
				maps to a dict with "topicLikelihood", "indices" (list of inclusive
				[start, end] character ranges) and optionally "explanation".

		Returns:
			The HTML document as a string.
	'''
	# Imported locally and by name: the notebook rebinds the global name `html` to the
	# generated string, so a module-level `import html` would not stay usable.
	from html import escape
	from itertools import combinations

	opacity = 0.5
	palette = [f"rgba(0, 185, 224, {opacity})", f"rgba(224, 203, 0, {opacity})", f"rgba(224, 0, 91, {opacity})"]
	topics = list(result_json['topics'].items())
	# Cycle through the palette so that more topics than colors cannot raise an IndexError.
	topic_colors = [palette[i % len(palette)] for i in range(len(topics))]
	# Topic names, keywords and explanations are escaped before being embedded in the markup.
	topic_labels = [escape(str(topic)) for topic, _ in topics]

	def open_mark(color: str, extra_style: str = "") -> str:
		return f"<mark style='background-color: {color};{extra_style}'>"

	def overlay(topic_positions) -> str:
		# Legend entry for a combination of topics: nested marks, so the colors blend
		# exactly like they do in the article body.
		outer, *inner = topic_positions
		opening = open_mark(topic_colors[outer], " margin-right: 20px;")
		opening += "".join(open_mark(topic_colors[k]) for k in inner)
		label = " + ".join(topic_labels[k] for k in topic_positions)
		return opening + label + "</mark>" * len(topic_positions)

	# Index the span boundaries once instead of rescanning every span for every character.
	last_index = len(article_text) - 1
	opens = {}   # character index -> colors of the spans starting there
	closes = {}  # character index -> colors of the spans ending there
	for color, (_, topic_data) in zip(topic_colors, topics):
		for span in topic_data['indices']:
			start, end = span[0], span[1]
			# An end past the text would leave its mark open for the rest of the page.
			end = min(end, last_index)
			if start < 0 or start > end:
				# Unusable span (negative, inverted or entirely outside the text).
				continue
			opens.setdefault(start, []).append(color)
			closes.setdefault(end, []).append(color)

	parts = ["<body style='background-color: #faeacd; font-family: Arial, sans-serif; padding: 1em;'>"]
	parts.append("<h1>Article</h1>")
	parts.append("<h4>" + " • ".join(escape(str(keyword)) for keyword in result_json['keywords']) + "</h4>")
	parts.append("<p>")
	active = []  # colors of the marks that are currently open, outermost first
	for i, c in enumerate(article_text):
		for color in opens.get(i, ()):
			active.append(color)
			parts.append(open_mark(color))
		parts.append(escape(c))
		ended = closes.get(i)
		if ended:
			# HTML can only close the innermost element, so partially overlapping spans
			# are handled by closing every open mark and reopening the ones that continue.
			parts.append("</mark>" * len(active))
			for color in ended:
				active.remove(color)
			parts.extend(open_mark(color) for color in active)
	# Line breaks in everything emitted so far become <br>; later sections are left as they are.
	parts = ["".join(parts).replace("\n", "<br>")]
	parts.append("</p>")

	# add legend (one colored label per topic)
	parts.append("<h1>Legend</h1>")
	parts.append("<p>")
	for label, color in zip(topic_labels, topic_colors):
		parts.append(f"<mark style='background-color: {color}; margin-right: 20px;'>{label}</mark>")
	parts.append("</p>")
	# legend combination colors (overlay them on top of each other)
	if len(topics) >= 2:
		parts.append("<p>")
		parts.extend(overlay(pair) for pair in combinations(range(len(topics)), 2))
		parts.append("</p>")
	if len(topics) >= 3:
		parts.append("<p>")
		parts.append(overlay(range(len(topics))))
		parts.append("</p>")

	parts.append("<h1>Topics</h1>")
	for (_, topic_data), label, color in zip(topics, topic_labels, topic_colors):
		spans = topic_data['indices']
		likelihood = escape(str(topic_data['topicLikelihood']))
		parts.append(f"<h2><mark style='background-color: {color};'>{label}</mark> ({likelihood} likelihood, {len(spans)} occurrences)</h2>")
		if spans and "explanation" in topic_data:
			parts.append(f"<p><b>Explanation:</b> {escape(str(topic_data['explanation']))}</p>")
		parts.append("<ul>")
		for span in spans:
			start, end = span[0], span[1]
			parts.append(f"<li><p><b>[{start}:{end + 1}]</b> {escape(article_text[start:end + 1])}</p></li>")
		parts.append("</ul>")
	parts.append("</body>")
	return "".join(parts)

html = generate_highlighted_html(chosen_article, response_json)
html_print = html.replace("<", "\n<")
print(f"HTML:\n{html_print}")
# The page declares no charset and open() would otherwise use the locale's encoding,
# so write plain ASCII and turn every other character into a numeric character
# reference (&#NNNN;). The file then renders the same in any browser and the write
# cannot fail with UnicodeEncodeError.
with open(path_html_preview, "w", encoding="ascii", errors="xmlcharrefreplace") as f:
	f.write(html)