In [None]:
import json
import os
from datetime import datetime
from typing import Union, List

import pandas as pd
from openai import OpenAI
from openai.types.chat import ChatCompletion

In [None]:
# Credentials and identifiers for the OpenAI client are taken from the
# environment; any variable that is not set yields None.
OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')
ORGANISATION_ID = os.getenv('ORGANISATION_ID')
PROJECT_ID = os.getenv('PROJECT_ID')

In [None]:
def get_filename_without_extension(file_path: str) -> str:
    """Return the last component of ``file_path`` with its final extension removed.

    Only the last suffix is stripped (``a.tar.gz`` -> ``a.tar``); a path that
    ends with a separator has an empty last component and yields ``""``.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem

In [None]:
# Chat model requested from the API (alternative noted by the author: gpt-4o).
MODEL = 'gpt-3.5-turbo'

# Base directory from which the resource and data paths below are built.
HOME_DIR = '/usr/src/app'

# System prompt file sent with every request.
MESSAGES_DIR = f'{HOME_DIR}/resources/system_messages'
SYSTEM_MESSAGE_PATH = f'{MESSAGES_DIR}/SYSTEM_MESSAGE_2.txt'

# Input spreadsheet.
EXCEL_PATH = f'{HOME_DIR}/data/sample_data.xlsx'

# Keys every model reply is checked for; each one becomes an output column.
REQUIRED_KEYS = [
    "is_news",
    "is_in_country",
    "is_in_risk_category",
    "is_commodity",
    "summary",
]

# Output workbook name combines the model name (dashes replaced by underscores)
# with the system-message file name, e.g. gpt_3.5_turbo_SYSTEM_MESSAGE_2.xlsx.
OUTPUT_FILEPATH = f'{HOME_DIR}/data/nlp/{MODEL.replace("-", "_")}_{get_filename_without_extension(SYSTEM_MESSAGE_PATH)}.xlsx'

In [None]:
# Single OpenAI client shared by the notebook; get_completion() reads it as a global.
# The three values come from environment variables and are None when unset.
client = OpenAI(
    api_key=OPEN_AI_KEY, 
    organization=ORGANISATION_ID, 
    project=PROJECT_ID
)

In [None]:
def get_completion(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    model: str = "gpt-4o",
    temperature: float = 0.3,
    top_p: Union[float, int] = 1,
    response_format_type: str = 'json_object',
) -> "ChatCompletion":
    """Send one system + user message pair to the chat completions endpoint.

    Uses the module-level ``client``.

    Args:
        prompt: Content of the ``user`` message.
        system_message: Content of the ``system`` message.
        model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        response_format_type: Value used for ``response_format["type"]``.

    Returns:
        The response object returned by ``client.chat.completions.create``,
        unmodified. It is NOT the reply text: callers read ``choices``,
        ``model``, ``created`` and ``usage`` from it themselves.
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]

    return client.chat.completions.create(
        model=model,
        temperature=temperature,
        top_p=top_p,
        response_format={"type": response_format_type},
        messages=conversation,
    )

In [None]:
def convert_unixtime_to_datetime(unixtime: Union[int, float]) -> str:
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` string.

    The timestamp is converted with ``datetime.fromtimestamp`` without a ``tz``
    argument, so the result is expressed in the machine's local time zone.
    The format has no sub-second field, so fractional seconds are dropped.
    """
    moment = datetime.fromtimestamp(unixtime)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
def read_system_message(system_message_path: str) -> str:
    """Read the whole system-message file and return its text.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding, so a prompt containing non-ASCII characters is read the
    same way on every machine.

    Args:
        system_message_path: Path of the text file to read.

    Returns:
        The file content as a single string, unmodified (no stripping).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(system_message_path, 'r', encoding='utf-8') as file:
        return file.read()

In [None]:
def _parse_json_content(content):
    """Decode the model reply as JSON; return ``(value, ok)`` instead of raising."""
    try:
        return json.loads(content), True
    except (TypeError, ValueError):
        # TypeError: content is None; ValueError: invalid or truncated JSON.
        return content, False


def perform_nlp_part(web_scrapping_results: pd.DataFrame, 
                     system_message:str, 
                     model:str, 
                     required_keys: List[str], 
                     temperature: float = 0.3, 
                     top_p: Union[float, int] = 1, 
                     response_format_type: str = 'json_object') -> List[list]:
    """Query the model once per input row and collect the answers as flat rows.

    For each row a JSON prompt with the keys ``country``, ``commodity`` and
    ``text`` (taken from the ``country``, ``commodity`` and
    ``article_clean_text`` columns) is sent as the user message, together with
    ``system_message`` as the system message.

    Args:
        web_scrapping_results: Input frame; must have the three columns above.
        system_message: System prompt sent with every request.
        model: Model name forwarded to ``get_completion``.
        required_keys: Keys expected in the JSON object the model returns.
        temperature: Forwarded to ``get_completion``.
        top_p: Forwarded to ``get_completion``.
        response_format_type: Forwarded to ``get_completion``.

    Returns:
        One list per input row, in input order, laid out as:
        one value per entry of ``required_keys``, then ``bad_response``, then
        finish reason, model, creation time, prompt tokens, completion tokens
        and total tokens.

        * finish reason ``"stop"``: each required key holds the model's value,
          or ``'key_not_present'`` when it is missing; ``bad_response`` holds
          the full reply if anything was missing, else ``None``.
        * any other finish reason: required keys stay ``None`` and
          ``bad_response`` holds the reply.
        * a reply that is not valid JSON, or not a JSON object, is treated as
          having no keys; ``bad_response`` then holds the raw or parsed reply.
    """
    result = []

    for row in web_scrapping_results.itertuples(index=True, name='Pandas'):
        prompt = json.dumps({
            'country': row.country,
            'commodity': row.commodity,
            'text': row.article_clean_text
        })

        # Keyword arguments on purpose: get_completion takes the user prompt
        # first and the system message second, so passing them positionally in
        # the other order silently swaps the two roles.
        res = get_completion(
            prompt=prompt,
            system_message=system_message,
            model=model,
            temperature=temperature,
            top_p=top_p,
            response_format_type=response_format_type,
        )
        choice = res.choices[0]

        # A truncated reply is not valid JSON; keep the raw text rather than
        # aborting the whole batch.
        text_output, parsed_ok = _parse_json_content(choice.message.content)
        is_json_object = parsed_ok and isinstance(text_output, dict)

        social_risk_analysis_results = {key: None for key in required_keys}
        social_risk_analysis_results['bad_response'] = None

        if choice.finish_reason == "stop":
            something_went_wrong = not is_json_object

            for key in required_keys:
                if is_json_object and key in text_output:
                    social_risk_analysis_results[key] = text_output[key]
                else:
                    something_went_wrong = True
                    social_risk_analysis_results[key] = 'key_not_present'

            if something_went_wrong:
                social_risk_analysis_results['bad_response'] = text_output
        else:
            social_risk_analysis_results['bad_response'] = text_output

        result.append([
            *social_risk_analysis_results.values(),
            choice.finish_reason,
            res.model,
            convert_unixtime_to_datetime(res.created),
            res.usage.prompt_tokens,
            res.usage.completion_tokens,
            res.usage.total_tokens,
        ])

    return result

In [None]:
def write_results(web_scrapping_results: pd.DataFrame, 
                  nlp_results: List[list], 
                  required_keys: List[str],
                  output_filepath: str
                 ) -> pd.DataFrame: 
    """Join the NLP rows to the input rows and save the result as an Excel file.

    Rows are matched by position: the i-th entry of ``nlp_results`` is placed
    next to the i-th row of ``web_scrapping_results``, whatever index labels
    the input frame carries. If one side is shorter, the unmatched rows of the
    longer side are kept with missing values in the other side's columns.

    Args:
        web_scrapping_results: Input rows.
        nlp_results: Rows shaped like ``perform_nlp_part`` output: one value per
            required key, then ``bad_response`` and six metadata values.
        required_keys: The same key list that was used to build ``nlp_results``.
        output_filepath: Destination workbook path; missing parent directories
            are created.

    Returns:
        The combined frame that was written, with a fresh 0..n-1 index.
    """
    nlp_schema = [
        *required_keys,
        'bad_response',
        'openai_response_status', 'model', 'created',
        'prompt_token_count', 'completion_tokens', 'total_tokens',
    ]

    nlp_results_df = pd.DataFrame(nlp_results, columns=nlp_schema)

    # concat(axis=1) aligns on index labels. nlp_results_df is numbered 0..n-1,
    # so renumber the input the same way; otherwise a filtered or sliced input
    # frame would be joined to the wrong rows.
    source_rows = web_scrapping_results.reset_index(drop=True)
    web_scrap_and_nlp_df = pd.concat([source_rows, nlp_results_df], axis=1)

    # to_excel does not create directories.
    output_dir = os.path.dirname(output_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    web_scrap_and_nlp_df.to_excel(output_filepath, index=False)

    return web_scrap_and_nlp_df

In [None]:
# Load the system prompt text and the input spreadsheet used by the cells below.
sys_message = read_system_message(SYSTEM_MESSAGE_PATH)
web_scrapping_results = pd.read_excel(EXCEL_PATH)

In [None]:
%%time
# Runs the model on the first 4 input rows only (slice [:4]); %%time reports how long the cell took.
nlp_results = perform_nlp_part(web_scrapping_results[:4], sys_message, MODEL, REQUIRED_KEYS)

In [None]:
# Combine the NLP output with the input rows and save the workbook to OUTPUT_FILEPATH.
# NOTE(review): the full input frame is passed here although only the first 4 rows were
# sent to the model in the previous cell, so the remaining rows get empty NLP columns — confirm this is intended.
results_df = write_results(web_scrapping_results, nlp_results, REQUIRED_KEYS, OUTPUT_FILEPATH)