In [None]:
import os; os.chdir('../')
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from rouge_score import rouge_scorer
from utils.config import load_config
from ast import literal_eval as le
from nltk.translate import meteor
from nltk import word_tokenize
from tqdm import tqdm
import nltk
# nltk.download('punkt')
# nltk.download('punkt_tab')
from src.openai_evaluator import GPT4Accuracy, GPT4SingleValue
from dotenv import load_dotenv; load_dotenv('.env')
import re

In [None]:
# Load the settings stored in config/climate.yaml. The path is relative to the
# working directory, which the import cell moves one level up with
# os.chdir('../'). Later cells read cfg['date_template'] and
# cfg['text_template'] to label each forecast day.
cfg = load_config('config/climate.yaml')

In [None]:
def getRMSEScore(pred_values, fut_values):
    """Return the root-mean-square error between two collections of numbers.

    Each input is flattened to one dimension and cast to float64 before the
    element-wise difference is taken, so nested lists and multi-dimensional
    arrays are accepted.

    Args:
        pred_values: Array-like of predicted values.
        fut_values: Array-like of reference (future) values.

    Returns:
        The RMSE as a NumPy float64 scalar.
    """
    def _as_flat_float64(values):
        # Flatten first, then cast to float64.
        return np.array(np.reshape(values, -1), dtype=np.float64)

    residuals = _as_flat_float64(pred_values) - _as_flat_float64(fut_values)
    return np.sqrt(np.mean(np.square(residuals)))

def getMeteorScore(outputs, pred_outputs):
    """Compute the mean METEOR score of predictions against their references.

    Args:
        outputs: Reference texts, one per example (must support ``len``).
        pred_outputs: Predicted texts, aligned with ``outputs``.

    Returns:
        The mean of the per-pair METEOR scores.
    """
    paired_texts = tqdm(zip(outputs, pred_outputs),
                        total=len(outputs),
                        desc="Calculating METEOR scores")

    scores = []
    for reference, hypothesis in paired_texts:
        # Every pair has exactly one reference, passed as a one-element list
        # of token lists, and one tokenized hypothesis.
        reference_tokens = word_tokenize(reference)
        hypothesis_tokens = word_tokenize(hypothesis)
        scores.append(meteor([reference_tokens], hypothesis_tokens))

    return np.mean(scores)

cos_model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
def getCosineSimilarity(outputs, pred_outputs):
    """Compute the mean cosine similarity between paired text embeddings.

    Both collections are embedded with the module-level ``cos_model``; the
    i-th reference embedding is then compared with the i-th prediction
    embedding.

    Args:
        outputs: Reference texts, one per example (must support ``len``).
        pred_outputs: Predicted texts, aligned with ``outputs``.

    Returns:
        The mean of the per-pair cosine similarities.
    """
    reference_embeddings = cos_model.encode(outputs)
    predicted_embeddings = cos_model.encode(pred_outputs)
    embedding_pairs = tqdm(zip(reference_embeddings, predicted_embeddings),
                           total=len(outputs),
                           desc="Calculating Cosine Similarities")

    cos_sims = []
    for reference_vec, predicted_vec in embedding_pairs:
        cos_sims.append(cos_sim(reference_vec, predicted_vec))
    return np.mean(cos_sims)

def getROUGEScore(outputs, pred_outputs):
    """Compute mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over text pairs.

    Args:
        outputs: Reference texts, one per example.
        pred_outputs: Predicted texts, aligned with ``outputs``.

    Returns:
        A 3-tuple ``(mean_rouge1, mean_rouge2, mean_rougeL)`` of F-measures
        averaged over all pairs.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

    # One list of per-pair F-measures for each ROUGE variant.
    fmeasures = {rouge_type: [] for rouge_type in rouge_types}
    for reference, prediction in zip(outputs, pred_outputs):
        pair_scores = scorer.score(reference, prediction)
        for rouge_type in rouge_types:
            fmeasures[rouge_type].append(pair_scores[rouge_type].fmeasure)

    return tuple(np.mean(fmeasures[rouge_type]) for rouge_type in rouge_types)

def split_forecast_by_day(text, output_window):
    """Split a multi-day forecast string into one stripped chunk per day.

    A day starts at each ``day_<n>_date:`` marker and runs up to the next
    marker (or the end of the text). Anything before the first marker is
    discarded.

    Args:
        text: Forecast text containing zero or more day markers.
        output_window: Maximum number of day chunks to return.

    Returns:
        A list of at most ``output_window`` strings, each beginning with its
        ``day_<n>_date:`` marker and stripped of surrounding whitespace.
    """
    # Because the pattern has a capturing group, re.split keeps the markers:
    # the result alternates [preamble, marker, body, marker, body, ...].
    pieces = re.split(r'(day_\d+_date:)', text)
    markers = pieces[1::2]
    bodies = pieces[2::2]

    day_chunks = [(marker + body).strip() for marker, body in zip(markers, bodies)]
    return day_chunks[:output_window]

In [None]:
# Evaluator used by the cells below to submit the GPT scoring batches and to
# parse their results. The commented-out line is an alternative evaluator.
# evaluator = GPT4Accuracy(description="multimodal_medical_bge_2")
evaluator = GPT4SingleValue(description="multimodal_climate_single")

# Window sizes to evaluate; each value is used as both the input and the
# output window length.
all_windows = list(range(1, 8))

In [None]:
# Maps each window size to the id returned when its GPT evaluation batch was
# submitted; a later cell uses it to fetch the results.
batch_object_map = {}
root_dir = "results_climate/hybrid_stage_3"


def _parse_float_list(raw):
    """Turn a bracketed, whitespace-separated number string into a list of floats."""
    # presumably the CSV stores the printed form of a 1-D NumPy array
    # (e.g. "[1. 2. 3.]") -- confirm against the code that writes the CSV.
    return np.fromstring(raw.strip('[]'), sep=' ').tolist()


# Column -> parser for the stringified CSV columns, in conversion order.
column_parsers = {
    'input_times': _parse_float_list,
    'output_times': _parse_float_list,
    'input_dates': le,
    'output_dates': le,
    'input_texts': le,
    'output_texts': le,
    'pred_times': _parse_float_list,
}

for w in all_windows:
    # The same window length is used for the input and the output side.
    input_window = output_window = w

    df = pd.read_csv(f"{root_dir}/{input_window}_{output_window}/hybrid_results.csv")
    for column, parser in column_parsers.items():
        df[column] = df[column].apply(parser)

    # format ground truth text
    # Output days are numbered after the input days, i.e. starting at
    # input_window + 1, using the date/text label templates from the config.
    output_texts = []
    for day_texts, day_dates in zip(df['output_texts'], df['output_dates']):
        day_blocks = []
        for offset in range(output_window):
            day_index = input_window + 1 + offset
            date_label = cfg['date_template'].format(index=day_index)
            text_label = cfg['text_template'].format(index=day_index)
            day_blocks.append(f"{date_label}: {day_dates[offset]}\n{text_label}: {day_texts[offset]}")
        output_texts.append('\n'.join(day_blocks))

    df['output_texts'] = output_texts

    # Break predictions and references into per-day chunks and pair them up.
    # zip stops at the shorter list, so days missing on either side are skipped.
    pred_texts = [split_forecast_by_day(pred_text, w) for pred_text in df['pred_texts']]
    output_texts = [split_forecast_by_day(text, w) for text in output_texts]
    split_data = [
        {'pred_text': pred_day, 'output_text': output_day}
        for pred_days, output_days in zip(pred_texts, output_texts)
        for pred_day, output_day in zip(pred_days, output_days)
    ]

    df_evaluation = pd.DataFrame(split_data)
    reference_days = df_evaluation['output_text']
    predicted_days = df_evaluation['pred_text']

    # getRMSEScore squares the difference, so passing the reference series
    # first and the prediction second does not change the result.
    rmse_score = getRMSEScore(np.stack(np.array(df['output_times'])), np.stack(np.array(df['pred_times'])))
    cosine_score = getCosineSimilarity(reference_days, predicted_days)
    meteor_score = getMeteorScore(reference_days, predicted_days)
    # NOTE: the third value returned by getROUGEScore is ROUGE-L, despite the
    # variable being called rouge_n_score.
    rouge_1_score, rouge_2_score, rouge_n_score = getROUGEScore(reference_days, predicted_days)

    print(f"-------{w}----------")
    for metric_value in (rmse_score, cosine_score, meteor_score):
        print(round(metric_value, 3))
    print(*(round(rouge_value, 3) for rouge_value in (rouge_1_score, rouge_2_score, rouge_n_score)))

    # Submit the per-day pairs for GPT-based scoring and remember the batch id.
    jsonl_path = f"{root_dir}/{input_window}_{output_window}/accuracy_evaluator.jsonl"
    batch_object_id = evaluator.create_and_run_batch_job(df_evaluation, jsonl_path)
    batch_object_map[w] = batch_object_id

In [None]:
# Show status, request counts and id for the first len(all_windows) entries of
# the client's batch listing.
# NOTE(review): presumably these are the batches submitted in the previous
# cell -- this relies on the listing order, confirm before trusting it.
listed_batches = evaluator.client.batches.list().data[:len(all_windows)]
[(batch.status, batch.request_counts, batch.id) for batch in listed_batches]

In [None]:
def calculate_single_value(parsed_outputs):
    """Average the first numeric score found in each evaluator response.

    The first number in each response is taken as its score. Decimal scores
    are kept intact (``"4.5"`` counts as 4.5, not 4). A response that
    contains no number contributes a score of 0.

    Args:
        parsed_outputs: Iterable of response strings.

    Returns:
        The mean score over all responses as a NumPy float.
    """
    # An integer with an optional fractional part. A bare r'\d+' would stop at
    # the decimal point and silently truncate "0.5" to 0 or "7.5" to 7.
    number_pattern = re.compile(r'\d+(?:\.\d+)?')

    gpt_scores = []
    for output in parsed_outputs:
        match = number_pattern.search(output)
        # Responses without any number are scored as 0 rather than skipped.
        gpt_scores.append(float(match.group()) if match else 0)

    return np.mean(gpt_scores)

In [None]:
# Fetch the parsed GPT evaluation results of every window and print its scores.
# The loop variable is named `window` rather than `id`: at notebook top level
# `id` would replace the built-in id() for every cell that runs afterwards.
for window in all_windows:
    output_path = f"{root_dir}/{window}_{window}/accuracy_evaluator.txt"
    # batch_object_map was filled when the batches were submitted, so that
    # cell must have run in this kernel session.
    parsed_outputs = evaluator.check_status_and_parse(batch_object_map[window], output_path)

    precision, recall, f1 = evaluator.calculate_metrics(parsed_outputs)
    print(f'--------{window}----------')
    print(round(precision, 3))
    print(round(recall, 3))
    print(round(f1, 3))

    # Mean of the first number found in each response.
    single_score = calculate_single_value(parsed_outputs)
    print(round(single_score, 3))