In [None]:
# import from the files in this directory
from dotenv import load_dotenv
from orchestrator import *
import requests
import os
import json
import pandas as pd
import re
import time
from experiment_utils import *

In [None]:
# Experiment switches read by the cells below.
ground_truth_Expert_df = False  # True: read the expert ground-truth CSV instead of the fetched parquet
use_past_predictions = False    # True: load predictions saved by an earlier run
predict = True                  # True: run the parsing loop; False: read predictions from CSV

# Load the .env file before building the orchestrator
# (presumably it reads credentials from the environment -- confirm).
load_dotenv()

config_path = 'config_experiment.json'  # Config with input file details
orchestrator = Orchestrator(config_path)
orchestrator.setup_data_fetcher()

In [None]:
# Performance table saved by a previous run; not referenced again in the visible cells.
performance = pd.read_csv(filepath_or_buffer='exp_output/performance.csv')

In [None]:
# Pick the input table: the fetched parquet by default, or the expert
# ground-truth CSV when the `ground_truth_Expert_df` switch is on.
if not ground_truth_Expert_df:
    fetched_data = pd.read_parquet("exp_input/fetched_data.parquet")
else:
    fetched_data = pd.read_csv("exp_ground_truth/ground_truth_Expert_df.csv")

In [None]:
# Load the predictions saved by a previous run, if requested.
if use_past_predictions:
    predict_df = pd.read_csv('exp_output/predicted_df.csv')
    past_pred = predict_df.copy()
    orchestrator.logger.info(f"Loaded {len(past_pred)} past predictions.")
    orchestrator.logger.info(f"Past predictions: {past_pred}")
else:
    # Later cells read `past_pred` unconditionally (printing its length and
    # concatenating new results onto it), so it must exist even when no past
    # predictions are loaded; otherwise a fresh-kernel run hits a NameError.
    past_pred = pd.DataFrame()

In [None]:
# Keep only the columns that are passed on to the evaluation step.
ground_truth = fetched_data.loc[:, ['publication', 'dataset_uid', 'repo_name', 'raw_html']]
print(ground_truth.shape[0])
ground_truth.head(5)

In [None]:
# Run the LLM parser over every fetched publication (predict=True) or reuse
# the predictions stored on disk (predict=False).
if predict:
    results = {}
    parsed_frames = []  # one frame per processed URL; concatenated once at the end
    parsed_data = None  # pre-bound so the error handler can always log it
    start_time = time.time()
    total_iters = len(fetched_data)  # Total number of URLs
    # Log progress about every 0.5% of the rows, but at least every row: the
    # plain floor division is 0 for fewer than 200 rows, which would make the
    # modulo below raise ZeroDivisionError.
    log_every = max(1, total_iters // 200)

    # URLs that may be skipped (already predicted) and URLs that must be
    # re-processed anyway. Built once so the per-row check is a set lookup.
    if use_past_predictions:
        done_urls = set(past_pred['source_url'])
        # `redo` is not defined in the cells visible here; a missing `redo`
        # frame is treated as "nothing to redo" instead of a NameError.
        redo_urls = set(redo['publication']) if 'redo' in globals() else set()
    else:
        done_urls, redo_urls = set(), set()

    try:
        # `position` is the 1-based row count. The index label yielded by
        # iterrows() is not used for counting because it only matches the
        # row position for a default RangeIndex.
        for position, (_, row) in enumerate(fetched_data.iterrows(), start=1):
            parsed_data = None  # do not report the previous row's data on failure
            orchestrator.current_url = row['publication']
            if orchestrator.current_url in done_urls and orchestrator.current_url not in redo_urls:
                orchestrator.logger.info(f"URL {position} has already been processed. Skipping.\n\n")
                continue

            orchestrator.logger.info(f"Processing URL {position}: {orchestrator.current_url}")
            orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(orchestrator.current_url)
            orchestrator.logger.info(f"Publisher: {orchestrator.publisher}")
            orchestrator.parser = LLMParser(orchestrator.XML_config, orchestrator.logger)
            raw_data = row['raw_html']
            parsed_data = orchestrator.parser.parse_data(
                raw_data,
                orchestrator.publisher,
                orchestrator.current_url,
                raw_data_format="full_HTML",
            )
            parsed_data['source_url'] = orchestrator.current_url
            orchestrator.logger.info(f"Parsed data extraction completed. Elements collected: {len(parsed_data)}")
            results[orchestrator.current_url] = parsed_data
            parsed_frames.append(parsed_data)

            if position % log_every == 0:
                elapsed = time.time() - start_time  # Time elapsed since start
                avg_time_per_iter = elapsed / position  # Average time per iteration
                remaining_iters = total_iters - position
                estimated_remaining = avg_time_per_iter * remaining_iters  # Estimated time remaining
                orchestrator.logger.info(
                    f"\nProgress: {position}/{total_iters} ({position/total_iters*100:.2f}%) "
                    f"| Elapsed: {time.strftime('%H:%M:%S', time.gmtime(elapsed))} "
                    f"| ETA: {time.strftime('%H:%M:%S', time.gmtime(estimated_remaining))}\n"
                )

            # Fixed pause between requests (presumably rate limiting -- confirm).
            time.sleep(2)

    except Exception as e:
        # getattr keeps the handler from raising a second error (and hiding
        # the real one) if the failure happened before these were assigned.
        failed_url = getattr(orchestrator, 'current_url', None)
        orchestrator.logger.error(f"Error processing URL {failed_url}: {e}")
        orchestrator.logger.error(f"URL: {failed_url}")
        orchestrator.logger.error(f"Publisher: {getattr(orchestrator, 'publisher', None)}")
        orchestrator.logger.error(f"Data: {parsed_data}")
        raise  # bare raise keeps the original traceback

    finally:
        # Build the frame once (instead of re-concatenating on every row) and
        # do it in `finally` so partial results survive a failed run.
        predict_df = pd.concat(parsed_frames, ignore_index=True) if parsed_frames else pd.DataFrame()

else:
    # NOTE: `results` is only created when predict is True.
    predict_df = pd.read_csv('exp_output/predicted_df.csv')

In [None]:
# Row count and column labels of the previously stored predictions.
print(past_pred.shape[0], past_pred.columns)

In [None]:
# Convert the results dictionary values to a single DataFrame.
# pd.concat raises "No objects to concatenate" on an empty input, which happens
# when every URL was skipped, so fall back to an empty frame in that case.
df_results = pd.concat(list(results.values()), ignore_index=True) if results else pd.DataFrame()
df_results.head()

In [None]:
# Number of rows predicted in this run.
df_results.shape[0]

In [None]:
# Merge this run's predictions into past_pred.
# Rows of past_pred whose source_url was predicted again in this run are dropped
# first, so re-processed URLs do not keep their stale rows and re-running this
# cell does not append the same predictions twice.
if 'source_url' in past_pred.columns and 'source_url' in df_results.columns:
    past_pred = past_pred[~past_pred['source_url'].isin(df_results['source_url'])]
past_pred = pd.concat([past_pred, df_results], ignore_index=True)

In [None]:
# Display the combined predictions frame as the cell output.
past_pred

In [None]:
# Show the last two rows of the combined predictions.
past_pred.tail(2)

In [None]:
# Persist the combined predictions (overwrites the existing file).
past_pred.to_csv(path_or_buf='exp_output/predicted_df.csv', index=False)

In [None]:
# Work on an independent copy so the cleanup below leaves past_pred untouched.
predict_df = past_pred.copy(deep=True)

In [None]:
# Normalise dataset_identifier to clean strings before evaluation.
# Missing values are filled *before* the string cast: casting first turns
# None / pd.NA into the literal text 'None' / '<NA>', and makes any later
# dropna() a no-op because no NaN is left to drop.
predict_df['dataset_identifier'] = (
    predict_df['dataset_identifier']
    .fillna('')
    .astype(str)
    .replace('nan', '')   # literal 'nan' strings are also treated as missing
    .str.strip()          # remove unintended surrounding whitespace
)
# Rows with an empty identifier are kept, as before; the earlier dropna() call
# never removed any row.

# Now, call evaluation
performance_metrics = evaluate_performance(predict_df, ground_truth, orchestrator, orchestrator.config['false_positives_file'])
print(performance_metrics)

In [None]:
# Usage example: evaluate the predictions against the ground truth.
# This repeats the evaluation call made in the preceding cell.
performance_metrics = evaluate_performance(
    predict_df,
    ground_truth,
    orchestrator,
    orchestrator.config['false_positives_file'],
)
print(performance_metrics)