In [1]:
import json

mapping_path = "themes.json"
with open(mapping_path, 'r') as json_file:
    themes = json.load(json_file)

In [2]:
main_theme = list(themes.keys())[0]
secondary_theme = themes[main_theme][0]

In [3]:
secondary_theme

'Gross Scope 1 GHG emissions'

In [4]:
year = '2023'

mapping_path = "mapping_theme_description.json"
with open(mapping_path, 'r') as json_file:
    mapping_theme_description = json.load(json_file)
    
description_theme = mapping_theme_description[secondary_theme]

In [5]:
description_theme

'Total greenhouse gas (GHG) emissions resulting directly from sources owned or controlled by the company, typically expressed in metric tons of CO2 equivalent.'

# Inference

In [6]:
from inference import extract_info
import os
from tqdm.auto import tqdm
import pandas as pd
# Path to the Documents folder
documents_folder = os.path.expanduser("documents")

# Get all PDF file paths in the Documents folder
pdf_files = [
    os.path.join(documents_folder, file)
    for file in os.listdir(documents_folder)
    if file.endswith(".pdf")
]

# Generate result_df by processing each company's PDF
result_data = []
# Loop through each PDF file and process
for file_path in tqdm(pdf_files):
    name = os.path.splitext(os.path.basename(file_path))[0]
    path_vectorstore = os.path.join("vectorstores", f"Vectorstore_{name}")
    result = extract_info(path_vectorstore, secondary_theme, description_theme, year)
    try:
        extract_data = result
        result_data.append(
            {
                "company_name": name,
                "value": extract_data.value,
                "unit": extract_data.unit,
                "page": extract_data.page,
                "extract": extract_data.extract,
                "extract_type": extract_data.extract_type,
                "paragraph": extract_data.paragraph,
                "num_extracts": 1#len(result),
            }
        )
    except Exception as e:
        print(f"{name}: {result}")
        result_data.append(
            {
                "company_name": name,
                "value": 0,
                "unit": "",
                "page": 0,
                "extract": "",
                "extract_type": "",
                "paragraph": "",
                "num_extracts":0,

            }
        )

# Create the result_df with the extracted information
result_df = pd.DataFrame(result_data)

  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
saving_path = os.path.join('results',f'{secondary_theme}_{year}.csv')
result_df.to_csv(saving_path, index=False)

In [9]:
result_df

Unnamed: 0,company_name,value,unit,page,extract,extract_type,paragraph,num_extracts
0,Eurazeo,,,,,,,1
1,Abertis,,,,,,,1


# Truth df

In [10]:
# Load the Excel file
import json

import pandas as pd

# Specify the path to your JSON file
file_path = "ground_truth.json"

# Open the file and load the JSON data
with open(file_path, "r") as json_file:
    data = json.load(json_file)

truth_df = pd.DataFrame(data[main_theme][secondary_theme]).transpose()
truth_df.index.name = "company_name"
truth_df = truth_df.reset_index()
truth_df['value'] = pd.to_numeric(truth_df['value'], errors='coerce')
truth_df['page'] = pd.to_numeric(truth_df['page'], errors='coerce')


In [11]:
truth_df

Unnamed: 0,company_name,page,paragraph,value,unit,extract,extract_type
0,Crédit Agricole,164,5.5 MESURES LIÉES À L’EMPREINTE ENVIRONNEMENTA...,26434.0,tCO2e,ÉMISSIONS DE GAZ À EFFET DE SERRE (GES),tableau
1,BNP,696,7 engagement 1,25437.0,tCO2e,RÉPARTITION DE CES ÉMISSIONS PAR SCOPE DU GHG ...,graphique
2,Société Générale (assurance),23,6,61.0,tonne équivalent CO2,Évolutions des émissions de CO₂ (tonnes équiva...,tableau
3,Schneider Electric,312,2.8,112792.0,tCO2e,Gaz à effet de serre (GES),tableau
4,Michelin,183,4.1.1 a),1.04,millions de tonnes de CO2,"INVENTAIRE DES ÉMISSIONS DE CO2 SCOPES 1, 2 ET 3",
5,Carrefour,33,2.1.4.1,782709.0,tCO2e,Scopes 1 et 2 : contribuer à la neutralité car...,tableau
6,Dassault système,114,4.7.4,27186.0,tCO2e,Renforcer le plan bas carbone Société en cohér...,tableau
7,Engie,113,3.5.4.1,24496514.0,tCO2e,Émissions directes,tableau
8,Sanofi,189,3.3.9.3.1,297700.0,tCO2e,Gaz à effet de serre,tableau
9,Danone,184,5.3 Nature,573.0,ktCO2e,Émissions des scopes 1 et 2 (énergie et indust...,tableau


## Scoring

In [12]:
saving_path = os.path.join('results',f'{secondary_theme}_{year}.csv')
result_df = pd.read_csv(saving_path)

In [16]:
from scoring import (
    results_comparative_table,
    calculate_accuracy_for_coherent_units,
    calculate_accuracy_for_sources,
    results_comparative_table,
    calculate_precision_recall
)

ModuleNotFoundError: No module named 'sklearn'

In [17]:
merged_df = results_comparative_table(truth_df, result_df, with_metric_chain = True)

NameError: name 'results_comparative_table' is not defined

In [18]:
merged_df["subtheme"] = secondary_theme
merged_df["theme"] = main_theme
merged_df["source_accuracy"] = calculate_accuracy_for_sources(merged_df)
merged_df["unit_coherence_accuracy"] = calculate_accuracy_for_coherent_units(merged_df)
precision_recall = calculate_precision_recall(merged_df)
merged_df["precision"] = precision_recall[0]
merged_df["recall"] = precision_recall[1]
merged_df["f1_score"] = precision_recall[2]
merged_df["value_accuracy"] = precision_recall[3]
merged_df["true_positive"] = precision_recall[4]
merged_df["false_positive"] = precision_recall[5]
merged_df["true_negative"] = precision_recall[6]
merged_df["false_negative"] = precision_recall[7]


NameError: name 'merged_df' is not defined

In [10]:
saving_path = os.path.join("results", f"comparative_{secondary_theme}_{year}.csv")
merged_df.to_csv(saving_path, sep="|", index=False, encoding="utf-8")

# Debug

### False positive

In [19]:
merged_df.loc[
    (
        (merged_df["value_result_scale"] != 0)
        & (merged_df["value_truth"] != merged_df["value_result_scale"])
    )
][
    [
        "company_name",
        "value_result",
        "unit_result",
#        "value_result_scale",
        "value_truth",
        "unit_truth",
        "page_result",
        "page_truth",
        "num_extracts",
#        "extract_result"
    ]
]

NameError: name 'merged_df' is not defined

### False negative

In [20]:
merged_df.loc[
    (merged_df["value_result_scale"] == 0) & ~(merged_df["value_truth"].isna())
][
    [
        "company_name",
        "value_result",
        "value_result_scale",
        "value_truth",
        "page_result",
        "page_truth",
    ]
]

NameError: name 'merged_df' is not defined

In [21]:
import os
import zipfile

# Specify the name of the zip file
zip_file_name = "all_files.zip"

# Create a new zip file
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    # Iterate over all the files in the current directory
    for file_name in os.listdir('.'):
        # Check if it is a file (not a directory)
        if os.path.isfile(file_name):
            # Add the file to the zip archive
            zipf.write(file_name)

print(f"All files in the current directory have been zipped into '{zip_file_name}'.")


All files in the current directory have been zipped into 'all_files.zip'.
