<a href="https://colab.research.google.com/github/woranov/amlta-project/blob/main/notebooks/flows_tapas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    import amlta
except ImportError:
    %pip install --no-dependencies git+https://github.com/woranov/amlta-project.git

In [2]:
import warnings
from pathlib import Path

try:
    from google.colab import drive  # pyright: ignore[reportMissingImports]

    IN_COLAB = True
except ImportError:
    IN_COLAB = False


import pandas as pd
import torch
from transformers import TapasForQuestionAnswering, TapasTokenizer

from amlta.config import config
from amlta.probas import extract, processes

In [3]:
if IN_COLAB:
    mount_point = Path("/content/drive")
    drive_path = mount_point / "MyDrive"

    # edit
    data_dir = drive_path / "uni" / "ws2425" / "amlta" / "project" / "data"

    config.update(data_dir=data_dir)

    if not mount_point.exists():
        drive.mount(str(mount_point))

In [4]:
# by default lci result only
process_uuids = processes.read_uuids()

In [5]:
# get a random one
uuid = process_uuids[42]
# load process
(process,) = processes.ProcessData.from_uuids([uuid])

In [6]:
print(process.processInformation.dataSetInformation.name)

baseName=[LocalizedText(value='Öl-roh-mix-HR-2005', lang='de')]


In [7]:
# extract process flows to dataframe
flows_df = extract.extract_process_flows(process)
flows_df

Unnamed: 0,exchange_direction,exchange_resulting_amount,exchange_type_of_flow,exchange_classification_hierarchy,flow_uuid,flow_description,flow_property_uuid,flow_property_name,flow_property_unit
0,INPUT,7.050000e+02,Waste flow,End-of-life treatment / Energy recycling,814cedc6-b3ec-4474-86fa-051740192dec,secondary raw materials,93a60a56-a3c8-11da-a746-0800200c9a66,Net calorific value,MJ
1,INPUT,2.280000e-03,Waste flow,End-of-life treatment / Material recycling,0e0b2476-9043-11d3-b2c8-0080c8941b49,secondary raw materials,93a60a56-a3c8-11da-a746-0800200b9a66,Mass,kg
2,INPUT,-7.080000e-05,Waste flow,End-of-life treatment / Other end-of-life serv...,0e0b2403-9043-11d3-b2c8-0080c8941b49,NF-scrap,93a60a56-a3c8-11da-a746-0800200b9a66,Mass,kg
3,INPUT,-1.730000e+00,Waste flow,End-of-life treatment / Other end-of-life serv...,a22c8047-fd56-49a4-8dd2-0c0e54c60cc6,waste,93a60a56-a3c8-11da-a746-0800200c9a66,Net calorific value,MJ
4,INPUT,1.630000e-07,Waste flow,End-of-life treatment / Material recycling,cc0e481c-80da-11d4-9e81-0080c8426c9a,Fe-scrap,93a60a56-a3c8-11da-a746-0800200b9a66,Mass,kg
...,...,...,...,...,...,...,...,...,...
65,OUTPUT,4.270000e+01,Elementary flow,Emissions / Emissions to air / Emissions to ai...,08a91e70-3ddc-11dd-9610-0050c2490048,methane (fossil),93a60a56-a3c8-11da-a746-0800200b9a66,Mass,kg
66,OUTPUT,1.110000e-05,Elementary flow,Emissions / Emissions to air / Emissions to ai...,5c69d34a-51e7-4e7d-a89c-c8536962b51a,cadmium (ii),93a60a56-a3c8-11da-a746-0800200b9a66,Mass,kg
67,OUTPUT,-8.550000e-12,Elementary flow,Emissions / Emissions to water / Emissions to ...,0d8a8cbb-f28e-4af9-8ddb-d4fed17e6d57,cadmium (ii),93a60a56-a3c8-11da-a746-0800200b9a66,Mass,kg
68,OUTPUT,1.900000e-05,Elementary flow,Emissions / Emissions to air / Emissions to ai...,08a91e70-3ddc-11dd-91d7-0050c2490048,arsenic,93a60a56-a3c8-11da-a746-0800200b9a66,Mass,kg


In [None]:
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "google/tapas-large-finetuned-wtq"
model = TapasForQuestionAnswering.from_pretrained(model_name).to(device)
tokenizer = TapasTokenizer.from_pretrained(model_name, device=device)

In [9]:
table = (
    flows_df[
        [
            "exchange_direction",
            "exchange_resulting_amount",
            "exchange_type_of_flow",
            "exchange_classification_hierarchy",
            "flow_description",
            "flow_property_name",
            "flow_property_unit",
        ]
    ]
    .rename(
        columns={
            "exchange_direction": "Direction",
            "exchange_resulting_amount": "Amount",
            "exchange_type_of_flow": "Type",
            "exchange_classification_hierarchy": "Class",
            "flow_description": "Name",
            "flow_property_name": "Property",
            "flow_property_unit": "Unit",
        }
    )
    .replace(
        {
            "Direction": {
                "INPUT": "Input",
                "OUTPUT": "Output",
            },
        }
    )
    .pipe(
        lambda df: df.assign(
            **{col: df[col].round(2) for col in df.select_dtypes("number").columns}
        )
    )
)

In [10]:
table

Unnamed: 0,Direction,Amount,Type,Class,Name,Property,Unit
0,Input,705.00,Waste flow,End-of-life treatment / Energy recycling,secondary raw materials,Net calorific value,MJ
1,Input,0.00,Waste flow,End-of-life treatment / Material recycling,secondary raw materials,Mass,kg
2,Input,-0.00,Waste flow,End-of-life treatment / Other end-of-life serv...,NF-scrap,Mass,kg
3,Input,-1.73,Waste flow,End-of-life treatment / Other end-of-life serv...,waste,Net calorific value,MJ
4,Input,0.00,Waste flow,End-of-life treatment / Material recycling,Fe-scrap,Mass,kg
...,...,...,...,...,...,...,...
65,Output,42.70,Elementary flow,Emissions / Emissions to air / Emissions to ai...,methane (fossil),Mass,kg
66,Output,0.00,Elementary flow,Emissions / Emissions to air / Emissions to ai...,cadmium (ii),Mass,kg
67,Output,-0.00,Elementary flow,Emissions / Emissions to water / Emissions to ...,cadmium (ii),Mass,kg
68,Output,0.00,Elementary flow,Emissions / Emissions to air / Emissions to ai...,arsenic,Mass,kg


In [11]:
def retrieve_chunk(table, query, threshold=0.5):
    inputs = tokenizer(
        table=table, queries=query, padding="max_length", return_tensors="pt"
    )
    outputs = model(**inputs.copy().to(device))

    predicted_answer_coordinates, predicted_aggregation_indices = (
        tokenizer.convert_logits_to_predictions(
            inputs,
            outputs.logits.detach().cpu(),
            outputs.logits_aggregation.detach().cpu(),
            cell_classification_threshold=threshold,
        )
    )

    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [
        id2aggregation[x] for x in predicted_aggregation_indices
    ]

    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # only a single cell:
            answers.append(table.iat[coordinates[0]])
        else:
            # multiple cells
            cell_values = []
            for coordinate in coordinates:
                cell_values.append(table.iat[coordinate])
            answers.append(cell_values)

    return {
        "answers": answers,
        "coordinates": predicted_answer_coordinates,
        "aggregation": aggregation_predictions_string,
    }


def retrieve(table, query, rows=25, threshold=0.5):
    answers = []
    answers_dfs = []

    # table = table.sample(frac=1)

    for i in range(0, len(table), rows):
        table_chunk = table.iloc[i : i + rows].copy().reset_index(drop=True).astype(str)
        answer = retrieve_chunk(table_chunk, query, threshold=threshold)
        answers.append(answer)
        if answer and answer["coordinates"]:
            answers_dfs.append(
                table_chunk.iloc[[row_loc for (row_loc, _) in answer["coordinates"][0]]]
            )

    answer_df = pd.concat(answers_dfs)
    return answers, answer_df

In [19]:
answers, answer_df = retrieve(
    table, "What is the output amount of carbon dioxide?", threshold=0.99
)
print(answers)

answer_df

[{'answers': ['4.5'], 'coordinates': [[(22, 1)]], 'aggregation': ['SUM']}, {'answers': [[]], 'coordinates': [[]], 'aggregation': ['SUM']}, {'answers': ['6809.0'], 'coordinates': [[(13, 1)]], 'aggregation': ['SUM']}]


Unnamed: 0,Direction,Amount,Type,Class,Name,Property,Unit
22,Output,4.5,Elementary flow,Emissions / Emissions to air / Emissions to ai...,particles (PM10),Mass,kg
13,Output,6809.0,Elementary flow,Emissions / Emissions to air / Emissions to ai...,carbon dioxide (fossil),Mass,kg
