In [2]:
from transformers import AutoTokenizer, TapasForQuestionAnswering
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")

data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    "Age": ["56", "45", "59"],
    "Number of movies": ["87", "53", "69"],
}


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
table = pd.DataFrame.from_dict(data)
queries = ["what is average age of actors?", "How old is Brad Pitt?"]

In [4]:
inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
outputs = model(**inputs)
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs,
    outputs.logits.detach(),
    outputs.logits_aggregation.detach(),
)
print(predicted_answer_coordinates)

[[(0, 1), (1, 1), (2, 1)], [(0, 1)]]


In [5]:
answers = []
for coordinates, query in zip(predicted_answer_coordinates, queries):
    row, col = coordinates[0] 
    answer_text = table.iloc[row, col] 
    answers.append(f"Answer to '{query}': {answer_text}")

print(*answers, sep="\n")

Answer to 'what is average age of actors?': 56
Answer to 'How old is Brad Pitt?': 56


In [6]:
def get_answer(table,queries):
    inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
    outputs = model(**inputs)
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
    )

    # let's print out the results:
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # only a single cell:
            answers.append(table.iat[coordinates[0]])
        else:
            # multiple cells
            cell_values = []
            for coordinate in coordinates:
                cell_values.append(table.iat[coordinate])
            answers.append(", ".join(cell_values))

    display(table)
    print("")
    for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
        print(query)
        if predicted_agg == "NONE":
            print("Predicted answer: " + answer)
        else:
            if predicted_agg == "AVERAGE":
                ans = sum([int(x) for x in answer.split(", ")]) / len(answer.split(", ")) 
                print("Predicted answer: " + str(ans))
            elif predicted_agg == "SUM":
                ans = sum([int(x) for x in answer.split(", ")])
                print("Predicted answer: " + str(ans))
            elif predicted_agg == "COUNT":
                ans = len(answer.split(", "))
                print("Predicted answer: " + str(ans))

In [7]:
queries = [
    "total production of urea in 2019 in tonnes?",
]
table = pd.read_csv("FertilizersProduct/FertilizersProduct.csv", encoding='latin-1')
# convert every columns data type to string
table = table[:100]
table = table.astype(str)
table = table.replace('nan', '0')
table = table.replace('Nan', '0')
table = table.replace('NaN', '0')
get_answer(table,queries)


Token indices sequence length is longer than the specified maximum sequence length for this model (2463 > 512). Running this sequence through the model will result in indexing errors.


KeyboardInterrupt: 