In [3]:
from transformers import AutoTokenizer, TapasForQuestionAnswering
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")

data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    "Age": ["56", "45", "59"],
    "Number of movies": ["87", "53", "69"],
}


In [4]:
table = pd.DataFrame.from_dict(data)
queries = ["what is average age of actors?", "How old is Brad Pitt?"]

In [5]:
inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
outputs = model(**inputs)
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs,
    outputs.logits.detach(),
    outputs.logits_aggregation.detach(),
)
print(predicted_answer_coordinates)

[[(0, 1), (1, 1), (2, 1)], [(0, 1)]]


In [6]:
answers = []
for coordinates, query in zip(predicted_answer_coordinates, queries):
    row, col = coordinates[0] 
    answer_text = table.iloc[row, col] 
    answers.append(f"Answer to '{query}': {answer_text}")

print(*answers, sep="\n")

Answer to 'what is average age of actors?': 56
Answer to 'How old is Brad Pitt?': 56


In [26]:
def get_answer(table,queries):
    inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
    outputs = model(**inputs)
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
    )

    # let's print out the results:
    # find number of unique values in predicted_aggregation_indices
    i = set(predicted_aggregation_indices)
    print("first predicted_answer_coordinates: ", i)
    print("Number of unique values in predicted_aggregation_indices: ", len(i))

    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # only a single cell:
            answers.append(table.iat[coordinates[0]])
        else:
            # multiple cells
            cell_values = []
            for coordinate in coordinates:
                cell_values.append(table.iat[coordinate])
            answers.append(", ".join(cell_values))

    display(table)
    print("")
    for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
        print(query)
        if predicted_agg == "NONE":
            print("Predicted answer: " + answer)
        else:
            if predicted_agg == "AVERAGE":
                ans = sum([float(x) for x in answer.split(", ")]) / len(answer.split(", ")) 
                print("Predicted answer: " + str(ans))
            elif predicted_agg == "SUM":
                ans = sum([float(x) for x in answer.split(", ")])
                print("Predicted answer: " + str(ans))
            elif predicted_agg == "COUNT":
                ans = len(answer.split(", "))
                print("Predicted answer: " + str(ans))

In [38]:
queries = [
    # "what is product from RCF Import which has minimum Remeaining stock?",
    "which plant has minimum sum of Pending acknwoledgment and minimum Remaining stock?",
]
table = pd.read_csv("dummy.csv")
# only keep [table["State"], table["TotalPopulation"], table["Name of fertilizer used"], table["Required fertilizer quantity"],table["Available fertilizer quantity"]] and remove everything else
# table = table[["State", "TotalPopulation", "Name of the fertilizer used", "Required fertilizer quantity", "Available fertilizer quantity"]]
# table = table.drop(columns=['Date', 'Subvertical'])
table = table[:10]
table = table.astype(str)
get_answer(table,queries)


first predicted_answer_coordinates:  {0}
Number of unique values in predicted_aggregation_indices:  1


Unnamed: 0,Plant,Product,Pending acknwoledgment,Remaining Stock,Aging range : 31-40,Aging range: 61-90,Aging range: 91-120,Aging range: 121-130,Aging range: 151-180,Aging range: greater than 180
0,RCF Trombay,15-15-15,24455.0,2900839.0,2790.8,1042.98,2208.09,58731.0,3840.15,6385.27
1,RCF Compost,City Compest,330.0,1894.35,0.0,0.0,0.0,0.0,0.0,1094.35
2,RCF Import,Imported 10-26-20,30.0,790.61,60.2,255.0,122.71,225.25,68.5,45.35
3,RCF import,Imported 15-15-15,54335.0,140605.0,1486.05,45643.0,19662.0,4637.75,1912.0,155.0
4,RCF import,Imported 20 10013,383.75,14031.75,106215.0,1202.45,523.0,33.75,35.0,145.0
5,RCF Import,Imported DAP,1747.2,2205873.0,2948.38,45643.0,19662.0,4637.75,1912.0,155.0
6,RCF Tha,Neem Coated Urea(45 Kg),1222241.0,375018.0,12203.34,3761.32,427439.0,2002.04,2942.45,7495.54
7,RCF Trombay,Neem Coated Urea(45 Kg),1000.0,0.0,2942.71,904.22,335.78,,1054.92,732.64
8,RCF Th,RCF That Urea K,0.0,371.35,0.0,0.0,0.0,0.0,0.0,371.35



which plant has minimum sum of Pending acknwoledgment and minimum Remaining stock?
Predicted answer: RCF Trombay, RCF Th


In [1]:
from transformers import pipeline

nlp = pipeline(
    "document-question-answering",
    model="impira/layoutlm-document-qa",
)

print(nlp(
    "https://www.accountingcoach.com/wp-content/uploads/2013/10/income-statement-example@2x.png",
    "What are the 2020 net sales?"
))
# {'score': 0.59147286, 'answer': '$ 3,750', 'start': 19, 'end': 20}

  from .autonotebook import tqdm as notebook_tqdm
model.safetensors:  25%|██▍       | 126M/511M [02:29<07:38, 840kB/s] 


KeyboardInterrupt: 

In [32]:
from lida import Manager

# Load your data (replace with your data source)
data = pd.read_csv("your_data.csv")

# Initialize LIDA manager
manager = Manager()

# Summarize data and define visualization goal
summary = manager.summarize(data)
goal = {"type": "chart", "x": "column1", "y": "column2"}

# Generate visualization
charts = manager.visualize(summary, goal, library="matplotlib")

# Display or save the chart
charts[0].show()  # Display in notebook
charts[0].savefig("chart.png")  # Save as image


ModuleNotFoundError: No module named 'lida'