In [1]:
!pip install transformers
!pip install pandas
!pip install torch torchvision
!pip install sentencepiece
!pip install transformers[torch]


Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.6/7.6 MB 47.2 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 294.9/294.9 kB 31.6 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 105.1 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 70.6 MB/s eta 0:00: … (output truncated)

In [2]:
# Importing the required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import torch

In [3]:
# Function to run inference for FAQ generation
def generate_faq(model_path, inference_data_path, text_column, output_file,
                 tokenizer_name="T5-base"):
    """Generate one FAQ per row of a CSV file and save the results to a new CSV.

    Args:
        model_path: Location handed to ``T5ForConditionalGeneration.from_pretrained``.
        inference_data_path: Path of the CSV file read with ``pd.read_csv``.
        text_column: Name of the CSV column holding the input text.
        output_file: Path of the CSV to write. It contains two columns:
            ``text_column`` (copied from the input) and ``"Generated_FAQ"``.
        tokenizer_name: Identifier handed to ``T5Tokenizer.from_pretrained``.
            Defaults to ``"T5-base"``, the value that used to be hard-coded.

    Returns:
        None. The result is written to ``output_file``.

    Raises:
        KeyError: If ``text_column`` is not a column of the input CSV.
    """
    # Read and validate the data first, so a wrong path or column name fails
    # before the (slow) model and tokenizer are loaded.
    inference_df = pd.read_csv(inference_data_path)
    if text_column not in inference_df.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {inference_data_path}; "
            f"available columns: {list(inference_df.columns)}"
        )

    # Load the saved model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

    generated_faqs = []

    # Generate an FAQ for each input text (only the text column is needed,
    # so iterate over it directly instead of over whole rows).
    for input_text in inference_df[text_column]:
        # Blank CSV cells are read as NaN; concatenating NaN to the prompt
        # would raise TypeError. Emit an empty FAQ so rows stay aligned.
        if pd.isna(input_text):
            generated_faqs.append("")
            continue

        # str() keeps non-string cells (e.g. a purely numeric column) usable.
        inputs = tokenizer(
            "generate: " + str(input_text),
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        with torch.no_grad():  # disable gradient calculation to save memory
            faq_ids = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,  # pass the mask explicitly
                num_beams=4,
                min_length=30,
                max_length=100,
                early_stopping=True,
            )

        # Decode the first returned sequence and collect it
        generated_faqs.append(tokenizer.decode(faq_ids[0], skip_special_tokens=True))

    # Save the inputs next to their generated FAQs
    output_df = pd.DataFrame({
        text_column: inference_df[text_column],
        "Generated_FAQ": generated_faqs
    })
    output_df.to_csv(output_file, index=False)

In [4]:
# Inference configuration -- adjust these values to match your environment.

# Input: CSV file to run inference on, and the column holding the text
inference_data_path = "/content/test.csv"
text_column = "summarized_text"

# Model: location of the saved FAQ model
model_path = "/content/drive/MyDrive/NLP_Data/Faq/faq_model"

# Output: CSV file the generated FAQs are written to
output_file = "/content/generated_faqs.csv"

In [5]:
# Generate FAQs for the configured dataset and write them to the output CSV
generate_faq(
    model_path=model_path,
    inference_data_path=inference_data_path,
    text_column=text_column,
    output_file=output_file,
)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
