In [1]:
import csv
from collections import defaultdict
from transformers import pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer

def read_csv_data(file_path):
    """Load review rows from a CSV file, bucketed by the reviewed project's group.

    Args:
        file_path: Path to a CSV file whose header contains the column
            'Group ID of project being reviewed'.

    Returns:
        A ``defaultdict(list)`` mapping each group ID (as read from the file)
        to the list of row dicts produced by ``csv.DictReader`` for that
        group, in file order.

    Raises:
        KeyError: If the group-ID column is absent from the file's header.
    """
    group_column = 'Group ID of project being reviewed'
    rows_by_group = defaultdict(list)
    # newline='' lets the csv module handle line endings inside quoted cells.
    with open(file_path, newline='') as handle:
        for record in csv.DictReader(handle):
            rows_by_group[record[group_column]].append(record)
    return rows_by_group

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def average_numeric_features(data, numeric_fields=None):
    """Compute the per-group mean of each numeric rating question.

    Args:
        data: Mapping of group ID -> list of review rows (dict-like objects
            keyed by CSV column name, values being the raw cell strings).
        numeric_fields: Optional iterable of column names to average. When
            omitted, the eight rating columns of the review form are used.

    Returns:
        dict mapping group ID -> {field: mean score as float}. Cells that are
        ``None``, empty or whitespace-only are treated as "not answered" and
        are excluded from that field's mean; a field nobody answered maps to
        ``None`` (written as an empty cell by ``csv.DictWriter``).

    Raises:
        KeyError: If a review row lacks one of the requested columns.
        ValueError: If a non-blank cell is not an integer literal.
    """
    default_fields = (
        'How clearly the problem is formulated? Select in a range from 1 to 5.',
        'How appropriate is the dataset choice? Select in range from 1 to 5.',
        'How feasible are the proposed methods? Select in a range from 1 to 5.',
        'How many methods are described? Select between 0 to 3. If more than 3 is described, select 3.',
        'Are the three methods properly described? Select in a range from 1 to 5. Select in a range between 1 to 5.',
        'How appropriate is the proposed evaluation metrics to evaluate the methods? Select in a range between 0 to 5. Select 0 if evaluation criteria is not mentioned.',
        'How many related works are discussed? Select in a range from 0 to 2. If more than 2 related works are discussed, select 2.',
        'How appropriate is the choice of related works or how relevant they are to the main problem? Select in a range between 1 to 5.',
    )
    fields = list(default_fields if numeric_fields is None else numeric_fields)

    averages = {}
    for group_id, reviews in data.items():
        group_averages = {}
        for field in fields:
            raw_values = [review[field] for review in reviews]
            # Skip unanswered cells instead of crashing on int('') / int(None).
            scores = [
                int(value)
                for value in raw_values
                if value is not None and str(value).strip()
            ]
            # Average over the answers actually given; None marks "no data".
            group_averages[field] = sum(scores) / len(scores) if scores else None
        averages[group_id] = group_averages

    return averages


In [3]:
def initialize_t5_model(model_name="t5-large"):
    """Load a pretrained T5 tokenizer and conditional-generation model.

    Args:
        model_name: Checkpoint identifier passed unchanged to both
            ``from_pretrained`` calls. Defaults to "t5-large".

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(model_name)
    return t5_tokenizer, t5_model

def summarize_text(text, tokenizer, model, max_length=60, min_length=10):
    """Produce a summary of ``text`` with the given tokenizer/model pair.

    Args:
        text: The string to summarize.
        tokenizer: Object exposing ``encode`` and ``decode`` (e.g. the
            tokenizer returned by ``initialize_t5_model``).
        model: Object exposing ``generate`` (e.g. the T5 model).
        max_length: Upper bound passed to ``model.generate``.
        min_length: Lower bound passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # The "summarize: " prefix tells the model which task to perform.
    prompt = "summarize: " + text
    encoded_prompt = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    generation_options = dict(
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = model.generate(encoded_prompt, **generation_options)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def summarize_open_questions(data, tokenizer=None, model=None, verbose=True):
    """Summarize the free-text answers to each open question, per group.

    For every group and every open question, the non-blank answers of all
    reviews are joined with single spaces and passed to ``summarize_text``.

    Args:
        data: Mapping of group ID -> list of review rows (dict-like objects
            keyed by CSV column name).
        tokenizer: Tokenizer forwarded to ``summarize_text``. When omitted,
            the module-level name ``tokenizer`` is used, which keeps
            existing ``summarize_open_questions(data)`` calls working.
        model: Model forwarded to ``summarize_text``. When omitted, the
            module-level name ``model`` is used.
        verbose: When true (the default), print the concatenated text and
            its summary for each summarized question.

    Returns:
        dict mapping group ID -> {question: summary}. A question with no
        non-blank answers maps to "No responses provided.".

    Raises:
        KeyError: If a review row lacks one of the open-question columns.
        NameError: If text needs summarizing but no tokenizer/model was
            passed and none is defined at module level.
    """
    # Fall back to the notebook's globals so existing callers keep working;
    # the lookup is tolerant so inputs with nothing to summarize need no model.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if model is None:
        model = globals().get("model")

    open_questions = [
        'Provide brief justification on your rating for "How clearly the problem is formulated? "',
        'Provide brief justification on your rating for "How appropriate is the dataset choice? "',
        'Provide brief justification on your rating for "How feasible are the proposed methods?"',
        'Provide brief justification on your rating for "Are the three methods properly described?"',
        'How appropriate are the proposed methods for the problem being solved? Briefly explain your decision for each model.',
        'Provide brief justification on your rating for "How appropriate is the proposed evaluation metrics to evaluate the methods?"',
        'Provide brief justification on your rating for "How appropriate is the choice of related works or how relevant they are to the main problem?"',
        'Briefly mention the overall strengths of the proposal.',
        'Briefly mention the overall weaknesses of the proposal.'
    ]

    summaries = {}
    for group_id, reviews in data.items():
        group_summaries = {}
        for question in open_questions:
            # Keep only answered cells; a None cell counts as unanswered
            # instead of crashing on .strip().
            concatenated_responses = " ".join(
                review[question]
                for review in reviews
                if (review[question] or "").strip()
            )

            if not concatenated_responses:
                group_summaries[question] = "No responses provided."
                continue

            if tokenizer is None or model is None:
                raise NameError(
                    "summarize_open_questions needs a tokenizer and model: pass them "
                    "explicitly or define module-level 'tokenizer' and 'model'."
                )

            summarized_response = summarize_text(concatenated_responses, tokenizer, model)
            if verbose:
                print('Before: ', concatenated_responses)
            group_summaries[question] = summarized_response
            if verbose:
                print('After: ', summarized_response)

        summaries[group_id] = group_summaries

    return summaries

In [4]:
def export_results(file_path, averages, summaries):
    """Write one CSV row per group combining numeric averages and summaries.

    The header is the group-ID column, followed by the keys of the first
    group's averages, followed by the keys of the first group's summaries.

    Args:
        file_path: Destination path; the file is created or overwritten.
        averages: Mapping of group ID -> {field: average}.
        summaries: Mapping of group ID -> {question: summary}. Must contain
            every group ID present in ``averages``.

    Raises:
        KeyError: If a group in ``averages`` has no entry in ``summaries``.
    """
    group_column = 'Group ID of project being reviewed'

    # next(iter(...), {}) tolerates empty results, which then produce a
    # header-only file. The header is computed before opening the file so a
    # failure here cannot leave a truncated output behind.
    average_fields = list(next(iter(averages.values()), {}))
    summary_fields = list(next(iter(summaries.values()), {}))
    fieldnames = [group_column] + average_fields + summary_fields

    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for group_id, group_averages in averages.items():
            writer.writerow({group_column: group_id, **group_averages, **summaries[group_id]})
            
# Load the default T5 checkpoint once. `tokenizer` and `model` are module-level
# names that summarize_open_questions passes on to summarize_text.
tokenizer, model = initialize_t5_model()

# Main execution: read the review rows and compute per-group numeric averages.
# The summaries and the CSV export are produced by a later cell.
file_path = 'ProjReview.csv'
output_path = 'ProjReview_results.csv'
# group ID -> list of review rows
data = read_csv_data(file_path)
# group ID -> {rating question: mean score}
averages = average_numeric_features(data)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Summarize every open-ended question per group (the before/after text of each
# summarized question is printed), then write averages and summaries together
# to the output CSV.
summaries = summarize_open_questions(data)
export_results(output_path, averages, summaries)

Before:  Title and the first sentence of the first paragraph all reveals the studied problem clearly. The purpose for training the model for specific dataset and the real-life significance for doing so is clearly identified and very convincing. The problem is quite easy understanding in its title and first paragraph. The problem is to use x-ray images for disease prediction. You explained how you plan to solve the problem, but the problem itself was not stated. The proposal talks about the model and its potential uses if successful, but doesn't clearly address current / technical problem(s) that are being faced.
After:  the problem is to use x-ray images for disease prediction. you explained how you plan to solve the problem, but the problem itself was not stated.
Before:  There have been a lot of related works using this data, and I think it is reliable and workable. The data set chosen is very match with the purpose of the paper and has been verified with other research work. This so