In [None]:
!pip install transformers torch datasets

In [None]:
!pip install transformers datasets torch

In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict

In [2]:
filename = "/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/RQ4/RQ4_train.csv"
df = pd.read_csv(filename)
df_cleaned = df.dropna()

# Get the original number of rows
original_rows = len(df)


# Get the number of rows after dropping
cleaned_rows = len(df_cleaned)

# Calculate the number of dropped rows
dropped_rows = original_rows - cleaned_rows

# Optionally, reset the index after dropping rows
df_cleaned.reset_index(drop=True, inplace=True)

# Print how many rows were dropped
print(f"Rows dropped: {dropped_rows}")
print(f"Original rows: {original_rows}")
print(f"Cleaned rows: {cleaned_rows}")

Rows dropped: 4
Original rows: 32562
Cleaned rows: 32558


In [3]:
df_cleaned.label.value_counts()

0    26235
1     6323
Name: label, dtype: int64

In [4]:
possible_labels = df_cleaned.label.unique()
possible_labels

array([0, 1])

In [5]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [6]:
label_dict = {'1': 1, '0': 0}
label_dict

{'1': 1, '0': 0}

In [7]:
df_cleaned['LABEL'] = df_cleaned.label.replace(label_dict)
df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['LABEL'] = df_cleaned.label.replace(label_dict)


Unnamed: 0,CCR,response,label,LABEL
0,the iterator based approach was `!isAfter(endD...,"Yes you're right, to handle this I had to roun...",0,0
1,A link back to `HeaderValueOption` would be ap...,Done. I updated the other places in this proto...,0,0
2,You start explaining details of very specific ...,Agree,0,0
3,is this test in here twice?,"Yes, it tests it twice: once using the Beautif...",0,0
4,what is the purpose of the changes in this file?,reverted,0,0


In [8]:
from datasets import load_dataset
dataset = Dataset.from_pandas(df_cleaned)
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [9]:
train_dataset

Dataset({
    features: ['CCR', 'response', 'label', 'LABEL'],
    num_rows: 26046
})

In [10]:
val_dataset

Dataset({
    features: ['CCR', 'response', 'label', 'LABEL'],
    num_rows: 6512
})

In [11]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')

def preprocess_function(examples):
#     # Convert all values to strings
    ccr_texts = [str(ex) for ex in examples['CCR']]
    response_texts = [str(rep) for rep in examples['response']]
    labels = [str(label) for label in examples['LABEL']]

    # Prepare input text by combining CCR comments and responses
    inputs = [
        f"Is this CCR comment confusing or non-confusing with respect to authors Reply? CCR: {ex} Reply: {rep}"
        for ex, rep in zip(ccr_texts, response_texts)
    ]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    
    # Tokenize targets
    labels = tokenizer(labels, max_length=3, padding="max_length", truncation=True, return_tensors="pt")["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100  # Replace padding tokens in labels with -100
    model_inputs["labels"] = labels

    return model_inputs


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/26046 [00:00<?, ? examples/s]

Map:   0%|          | 0/6512 [00:00<?, ? examples/s]

In [13]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-small')

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # evaluation strategy is "epoch"
    save_strategy="epoch",  # set save strategy to "epoch" to match evaluation strategy
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1071,0.1155
2,0.087,0.097748
3,0.0866,0.10117


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=19536, training_loss=0.10992610620534586, metrics={'train_runtime': 1340.4264, 'train_samples_per_second': 58.293, 'train_steps_per_second': 14.574, 'total_flos': 1.4525112256561152e+16, 'train_loss': 0.10992610620534586, 'epoch': 3.0})

In [14]:
model.save_pretrained('/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small')
tokenizer.save_pretrained('/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small')

('/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small/tokenizer_config.json',
 '/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small/special_tokens_map.json',
 '/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small/spiece.model',
 '/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small/added_tokens.json')

In [15]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained('/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small')
tokenizer = T5Tokenizer.from_pretrained('/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/finetuned-flan-t5-small')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
def predict_clarity(ccr_comment):
    # Prepare the input for inference
    input_text = f"Is this comment clear or understandable for code authors to address?\n Comment: {ccr_comment}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    # Generate prediction
    outputs = model.generate(inputs.input_ids, max_length=10)
    
    # Decode the output
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

In [18]:
# # Example CCR comment for prediction
# new_ccr_comment = "How are you today"
# result = predict_clarity(new_ccr_comment)
# print("Predicted label:", result)

Predicted label: 1


In [28]:
import os


filename = "/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/RQ4/RQ4_test.csv"
df = pd.read_csv(filename)
df_cleaned = df.dropna()

for index,row in df_cleaned.iterrows():
    results = []
    #print("Index: ", index)
    ccr  = row['CCR']
    label = row['label']
    result = predict_clarity(ccr)
    #print(type(label),type(result))
    
    
    if int(label)==int(result):
        print("Index: ", index,":) Matched")
    else:
        print("Index: ", index,":( No Matched")
    results.append({
    'CCR': ccr,
    'response': row['response'],
    'actual': int(label),
    'Prediction': int(result)
    })
    #print(results)
    result_df = pd.DataFrame(results)
    file_path = 'RQ4_results_by_T5_small_r3.csv'
    result_df.to_csv(file_path, mode='a', index=False, header=not os.path.exists(file_path)) 



Index:  0 :( No Matched
Index:  1 :( No Matched
Index:  2 :( No Matched
Index:  3 :( No Matched
Index:  4 :) Matched
Index:  5 :( No Matched
Index:  6 :) Matched
Index:  7 :( No Matched
Index:  8 :( No Matched
Index:  9 :( No Matched
Index:  10 :( No Matched
Index:  11 :( No Matched
Index:  12 :) Matched
Index:  13 :( No Matched
Index:  14 :( No Matched
Index:  15 :) Matched
Index:  16 :) Matched
Index:  17 :) Matched
Index:  18 :( No Matched
Index:  19 :) Matched
Index:  20 :( No Matched
Index:  21 :( No Matched
Index:  22 :) Matched
Index:  23 :) Matched
Index:  24 :( No Matched
Index:  25 :( No Matched
Index:  26 :) Matched
Index:  27 :( No Matched
Index:  28 :( No Matched
Index:  29 :( No Matched
Index:  30 :( No Matched
Index:  31 :( No Matched
Index:  32 :( No Matched
Index:  33 :) Matched
Index:  34 :( No Matched
Index:  35 :( No Matched
Index:  36 :) Matched
Index:  37 :( No Matched
Index:  38 :( No Matched
Index:  39 :( No Matched
Index:  40 :( No Matched
Index:  41 :) Matched

Index:  343 :( No Matched
Index:  344 :( No Matched
Index:  345 :) Matched
Index:  346 :) Matched
Index:  347 :) Matched
Index:  348 :) Matched
Index:  349 :) Matched
Index:  350 :) Matched
Index:  351 :) Matched
Index:  352 :) Matched
Index:  353 :( No Matched
Index:  354 :( No Matched
Index:  355 :) Matched
Index:  356 :( No Matched
Index:  357 :( No Matched
Index:  358 :) Matched
Index:  359 :) Matched
Index:  360 :) Matched
Index:  361 :( No Matched
Index:  362 :) Matched
Index:  363 :) Matched
Index:  364 :) Matched
Index:  365 :) Matched
Index:  366 :) Matched
Index:  367 :) Matched
Index:  368 :( No Matched
Index:  369 :) Matched
Index:  370 :( No Matched
Index:  371 :( No Matched
Index:  372 :) Matched
Index:  373 :( No Matched
Index:  374 :) Matched
Index:  375 :) Matched
Index:  376 :( No Matched
Index:  377 :( No Matched
Index:  378 :( No Matched
Index:  379 :( No Matched
Index:  380 :( No Matched
Index:  381 :) Matched
Index:  382 :) Matched
Index:  383 :( No Matched
Index:

Index:  677 :) Matched
Index:  678 :( No Matched
Index:  679 :) Matched
Index:  680 :( No Matched
Index:  681 :( No Matched
Index:  682 :) Matched
Index:  683 :( No Matched
Index:  684 :( No Matched
Index:  685 :( No Matched
Index:  686 :( No Matched
Index:  687 :) Matched
Index:  688 :( No Matched
Index:  689 :) Matched
Index:  690 :) Matched
Index:  691 :( No Matched
Index:  692 :( No Matched
Index:  693 :( No Matched
Index:  694 :( No Matched
Index:  695 :( No Matched
Index:  696 :) Matched
Index:  697 :) Matched
Index:  698 :( No Matched
Index:  699 :) Matched
Index:  700 :) Matched
Index:  701 :) Matched
Index:  702 :) Matched
Index:  703 :) Matched
Index:  704 :) Matched
Index:  705 :( No Matched
Index:  706 :( No Matched
Index:  707 :) Matched
Index:  708 :( No Matched
Index:  709 :( No Matched
Index:  710 :) Matched
Index:  711 :) Matched
Index:  712 :( No Matched
Index:  713 :) Matched
Index:  714 :( No Matched
Index:  715 :) Matched
Index:  716 :) Matched
Index:  717 :) Match

Index:  1016 :( No Matched
Index:  1017 :( No Matched
Index:  1018 :) Matched
Index:  1019 :) Matched
Index:  1020 :) Matched
Index:  1021 :) Matched
Index:  1022 :) Matched
Index:  1023 :( No Matched
Index:  1024 :) Matched
Index:  1025 :( No Matched
Index:  1026 :( No Matched
Index:  1027 :) Matched
Index:  1028 :) Matched
Index:  1029 :) Matched
Index:  1030 :( No Matched
Index:  1031 :) Matched
Index:  1032 :) Matched
Index:  1033 :( No Matched
Index:  1034 :) Matched
Index:  1035 :) Matched
Index:  1036 :( No Matched
Index:  1037 :( No Matched
Index:  1038 :( No Matched
Index:  1039 :) Matched
Index:  1040 :) Matched
Index:  1041 :( No Matched
Index:  1042 :( No Matched
Index:  1043 :( No Matched
Index:  1044 :) Matched
Index:  1045 :) Matched
Index:  1046 :( No Matched
Index:  1047 :( No Matched
Index:  1048 :( No Matched
Index:  1049 :) Matched
Index:  1050 :( No Matched
Index:  1051 :) Matched
Index:  1052 :( No Matched
Index:  1053 :( No Matched
Index:  1054 :( No Matched
Inde

Index:  1336 :) Matched
Index:  1337 :) Matched
Index:  1338 :) Matched
Index:  1339 :( No Matched
Index:  1340 :( No Matched
Index:  1341 :) Matched
Index:  1342 :( No Matched
Index:  1343 :( No Matched
Index:  1344 :) Matched
Index:  1345 :( No Matched
Index:  1346 :) Matched
Index:  1347 :( No Matched
Index:  1348 :) Matched
Index:  1349 :) Matched
Index:  1350 :( No Matched
Index:  1351 :) Matched
Index:  1352 :) Matched
Index:  1353 :) Matched
Index:  1354 :( No Matched
Index:  1355 :) Matched
Index:  1356 :( No Matched
Index:  1357 :) Matched
Index:  1358 :) Matched
Index:  1359 :( No Matched
Index:  1360 :) Matched
Index:  1361 :) Matched
Index:  1362 :) Matched
Index:  1363 :) Matched
Index:  1364 :( No Matched
Index:  1365 :( No Matched
Index:  1366 :( No Matched
Index:  1367 :( No Matched
Index:  1368 :( No Matched
Index:  1369 :) Matched
Index:  1370 :) Matched
Index:  1371 :( No Matched
Index:  1372 :( No Matched
Index:  1373 :) Matched
Index:  1374 :) Matched
Index:  1375 

Index:  1659 :( No Matched
Index:  1660 :) Matched
Index:  1661 :( No Matched
Index:  1662 :( No Matched
Index:  1663 :( No Matched
Index:  1664 :) Matched
Index:  1665 :( No Matched
Index:  1666 :( No Matched
Index:  1667 :) Matched
Index:  1668 :( No Matched
Index:  1669 :( No Matched
Index:  1670 :( No Matched
Index:  1671 :) Matched
Index:  1672 :) Matched
Index:  1673 :( No Matched
Index:  1674 :( No Matched
Index:  1675 :( No Matched
Index:  1676 :) Matched
Index:  1677 :( No Matched
Index:  1678 :) Matched
Index:  1679 :) Matched
Index:  1680 :( No Matched
Index:  1681 :) Matched
Index:  1682 :) Matched
Index:  1683 :( No Matched
Index:  1684 :( No Matched
Index:  1685 :) Matched
Index:  1686 :( No Matched
Index:  1687 :( No Matched
Index:  1688 :) Matched
Index:  1689 :( No Matched
Index:  1690 :( No Matched
Index:  1691 :) Matched
Index:  1692 :( No Matched
Index:  1693 :) Matched
Index:  1694 :( No Matched
Index:  1695 :( No Matched
Index:  1696 :( No Matched
Index:  1697 :( 

Index:  1979 :( No Matched
Index:  1980 :) Matched
Index:  1981 :) Matched
Index:  1982 :) Matched
Index:  1983 :( No Matched
Index:  1984 :) Matched
Index:  1985 :( No Matched
Index:  1986 :( No Matched
Index:  1987 :( No Matched
Index:  1988 :( No Matched
Index:  1989 :( No Matched
Index:  1990 :( No Matched
Index:  1991 :( No Matched
Index:  1992 :) Matched
Index:  1993 :( No Matched
Index:  1994 :( No Matched
Index:  1995 :) Matched
Index:  1996 :( No Matched
Index:  1997 :) Matched
Index:  1998 :( No Matched
Index:  1999 :) Matched
Index:  2000 :) Matched
Index:  2001 :( No Matched
Index:  2002 :( No Matched
Index:  2003 :( No Matched
Index:  2004 :) Matched
Index:  2005 :( No Matched
Index:  2006 :( No Matched
Index:  2007 :) Matched
Index:  2008 :( No Matched
Index:  2009 :( No Matched
Index:  2010 :) Matched
Index:  2011 :) Matched
Index:  2012 :( No Matched
Index:  2013 :( No Matched
Index:  2014 :( No Matched
Index:  2015 :) Matched
Index:  2016 :( No Matched
Index:  2017 :( 

Index:  2303 :) Matched
Index:  2304 :( No Matched
Index:  2305 :( No Matched
Index:  2306 :) Matched
Index:  2307 :) Matched
Index:  2308 :( No Matched
Index:  2309 :( No Matched
Index:  2310 :) Matched
Index:  2311 :( No Matched
Index:  2312 :( No Matched
Index:  2313 :( No Matched
Index:  2314 :) Matched
Index:  2315 :( No Matched
Index:  2316 :( No Matched
Index:  2317 :( No Matched
Index:  2318 :) Matched
Index:  2319 :) Matched
Index:  2320 :) Matched
Index:  2321 :( No Matched
Index:  2322 :( No Matched
Index:  2323 :) Matched
Index:  2324 :) Matched
Index:  2325 :( No Matched
Index:  2326 :) Matched
Index:  2327 :( No Matched
Index:  2328 :) Matched
Index:  2329 :) Matched
Index:  2330 :( No Matched
Index:  2331 :( No Matched
Index:  2332 :( No Matched
Index:  2333 :) Matched
Index:  2334 :) Matched
Index:  2335 :( No Matched
Index:  2336 :( No Matched
Index:  2337 :( No Matched
Index:  2338 :) Matched
Index:  2339 :( No Matched
Index:  2340 :( No Matched
Index:  2341 :( No Mat

Index:  2623 :) Matched
Index:  2624 :( No Matched
Index:  2625 :) Matched
Index:  2626 :( No Matched
Index:  2627 :( No Matched
Index:  2628 :( No Matched
Index:  2629 :( No Matched
Index:  2630 :( No Matched
Index:  2631 :) Matched
Index:  2632 :) Matched
Index:  2633 :( No Matched
Index:  2634 :( No Matched
Index:  2635 :( No Matched
Index:  2636 :) Matched
Index:  2637 :( No Matched
Index:  2638 :( No Matched
Index:  2639 :) Matched
Index:  2640 :) Matched
Index:  2641 :) Matched
Index:  2642 :) Matched
Index:  2643 :) Matched
Index:  2644 :( No Matched
Index:  2645 :( No Matched
Index:  2646 :( No Matched
Index:  2647 :) Matched
Index:  2648 :) Matched
Index:  2649 :( No Matched
Index:  2650 :( No Matched
Index:  2651 :( No Matched
Index:  2652 :( No Matched
Index:  2653 :( No Matched
Index:  2654 :( No Matched
Index:  2655 :( No Matched
Index:  2656 :( No Matched
Index:  2657 :) Matched
Index:  2658 :) Matched
Index:  2659 :) Matched
Index:  2660 :) Matched
Index:  2661 :) Matche

Index:  2943 :) Matched
Index:  2944 :( No Matched
Index:  2945 :( No Matched
Index:  2946 :) Matched
Index:  2947 :( No Matched
Index:  2948 :( No Matched
Index:  2949 :) Matched
Index:  2950 :( No Matched
Index:  2951 :) Matched
Index:  2952 :( No Matched
Index:  2953 :) Matched
Index:  2954 :) Matched
Index:  2955 :( No Matched
Index:  2956 :) Matched
Index:  2957 :( No Matched
Index:  2958 :( No Matched
Index:  2959 :( No Matched
Index:  2960 :( No Matched
Index:  2961 :( No Matched
Index:  2962 :) Matched
Index:  2963 :( No Matched
Index:  2964 :) Matched
Index:  2965 :( No Matched
Index:  2966 :( No Matched
Index:  2967 :) Matched
Index:  2968 :) Matched
Index:  2969 :) Matched
Index:  2970 :( No Matched
Index:  2971 :( No Matched
Index:  2972 :( No Matched
Index:  2973 :( No Matched
Index:  2974 :) Matched
Index:  2975 :) Matched
Index:  2976 :( No Matched
Index:  2977 :( No Matched
Index:  2978 :( No Matched
Index:  2979 :( No Matched
Index:  2980 :) Matched
Index:  2981 :( No 

Index:  3264 :) Matched
Index:  3265 :( No Matched
Index:  3266 :) Matched
Index:  3267 :) Matched
Index:  3268 :( No Matched
Index:  3269 :( No Matched
Index:  3270 :( No Matched
Index:  3271 :) Matched
Index:  3272 :( No Matched
Index:  3273 :( No Matched
Index:  3274 :( No Matched
Index:  3275 :) Matched
Index:  3276 :( No Matched
Index:  3277 :( No Matched
Index:  3278 :( No Matched
Index:  3279 :) Matched
Index:  3280 :( No Matched
Index:  3281 :( No Matched
Index:  3282 :( No Matched
Index:  3283 :) Matched
Index:  3284 :) Matched
Index:  3285 :( No Matched
Index:  3286 :) Matched
Index:  3287 :( No Matched
Index:  3288 :) Matched
Index:  3289 :( No Matched
Index:  3290 :) Matched
Index:  3291 :) Matched
Index:  3292 :( No Matched
Index:  3293 :( No Matched
Index:  3294 :) Matched
Index:  3295 :) Matched
Index:  3296 :) Matched
Index:  3297 :) Matched
Index:  3298 :( No Matched
Index:  3299 :( No Matched
Index:  3300 :( No Matched
Index:  3301 :( No Matched
Index:  3302 :( No Mat

Index:  3588 :) Matched
Index:  3589 :( No Matched
Index:  3590 :) Matched
Index:  3591 :( No Matched
Index:  3592 :( No Matched
Index:  3593 :( No Matched
Index:  3594 :( No Matched
Index:  3595 :) Matched
Index:  3596 :( No Matched
Index:  3597 :( No Matched
Index:  3598 :) Matched
Index:  3599 :) Matched
Index:  3600 :( No Matched
Index:  3601 :( No Matched
Index:  3602 :( No Matched
Index:  3603 :( No Matched
Index:  3604 :( No Matched
Index:  3605 :) Matched
Index:  3606 :) Matched
Index:  3607 :) Matched
Index:  3608 :( No Matched
Index:  3609 :( No Matched
Index:  3610 :) Matched
Index:  3611 :( No Matched
Index:  3612 :( No Matched
Index:  3613 :) Matched
Index:  3614 :) Matched
Index:  3615 :( No Matched
Index:  3616 :( No Matched
Index:  3617 :( No Matched


In [34]:
filename = "/u1/mdr614/On the compleness of review comments/Notebook_on the compleness/RQ4/RQ4_test.csv"
df = pd.read_csv(filename)
df_cleaned = df.dropna()

print("Are the following comments clear and understandable? Give answer serially with reasoning.\n")

for index,row in df_cleaned.iterrows():
    if index>=301 and index<=350:
        i = index+2
        ccr  = row['CCR']
        label = row['label']
        
        if int(label) == 0:
            print('Index ',i,': ',ccr)
    


Are the following comments clear and understandable? Give answer serially with reasoning.

Index  303 :  "sqlBinds.getParamsMap()" can be replaced with  "params.entrySet()" 

since there is alreay a  params.

Index  304 :  pull the number into a constant. Know that I automatically -1 all inline constants in production code and save time all round.
Index  305 :  These methods are actually only used once and contain only a single line. Why not use them directly in place?
Index  306 :  How many of these modifications are known necessary for this particular IT vs just things that have worked in the past? If it's possible to use a stock ES docker image, that'd be awesome. I'm guessing it's not possible or you would have used that originally :)

Are these still necessary if we restrict it to a super small data set? 

Index  307 :  can we call this `getNodeLockId()` pls
Index  309 :  The interface currently declares that it throws `IOException` (I asked a question about it).
If it stays then 

In [None]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # Load the fine-tuned model and tokenizer
# model_path = "./flan_t5_large_finetuned"
# model = T5ForConditionalGeneration.from_pretrained(model_path)
# tokenizer = T5Tokenizer.from_pretrained(model_path)

# # Set model to evaluation mode
# model.eval()

# # Inference function
# def predict_clarity(ccr_comment):
#     input_text = f"Is this comment understandable? CCR: {ccr_comment}"
#     inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    
#     # Generate prediction
#     outputs = model.generate(inputs.input_ids, max_length=10)
    
#     # Decode the output
#     prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return prediction

# # Example usage with a new CCR comment
# new_ccr_comment = "Please ensure that this function is optimized."
# result = predict_clarity(new_ccr_comment)
# print("Predicted label:", result)
