In [1]:
!pip install pandas rouge-score bert-score scikit-learn

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=3727ef8e5f9abdde34a33d2c4bcae9244a1a545e83b07a8c8de51fbd382297df
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [2]:
!pip install rouge bert-score

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [4]:
import pandas as pd

#Extracting original diseases so that the results from the models can be verified for proper inference
#Then it will be used as the ground truth for calculating the evaluation metrices
test_dataset = pd.read_csv('/kaggle/input/data-preparation-symptom-disease-classification/test_primary.csv')
original_diseases = test_dataset['disease'].tolist()

In [5]:
def validate_test_dataset(new_diseases):
    return new_diseases == original_diseases


In [6]:
#dictionary to store evaluation metrics for all models
llms_data = {}

In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, fbeta_score, confusion_matrix
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from rouge import Rouge
from bert_score import score as bert_score
from evaluate import load
import evaluate


def t2t_evaluate(model_name, model_type, csv_file_path):
    
    # Load the CSV file
    df = pd.read_csv(csv_file_path)

    # Extract actual problems and predictions as lists
    actual_problems = df['output'].tolist()
    predictions = df['prediction'].tolist()
    
    def custom_strip(s, chars):
        chars_set = set(chars)  # Convert list of chars to a set for faster membership checking

        # Strip characters from the start
        while s and s[0] in chars_set:
            s = s[1:]

        # Strip characters from the end
        while s and s[-1] in chars_set:
            s = s[:-1]

        return s

    # Function to segregate and extract health problems
    def extract_health_problems(predictions):
        extracted_problems = []
        followed_instruction_count = 0
        flouted_instruction_count = 0

        for pred in predictions:
            health_problem = pred
            if pred.startswith("Identified Health Problem : "):
                # Followed instruction
                health_problem = pred[len("Identified Health Problem : "):].strip()
                if '\n' not in health_problem or '\n' in health_problem and all(char in [' ', '\n'] for char in health_problem.split('\n')[1:]):
                    # Count as followed instruction
                    followed_instruction_count += 1
                else:
                    # Count as flouted instruction
                    flouted_instruction_count += 1
            elif pred.startswith("Identified Health Problem: "):
                # Followed instruction
                health_problem = pred[len("Identified Health Problem: "):].strip()
                if '\n' not in health_problem or '\n' in health_problem and all(char in [' ', '\n'] for char in health_problem.split('\n')[1:]):
                    # Count as followed instruction
                    followed_instruction_count += 1
                else:
                    # Count as flouted instruction
                    flouted_instruction_count += 1
            else:
                # Flouted instruction
                flouted_instruction_count += 1
            health_problem = health_problem.split('\n')[0].split('#')[0].split('.')[0]

            extracted_problems.append(health_problem)

        return extracted_problems, followed_instruction_count, flouted_instruction_count
    actual_problems, _, _ = extract_health_problems(actual_problems)
    predictions, followed, flouted = extract_health_problems(predictions)
    
    if (not validate_test_dataset(actual_problems)):
        return "failure"
    

    
    # Suppress warnings for calculate_rouge_score and calculate_bert_score functions
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        
        def calculate_rouge_score(generated_texts, reference_texts):
            rouge = evaluate.load('rouge')
            results = rouge.compute(predictions=generated_texts, references=reference_texts)
            #rouge_scores = rouge.get_scores(generated_texts, reference_texts)
            #return rouge_scores
            return results

        def calculate_bert_score(generated_texts, reference_texts):
            P, R, F1 = bert_score(generated_texts, reference_texts, lang='en', verbose=False)
            return {
                'precision': P.tolist(),
                'recall': R.tolist(),
                'f1_score': F1.tolist()
            }
    
    # Define a function to calculate classification metrics
    def calculate_classification_metrics(actual, predicted):
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
            accuracy = accuracy_score(actual, predicted)
            precision = precision_score(actual, predicted, average='weighted', zero_division=0)
            recall = recall_score(actual, predicted, average='weighted', zero_division=0)
            f1 = f1_score(actual, predicted, average='weighted', zero_division=0)
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
    
    # Calculate classification metrics
    classification_metrics = calculate_classification_metrics(actual_problems, predictions)


    # Text-to-Text Metrics
    # Calculate ROUGE scores
    rouge_scores = calculate_rouge_score(predictions, actual_problems)

    # Calculate BERT scores
    bert_scores = calculate_bert_score(predictions, actual_problems)
    


    # Calculate average scores
#     avg_rouge_precision_1 = np.mean([score['rouge-1']['p'] for score in rouge_scores])
#     avg_rouge_recall_1 = np.mean([score['rouge-1']['r'] for score in rouge_scores])
#     avg_rouge_f1_1 = np.mean([score['rouge-1']['f'] for score in rouge_scores])

#     avg_rouge_precision_2 = np.mean([score['rouge-2']['p'] for score in rouge_scores])
#     avg_rouge_recall_2 = np.mean([score['rouge-2']['r'] for score in rouge_scores])
#     avg_rouge_f1_2 = np.mean([score['rouge-2']['f'] for score in rouge_scores])

#     avg_rouge_precision_l = np.mean([score['rouge-l']['p'] for score in rouge_scores])
#     avg_rouge_recall_l = np.mean([score['rouge-l']['r'] for score in rouge_scores])
#     avg_rouge_f1_l = np.mean([score['rouge-l']['f'] for score in rouge_scores])

    avg_bert_precision = np.mean(bert_scores['precision'])
    avg_bert_recall = np.mean(bert_scores['recall'])
    avg_bert_f1 = np.mean(bert_scores['f1_score'])
    
    
    print("\nModel Name : ", model_name)
    print("Model Type : ", model_type)
    print("")
    val = followed*1.0/(followed + flouted)
    print("Valid Instruction Following Predictions Score : ", f'{val:.3f}')
    print("")
    
    print("Classification Metrics:\n")
    print(f"Accuracy: {classification_metrics['accuracy']:.3f}")
    print(f"Precision: {classification_metrics['precision']:.3f}")
    print(f"Recall: {classification_metrics['recall']:.3f}")
    print(f"F1 Score: {classification_metrics['f1_score']:.3f}")
    print("")
    
    
    print("Text to Text Metrics:\n")
    print("ROUGE Scores:")
    print(f"ROUGE-1: {rouge_scores['rouge1']:.3f}")
    print(f"ROUGE-2: {rouge_scores['rouge2']:.3f}")
    print(f"ROUGE-L: {rouge_scores['rougeL']:.3f}")
    print(f"ROUGE-Lsum: {rouge_scores['rougeLsum']:.3f}")
    print("")
    print(f"Average BERT Precision: {avg_bert_precision:.3f}")
    print(f"Average BERT Recall: {avg_bert_recall:.3f}")
    print(f"Average BERT F1 Score: {avg_bert_f1:.3f}")
    
    print("\n\n-------------------------------------------------------------------------------\n")
    
    print("\nmodel-name : ", model_name)
    print("model-type : ",model_type)
    print("")
    val = followed*1.0/(followed + flouted)
    print("valid-instruction-rating : ", f'{val:.3f}')
    print("")
    
    print("classification-metrics:\n")
    print(f"accuracy: {classification_metrics['accuracy']:.3f}")
    print(f"precision: {classification_metrics['precision']:.3f}")
    print(f"recall: {classification_metrics['recall']:.3f}")
    print(f"f1-score: {classification_metrics['f1_score']:.3f}")
    print("")
    
    
    print("text-to-text-metrics:\n")
    print("rouge-scores:")
    print(f"rouge-1: {rouge_scores['rouge1']:.3f}")
    print(f"rouge-2: {rouge_scores['rouge2']:.3f}")
    print(f"rouge-L: {rouge_scores['rougeL']:.3f}")
    print(f"rouge-lsum: {rouge_scores['rougeLsum']:.3f}")
    print("")
    print(f"bert-precision: {avg_bert_precision:.3f}")
    print(f"bert-recall: {avg_bert_recall:.3f}")
    print(f"bert-f1-score: {avg_bert_f1:.3f}")
    
# Initialize the model_name key if it doesn't exist
    if model_name not in llms_data:
        llms_data[model_name] = {}

    # Update the llms_data dictionary with rounded values directly
    llms_data[model_name].update({
        model_type: {
            "valid-instruction-rating": round(val, 3),
            "classification-metrics": {
                "accuracy": round(classification_metrics["accuracy"], 3),
                "precision": round(classification_metrics["precision"], 3),
                "recall": round(classification_metrics["recall"], 3),
                "f1-score": round(classification_metrics["f1_score"], 3)
            },
            "text-to-text-metrics": {
                "rouge-scores": {
                    "rouge-1": round(rouge_scores["rouge1"], 3),
                    "rouge-2": round(rouge_scores["rouge2"], 3),
                    "rouge-L": round(rouge_scores["rougeL"], 3),
                    "rouge-lsum": round(rouge_scores["rougeLsum"], 3)
                },
                "bert-precision": round(avg_bert_precision, 3),
                "bert-recall": round(avg_bert_recall, 3),
                "bert-f1-score": round(avg_bert_f1, 3)
            }
        }
    })

    # Print the updated dictionary
    print("\nUpdated llms_data dictionary:")
    return "success"

2024-06-15 08:29:52.783168: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-15 08:29:52.783285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-15 08:29:52.917651: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
t2t_evaluate(model_name='Qwen2-7B-Instruct'.lower(),model_type = 'pretrained', csv_file_path='/kaggle/input/dtest-with-predictions-qwen2-7b-instruct-pretrain/results/predictions-Qwen2-7B-Instruct-pretrained-output.csv')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  qwen2-7b-instruct
Model Type :  pretrained

Valid Instruction Following Predictions Score :  0.997

Classification Metrics:

Accuracy: 0.050
Precision: 0.253
Recall: 0.050
F1 Score: 0.064

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.200
ROUGE-2: 0.074
ROUGE-L: 0.200
ROUGE-Lsum: 0.199

Average BERT Precision: 0.828
Average BERT Recall: 0.858
Average BERT F1 Score: 0.842


-------------------------------------------------------------------------------


model-name :  qwen2-7b-instruct
model-type :  pretrained

valid-instruction-rating :  0.997

classification-metrics:

accuracy: 0.050
precision: 0.253
recall: 0.050
f1-score: 0.064

text-to-text-metrics:

rouge-scores:
rouge-1: 0.200
rouge-2: 0.074
rouge-L: 0.200
rouge-lsum: 0.199

bert-precision: 0.828
bert-recall: 0.858
bert-f1-score: 0.842

Updated llms_data dictionary:


'success'

In [9]:
t2t_evaluate(model_name='Qwen2-7B-Instruct'.lower(),model_type = 'fine-tuned', csv_file_path='/kaggle/input/dtest-with-predictions-qwen2-7b-instruct-fine-tune/results/predictions-Qwen2-7B-Instruct-fine-tuned-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  qwen2-7b-instruct
Model Type :  fine-tuned

Valid Instruction Following Predictions Score :  0.998

Classification Metrics:

Accuracy: 0.738
Precision: 0.931
Recall: 0.738
F1 Score: 0.778

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.870
ROUGE-2: 0.412
ROUGE-L: 0.870
ROUGE-Lsum: 0.871

Average BERT Precision: 0.962
Average BERT Recall: 0.978
Average BERT F1 Score: 0.970


-------------------------------------------------------------------------------


model-name :  qwen2-7b-instruct
model-type :  fine-tuned

valid-instruction-rating :  0.998

classification-metrics:

accuracy: 0.738
precision: 0.931
recall: 0.738
f1-score: 0.778

text-to-text-metrics:

rouge-scores:
rouge-1: 0.870
rouge-2: 0.412
rouge-L: 0.870
rouge-lsum: 0.871

bert-precision: 0.962
bert-recall: 0.978
bert-f1-score: 0.970

Updated llms_data dictionary:


'success'

In [10]:
t2t_evaluate(model_name='Gemma-7B-Instruct'.lower(),model_type = 'pretrained', csv_file_path='/kaggle/input/dtest-with-predictions-gemma-7b-it-pretrain/results/predictions-gemma-7b-it-pretrained-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  gemma-7b-instruct
Model Type :  pretrained

Valid Instruction Following Predictions Score :  0.134

Classification Metrics:

Accuracy: 0.057
Precision: 0.191
Recall: 0.057
F1 Score: 0.074

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.162
ROUGE-2: 0.066
ROUGE-L: 0.163
ROUGE-Lsum: 0.162

Average BERT Precision: 0.838
Average BERT Recall: 0.842
Average BERT F1 Score: 0.840


-------------------------------------------------------------------------------


model-name :  gemma-7b-instruct
model-type :  pretrained

valid-instruction-rating :  0.134

classification-metrics:

accuracy: 0.057
precision: 0.191
recall: 0.057
f1-score: 0.074

text-to-text-metrics:

rouge-scores:
rouge-1: 0.162
rouge-2: 0.066
rouge-L: 0.163
rouge-lsum: 0.162

bert-precision: 0.838
bert-recall: 0.842
bert-f1-score: 0.840

Updated llms_data dictionary:


'success'

In [11]:
t2t_evaluate(model_name='Gemma-7B-Instruct'.lower(),model_type = 'fine-tuned', csv_file_path='/kaggle/input/dtest-with-predictions-gemma-7b-instruct-fine-tune/results/predictions-gemma-7b-instruct-fine-tuned-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  gemma-7b-instruct
Model Type :  fine-tuned

Valid Instruction Following Predictions Score :  0.974

Classification Metrics:

Accuracy: 0.078
Precision: 0.288
Recall: 0.078
F1 Score: 0.101

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.461
ROUGE-2: 0.201
ROUGE-L: 0.462
ROUGE-Lsum: 0.461

Average BERT Precision: 0.860
Average BERT Recall: 0.933
Average BERT F1 Score: 0.895


-------------------------------------------------------------------------------


model-name :  gemma-7b-instruct
model-type :  fine-tuned

valid-instruction-rating :  0.974

classification-metrics:

accuracy: 0.078
precision: 0.288
recall: 0.078
f1-score: 0.101

text-to-text-metrics:

rouge-scores:
rouge-1: 0.461
rouge-2: 0.201
rouge-L: 0.462
rouge-lsum: 0.461

bert-precision: 0.860
bert-recall: 0.933
bert-f1-score: 0.895

Updated llms_data dictionary:


'success'

In [12]:
t2t_evaluate(model_name='Mistral-7B-Instruct-V0.2'.lower(),model_type = 'pretrained', csv_file_path='/kaggle/input/dtest-with-predictions-mistral-7b-it-v2-pretrained/results/predictions-mistral-7b-instruct-v0.2-pretrained-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  mistral-7b-instruct-v0.2
Model Type :  pretrained

Valid Instruction Following Predictions Score :  0.982

Classification Metrics:

Accuracy: 0.057
Precision: 0.192
Recall: 0.057
F1 Score: 0.065

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.154
ROUGE-2: 0.065
ROUGE-L: 0.155
ROUGE-Lsum: 0.155

Average BERT Precision: 0.835
Average BERT Recall: 0.842
Average BERT F1 Score: 0.838


-------------------------------------------------------------------------------


model-name :  mistral-7b-instruct-v0.2
model-type :  pretrained

valid-instruction-rating :  0.982

classification-metrics:

accuracy: 0.057
precision: 0.192
recall: 0.057
f1-score: 0.065

text-to-text-metrics:

rouge-scores:
rouge-1: 0.154
rouge-2: 0.065
rouge-L: 0.155
rouge-lsum: 0.155

bert-precision: 0.835
bert-recall: 0.842
bert-f1-score: 0.838

Updated llms_data dictionary:


'success'

In [13]:
t2t_evaluate(model_name='Mistral-7B-Instruct-V0.2'.lower(),model_type = 'fine-tuned', csv_file_path='/kaggle/input/dtest-with-predictions-mistral-7b-it-v2-fine-tuned/results/predictions-mistral-7b-instruct-v0.2-fine-tuned-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  mistral-7b-instruct-v0.2
Model Type :  fine-tuned

Valid Instruction Following Predictions Score :  0.437

Classification Metrics:

Accuracy: 0.383
Precision: 0.482
Recall: 0.383
F1 Score: 0.414

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.715
ROUGE-2: 0.365
ROUGE-L: 0.715
ROUGE-Lsum: 0.715

Average BERT Precision: 0.921
Average BERT Recall: 0.963
Average BERT F1 Score: 0.941


-------------------------------------------------------------------------------


model-name :  mistral-7b-instruct-v0.2
model-type :  fine-tuned

valid-instruction-rating :  0.437

classification-metrics:

accuracy: 0.383
precision: 0.482
recall: 0.383
f1-score: 0.414

text-to-text-metrics:

rouge-scores:
rouge-1: 0.715
rouge-2: 0.365
rouge-L: 0.715
rouge-lsum: 0.715

bert-precision: 0.921
bert-recall: 0.963
bert-f1-score: 0.941

Updated llms_data dictionary:


'success'

In [14]:
t2t_evaluate(model_name='Phi-3-Medium-4K-Instruct'.lower(),model_type = 'pretrained', csv_file_path='/kaggle/input/dtest-with-predictions-phi-3-med-4k-it-pretrain/results/predictions-Phi-3-medium-4k-instruct-pretrained-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  phi-3-medium-4k-instruct
Model Type :  pretrained

Valid Instruction Following Predictions Score :  0.416

Classification Metrics:

Accuracy: 0.154
Precision: 0.446
Recall: 0.154
F1 Score: 0.179

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.298
ROUGE-2: 0.080
ROUGE-L: 0.298
ROUGE-Lsum: 0.297

Average BERT Precision: 0.853
Average BERT Recall: 0.872
Average BERT F1 Score: 0.862


-------------------------------------------------------------------------------


model-name :  phi-3-medium-4k-instruct
model-type :  pretrained

valid-instruction-rating :  0.416

classification-metrics:

accuracy: 0.154
precision: 0.446
recall: 0.154
f1-score: 0.179

text-to-text-metrics:

rouge-scores:
rouge-1: 0.298
rouge-2: 0.080
rouge-L: 0.298
rouge-lsum: 0.297

bert-precision: 0.853
bert-recall: 0.872
bert-f1-score: 0.862

Updated llms_data dictionary:


'success'

In [15]:
t2t_evaluate(model_name='Phi-3-Medium-4K-Instruct'.lower(),model_type = 'fine-tuned', csv_file_path='/kaggle/input/dtest-with-predictions-phi-3-med-4k-it-fine-tuned/results/predictions-Phi-3-medium-4k-instruct-fine-tuned-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  phi-3-medium-4k-instruct
Model Type :  fine-tuned

Valid Instruction Following Predictions Score :  0.968

Classification Metrics:

Accuracy: 0.015
Precision: 0.081
Recall: 0.015
F1 Score: 0.020

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.482
ROUGE-2: 0.222
ROUGE-L: 0.482
ROUGE-Lsum: 0.481

Average BERT Precision: 0.859
Average BERT Recall: 0.942
Average BERT F1 Score: 0.898


-------------------------------------------------------------------------------


model-name :  phi-3-medium-4k-instruct
model-type :  fine-tuned

valid-instruction-rating :  0.968

classification-metrics:

accuracy: 0.015
precision: 0.081
recall: 0.015
f1-score: 0.020

text-to-text-metrics:

rouge-scores:
rouge-1: 0.482
rouge-2: 0.222
rouge-L: 0.482
rouge-lsum: 0.481

bert-precision: 0.859
bert-recall: 0.942
bert-f1-score: 0.898

Updated llms_data dictionary:


'success'

In [16]:
t2t_evaluate(model_name='Medical-LLaMA3-8B'.lower(),model_type = 'pretrained', csv_file_path='/kaggle/input/dtest-with-predictions-medical-llama3-8b-pretrain/results/predictions-Medical-Llama3-8B-pretrained-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  medical-llama3-8b
Model Type :  pretrained

Valid Instruction Following Predictions Score :  0.013

Classification Metrics:

Accuracy: 0.139
Precision: 0.354
Recall: 0.139
F1 Score: 0.157

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.175
ROUGE-2: 0.070
ROUGE-L: 0.174
ROUGE-Lsum: 0.174

Average BERT Precision: 0.855
Average BERT Recall: 0.822
Average BERT F1 Score: 0.838


-------------------------------------------------------------------------------


model-name :  medical-llama3-8b
model-type :  pretrained

valid-instruction-rating :  0.013

classification-metrics:

accuracy: 0.139
precision: 0.354
recall: 0.139
f1-score: 0.157

text-to-text-metrics:

rouge-scores:
rouge-1: 0.175
rouge-2: 0.070
rouge-L: 0.174
rouge-lsum: 0.174

bert-precision: 0.855
bert-recall: 0.822
bert-f1-score: 0.838

Updated llms_data dictionary:


'success'

In [17]:
t2t_evaluate(model_name='Medical-LLaMA3-8B'.lower(),model_type = 'fine-tuned', csv_file_path='/kaggle/input/dtest-with-predictions-medical-llama3-8b-fine-tune/results/predictions-Medical-Llama3-8B-fine-tuned-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  medical-llama3-8b
Model Type :  fine-tuned

Valid Instruction Following Predictions Score :  0.995

Classification Metrics:

Accuracy: 0.057
Precision: 0.204
Recall: 0.057
F1 Score: 0.069

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.487
ROUGE-2: 0.223
ROUGE-L: 0.487
ROUGE-Lsum: 0.487

Average BERT Precision: 0.862
Average BERT Recall: 0.943
Average BERT F1 Score: 0.900


-------------------------------------------------------------------------------


model-name :  medical-llama3-8b
model-type :  fine-tuned

valid-instruction-rating :  0.995

classification-metrics:

accuracy: 0.057
precision: 0.204
recall: 0.057
f1-score: 0.069

text-to-text-metrics:

rouge-scores:
rouge-1: 0.487
rouge-2: 0.223
rouge-L: 0.487
rouge-lsum: 0.487

bert-precision: 0.862
bert-recall: 0.943
bert-f1-score: 0.900

Updated llms_data dictionary:


'success'

In [18]:
t2t_evaluate(model_name='LLaMA-3-8B-Instruct',model_type = 'pretrained', csv_file_path='/kaggle/input/dtest-with-predictions-llama-3-8b-instruct-pretra/results/predictions-llama-3-8b-Instruct-pretrained-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  LLaMA-3-8B-Instruct
Model Type :  pretrained

Valid Instruction Following Predictions Score :  0.751

Classification Metrics:

Accuracy: 0.107
Precision: 0.237
Recall: 0.107
F1 Score: 0.123

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.237
ROUGE-2: 0.071
ROUGE-L: 0.238
ROUGE-Lsum: 0.238

Average BERT Precision: 0.839
Average BERT Recall: 0.869
Average BERT F1 Score: 0.853


-------------------------------------------------------------------------------


model-name :  LLaMA-3-8B-Instruct
model-type :  pretrained

valid-instruction-rating :  0.751

classification-metrics:

accuracy: 0.107
precision: 0.237
recall: 0.107
f1-score: 0.123

text-to-text-metrics:

rouge-scores:
rouge-1: 0.237
rouge-2: 0.071
rouge-L: 0.238
rouge-lsum: 0.238

bert-precision: 0.839
bert-recall: 0.869
bert-f1-score: 0.853

Updated llms_data dictionary:


'success'

In [19]:
t2t_evaluate(model_name='LLaMA-3-8B-Instruct',model_type = 'fine-tuned', csv_file_path='/kaggle/input/dtest-with-predictions-llama-3-8b-inst-fine-tuned/results/predictions-llama-3-8b-Instruct-fine-tuned-output.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Name :  LLaMA-3-8B-Instruct
Model Type :  fine-tuned

Valid Instruction Following Predictions Score :  1.000

Classification Metrics:

Accuracy: 0.961
Precision: 0.955
Recall: 0.961
F1 Score: 0.957

Text to Text Metrics:

ROUGE Scores:
ROUGE-1: 0.966
ROUGE-2: 0.449
ROUGE-L: 0.966
ROUGE-Lsum: 0.966

Average BERT Precision: 0.994
Average BERT Recall: 0.994
Average BERT F1 Score: 0.994


-------------------------------------------------------------------------------


model-name :  LLaMA-3-8B-Instruct
model-type :  fine-tuned

valid-instruction-rating :  1.000

classification-metrics:

accuracy: 0.961
precision: 0.955
recall: 0.961
f1-score: 0.957

text-to-text-metrics:

rouge-scores:
rouge-1: 0.966
rouge-2: 0.449
rouge-L: 0.966
rouge-lsum: 0.966

bert-precision: 0.994
bert-recall: 0.994
bert-f1-score: 0.994

Updated llms_data dictionary:


'success'

In [20]:
# llm_model_data = {}

In [21]:
# llm_model_data["qwen2-7b-instruct"]= {
#     "pretrained": {
#         "valid-instruction-rating": 0.997,
#         "classification-metrics": {
#             "accuracy": 0.050,
#             "precision": 0.253,
#             "recall": 0.050,
#             "f1-score": 0.064
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.200,
#                 "rouge-2": 0.073,
#                 "rouge-L": 0.200,
#                 "rouge-lsum": 0.200
#             },
#             "bert-precision": 0.828,
#             "bert-recall": 0.858,
#             "bert-f1-score": 0.842
#         }
#     },
#     "fine-tuned": {
#         "valid-instruction-rating": 0.998,
#         "classification-metrics": {
#             "accuracy": 0.738,
#             "precision": 0.931,
#             "recall": 0.738,
#             "f1-score": 0.778
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.871,
#                 "rouge-2": 0.411,
#                 "rouge-L": 0.871,
#                 "rouge-lsum": 0.871
#             },
#             "bert-precision": 0.962,
#             "bert-recall": 0.978,
#             "bert-f1-score": 0.970
#         }
#     }
# }

In [22]:
# llm_model_data["gemma-7b-instruct"] = {
#     "pretrained": {
#         "valid-instruction-rating": 0.134,
#         "classification-metrics": {
#             "accuracy": 0.057,
#             "precision": 0.191,
#             "recall": 0.057,
#             "f1-score": 0.074
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.162,
#                 "rouge-2": 0.066,
#                 "rouge-L": 0.162,
#                 "rouge-lsum": 0.162
#             },
#             "bert-precision": 0.838,
#             "bert-recall": 0.842,
#             "bert-f1-score": 0.840
#         }
#     },
#     "fine-tuned": {
#         "valid-instruction-rating": 0.974,
#         "classification-metrics": {
#             "accuracy": 0.078,
#             "precision": 0.288,
#             "recall": 0.078,
#             "f1-score": 0.101
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.461,
#                 "rouge-2": 0.201,
#                 "rouge-L": 0.461,
#                 "rouge-lsum": 0.461
#             },
#             "bert-precision": 0.860,
#             "bert-recall": 0.933,
#             "bert-f1-score": 0.895
#         }
#     }
# }


In [23]:
# llm_model_data["mistral-7b-instruct-v0.2"] = {
#     "pretrained": {
#         "valid-instruction-rating": 0.982,
#         "classification-metrics": {
#             "accuracy": 0.057,
#             "precision": 0.192,
#             "recall": 0.057,
#             "f1-score": 0.065
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.155,
#                 "rouge-2": 0.064,
#                 "rouge-L": 0.155,
#                 "rouge-lsum": 0.154
#             },
#             "bert-precision": 0.835,
#             "bert-recall": 0.842,
#             "bert-f1-score": 0.838
#         }
#     },
#     "fine-tuned": {
#         "valid-instruction-rating": 0.437,
#         "classification-metrics": {
#             "accuracy": 0.383,
#             "precision": 0.482,
#             "recall": 0.383,
#             "f1-score": 0.414
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.715,
#                 "rouge-2": 0.364,
#                 "rouge-L": 0.714,
#                 "rouge-lsum": 0.715
#             },
#             "bert-precision": 0.921,
#             "bert-recall": 0.963,
#             "bert-f1-score": 0.941
#         }
#     }
# }


In [24]:
# llm_model_data["phi-3-medium-4k-instruct"] = {
#     "pretrained": {
#         "valid-instruction-rating": 0.416,
#         "classification-metrics": {
#             "accuracy": 0.154,
#             "precision": 0.446,
#             "recall": 0.154,
#             "f1-score": 0.179
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.298,
#                 "rouge-2": 0.079,
#                 "rouge-L": 0.298,
#                 "rouge-lsum": 0.298
#             },
#             "bert-precision": 0.853,
#             "bert-recall": 0.872,
#             "bert-f1-score": 0.862
#         }
#     },
#     "fine-tuned": {
#         "valid-instruction-rating": 0.968,
#         "classification-metrics": {
#             "accuracy": 0.015,
#             "precision": 0.081,
#             "recall": 0.015,
#             "f1-score": 0.020
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.482,
#                 "rouge-2": 0.222,
#                 "rouge-L": 0.482,
#                 "rouge-lsum": 0.482
#             },
#             "bert-precision": 0.859,
#             "bert-recall": 0.942,
#             "bert-f1-score": 0.898
#         }
#     }
# }


In [25]:
# llm_model_data["medical-llama3-8b"] = {
#     "pretrained": {
#         "valid-instruction-rating": 0.013,
#         "classification-metrics": {
#             "accuracy": 0.139,
#             "precision": 0.354,
#             "recall": 0.139,
#             "f1-score": 0.157
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.174,
#                 "rouge-2": 0.069,
#                 "rouge-L": 0.174,
#                 "rouge-lsum": 0.174
#             },
#             "bert-precision": 0.855,
#             "bert-recall": 0.822,
#             "bert-f1-score": 0.838
#         }
#     },
#     "fine-tuned": {
#         "valid-instruction-rating": 0.995,
#         "classification-metrics": {
#             "accuracy": 0.057,
#             "precision": 0.204,
#             "recall": 0.057,
#             "f1-score": 0.069
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.488,
#                 "rouge-2": 0.223,
#                 "rouge-L": 0.487,
#                 "rouge-lsum": 0.487
#             },
#             "bert-precision": 0.862,
#             "bert-recall": 0.943,
#             "bert-f1-score": 0.900
#         }
#     }
# }


In [26]:
# llm_model_data["llama-3-8b-instruct"] = {
#     "pretrained": {
#         "valid-instruction-rating": 0.751,
#         "classification-metrics": {
#             "accuracy": 0.107,
#             "precision": 0.237,
#             "recall": 0.107,
#             "f1-score": 0.123
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.238,
#                 "rouge-2": 0.071,
#                 "rouge-L": 0.237,
#                 "rouge-lsum": 0.238
#             },
#             "bert-precision": 0.839,
#             "bert-recall": 0.869,
#             "bert-f1-score": 0.853
#         }
#     },
#     "fine-tuned": {
#         "valid-instruction-rating": 1.000,
#         "classification-metrics": {
#             "accuracy": 0.961,
#             "precision": 0.955,
#             "recall": 0.961,
#             "f1-score": 0.957
#         },
#         "text-to-text-metrics": {
#             "rouge-scores": {
#                 "rouge-1": 0.966,
#                 "rouge-2": 0.448,
#                 "rouge-L": 0.966,
#                 "rouge-lsum": 0.966
#             },
#             "bert-precision": 0.994,
#             "bert-recall": 0.994,
#             "bert-f1-score": 0.994
#         }
#     }
# }


In [27]:
import os
import json

# Define the directory path
directory = '/kaggle/working/evaluation'

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

# Define the file path
file_path = os.path.join(directory, 'medical-chatbot-models-evaluation.json')

# Assuming llms_data contains the dictionary you want to save
with open(file_path, 'w') as json_file:
    json.dump(llms_data, json_file, indent=4)


In [28]:
import csv
# Define file path
file_path = "/kaggle/working/evaluation/medical-chatbot-models-evaluation.csv"

# Create a CSV file and write header
with open(file_path, mode='w', newline='') as file:
    fieldnames = ['model-name', 'type', 'valid-instruction-following', 'accuracy', 'precision', 'recall', 'f1-score', 
                  'rouge-1', 'rouge-2', 'rouge-l', 'rouge-lsum', 'bert-precision', 'bert-recall', 'bert-f1-score']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    
    writer.writeheader()
    
    for model_name, model_types in llms_data.items():
        for model_type, metrics in model_types.items():
            writer.writerow({
                'model-name': model_name,
                'type': model_type,
                'valid-instruction-following': metrics['valid-instruction-rating'],
                'accuracy': metrics['classification-metrics']['accuracy'],
                'precision': metrics['classification-metrics']['precision'],
                'recall': metrics['classification-metrics']['recall'],
                'f1-score': metrics['classification-metrics']['f1-score'],
                'rouge-1': metrics['text-to-text-metrics']['rouge-scores']['rouge-1'],
                'rouge-2': metrics['text-to-text-metrics']['rouge-scores']['rouge-2'],
                'rouge-l': metrics['text-to-text-metrics']['rouge-scores']['rouge-L'],
                'rouge-lsum': metrics['text-to-text-metrics']['rouge-scores']['rouge-lsum'],
                'bert-precision': metrics['text-to-text-metrics']['bert-precision'],
                'bert-recall': metrics['text-to-text-metrics']['bert-recall'],
                'bert-f1-score': metrics['text-to-text-metrics']['bert-f1-score']
            })

print("CSV file created successfully at:", file_path)

CSV file created successfully at: /kaggle/working/evaluation/medical-chatbot-models-evaluation.csv
