In [None]:
import os
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Root directory scanned by the loading loop: every sub-directory is treated as one patient
# folder and is expected to contain 'model_responses.txt' and 'reference_responses.txt'.
root_dir = 'path_to_root_folder'  # Replace with your root folder path

In [None]:
# Accumulators filled by the loading loop. The two lists are extended in lock-step,
# so element i of one is paired with element i of the other.
model_responses = []
reference_responses = []

In [None]:
# Load the paired model/reference responses from every patient folder under `root_dir`.

# Start from empty lists so that re-running this cell does not append the same data twice.
model_responses.clear()
reference_responses.clear()

# os.listdir() returns entries in arbitrary order; sorting keeps the response indices used
# by the later plots identical from run to run.
for patient_folder in sorted(os.listdir(root_dir)):
    patient_path = os.path.join(root_dir, patient_folder)
    if not os.path.isdir(patient_path):
        continue
    print(f"Processing {patient_folder}...")

    # Paths to the response files
    model_file = os.path.join(patient_path, 'model_responses.txt')
    reference_file = os.path.join(patient_path, 'reference_responses.txt')

    # Both files are needed to form (prediction, reference) pairs
    if not (os.path.isfile(model_file) and os.path.isfile(reference_file)):
        print(f"Warning: Missing files in {patient_folder}")
        continue

    # One response per line. The encoding is explicit so results do not depend on the
    # platform default, and the trailing newline is dropped so it is not part of the text.
    with open(model_file, 'r', encoding='utf-8') as f:
        model_texts = [line.rstrip('\n') for line in f]
    with open(reference_file, 'r', encoding='utf-8') as f:
        reference_texts = [line.rstrip('\n') for line in f]

    # Only the first min_length lines of each file can be paired; say so when lines are dropped
    min_length = min(len(model_texts), len(reference_texts))
    if len(model_texts) != len(reference_texts):
        print(
            f"Warning: {patient_folder} has {len(model_texts)} model responses and "
            f"{len(reference_texts)} reference responses; using the first {min_length} pairs"
        )
    model_responses.extend(model_texts[:min_length])
    reference_responses.extend(reference_texts[:min_length])

In [None]:
# Score every (reference, prediction) pair with BLEU, ROUGE-1/ROUGE-L F1, BERTScore F1
# and TF-IDF cosine similarity. Each result list is aligned with `reference_responses`.

# Fail early with a clear message instead of an opaque error from a scoring library.
if not reference_responses:
    raise ValueError("No response pairs were loaded; check root_dir before computing metrics")

# Initialize metrics lists
bleu_scores = []
rouge1_scores = []
rougeL_scores = []

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Initialize TF-IDF Vectorizer. norm='l2' (the default) is spelled out because the cosine
# computation below relies on every row being unit length.
vectorizer = TfidfVectorizer(norm='l2')

# Calculate per-pair BLEU and ROUGE
for ref, pred in zip(reference_responses, model_responses):
    # BLEU Score on whitespace tokens
    bleu_scores.append(sentence_bleu([ref.split()], pred.split()))

    # ROUGE Scores (F-measure of ROUGE-1 and ROUGE-L)
    rouge_scores = scorer.score(ref, pred)
    rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
    rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

# Calculate BERTScore for all pairs in one batched call; only F1 is kept
P, R, F1 = score(model_responses, reference_responses, lang='en', verbose=True)
bert_f1_scores = F1.tolist()

# Calculate Cosine Similarities. The vocabulary is fitted on references and predictions
# together so both halves share one feature space.
num_pairs = len(reference_responses)
all_text = reference_responses + model_responses
tfidf_matrix = vectorizer.fit_transform(all_text)
reference_vectors = tfidf_matrix[:num_pairs]
model_vectors = tfidf_matrix[num_pairs:]
# Only the similarity of pair i with itself is needed. For L2-normalised rows that is the
# row-wise dot product, which avoids building the full num_pairs x num_pairs matrix.
cosine_similarities = np.asarray(
    reference_vectors.multiply(model_vectors).sum(axis=1)
).ravel().tolist()

In [None]:
# 1-based response indices, one per entry of reference_responses; used as the x-axis of the per-response plots.
x = range(1, len(reference_responses) + 1)

In [None]:
# One figure per metric family, described as data:
# (figure title, y-axis label, [(series, legend label), ...])
plot_specs = [
    ('BLEU Score Per Response', 'BLEU Score',
     [(bleu_scores, 'BLEU Score')]),
    ('ROUGE Scores Per Response', 'ROUGE F1 Score',
     [(rouge1_scores, 'ROUGE-1 F1'), (rougeL_scores, 'ROUGE-L F1')]),
    ('BERTScore F1 Per Response', 'BERTScore F1',
     [(bert_f1_scores, 'BERTScore F1')]),
    ('Cosine Similarity Per Response', 'Cosine Similarity',
     [(cosine_similarities, 'Cosine Similarity')]),
]

# Draw each figure in turn; every series shares the response index `x`.
for plot_title, y_label, series_list in plot_specs:
    plt.figure()
    for values, legend_label in series_list:
        plt.plot(x, values, label=legend_label)
    plt.title(plot_title)
    plt.xlabel('Response Index')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Print the mean of every metric over all responses, one line per metric.
summary_rows = [
    ("Average BLEU Score:", bleu_scores),
    ("Average ROUGE-1 F1:", rouge1_scores),
    ("Average ROUGE-L F1:", rougeL_scores),
    ("Average BERTScore F1:", bert_f1_scores),
    ("Average Cosine Similarity:", cosine_similarities),
]
for heading, values in summary_rows:
    print(heading, np.mean(values))

In [None]:
# Gather the per-response index and all metric lists (computed in earlier cells) into one
# table: one row per response, one column per metric.
column_names = [
    'Response Index',
    'BLEU Score',
    'ROUGE-1 F1',
    'ROUGE-L F1',
    'BERTScore F1',
    'Cosine Similarity',
]
column_values = [
    x,
    bleu_scores,
    rouge1_scores,
    rougeL_scores,
    bert_f1_scores,
    cosine_similarities,
]
data = dict(zip(column_names, column_values))

df = pd.DataFrame(data)

In [None]:
# Small-multiples overview: one panel per metric, plotted against the response index.
grid_metrics = ['BLEU Score', 'ROUGE-1 F1', 'ROUGE-L F1', 'BERTScore F1', 'Cosine Similarity']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
axes = axes.flatten()

# Each panel uses the metric name as both its title and its y-axis label.
for ax, metric in zip(axes, grid_metrics):
    ax.plot(df['Response Index'], df[metric])
    ax.set_title(metric)
    ax.set_xlabel('Response Index')
    ax.set_ylabel(metric)
    ax.grid(True)

# The 2x3 grid has six slots but there are only five metrics; hide the leftover panel
# so the figure does not show an empty axes frame.
for unused_ax in axes[len(grid_metrics):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Box plot comparing the spread of each metric across all responses.
boxplot_metrics = ['BLEU Score', 'ROUGE-1 F1', 'ROUGE-L F1', 'BERTScore F1', 'Cosine Similarity']
metric_values = df[boxplot_metrics]

sns.set_palette('Set1')
sns.boxplot(data=metric_values)
plt.title('Distribution of Metrics')
plt.show()

In [None]:
# Bar chart of the mean value of each metric.
# 'Response Index' is only the x-axis position, not a score. Averaging it would add a
# meaningless bar that dwarfs the real metrics, so it is dropped before taking the mean.
# errors='ignore' keeps the cell working when `df` has no such column.
averages = df.drop(columns=['Response Index'], errors='ignore').mean()
sns.barplot(x=averages.index, y=averages.values)
plt.title('Average Scores of Metrics')
plt.xlabel('Metrics')
plt.ylabel('Average Score')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Interactive overlay of every metric against the response index.
line_metrics = ['BLEU Score', 'ROUGE-1 F1', 'ROUGE-L F1', 'BERTScore F1', 'Cosine Similarity']
fig = px.line(
    df,
    x='Response Index',
    y=line_metrics,
    title='Metrics Over Responses',
)
fig.show()

**Version 2**

In [None]:
import os
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

In [None]:
# Root directory with one sub-folder per patient. Per-patient plots are saved inside each
# patient folder and the overall statistics folder is created directly under this root.
root_dir = 'path_to_root_folder'  # Replace with your root folder path

In [None]:
# Paired texts pooled across all patient folders; filled by the per-patient loop and
# used afterwards to compute the overall metrics.
all_model_responses = []
all_reference_responses = []

In [None]:
# Per-patient evaluation: for every patient folder under `root_dir`, score the paired
# responses, save one PNG per metric into '<patient>/plots', and pool the texts for the
# overall statistics.


def _save_line_plot(series, title, ylabel, out_path):
    """Save a line chart of per-response scores to `out_path`.

    series   -- list of (scores, label) tuples drawn on the same axes; a label of None
                adds no legend entry, and the legend is omitted when no label is given.
    title    -- figure title.
    ylabel   -- y-axis label.
    out_path -- destination image path.
    """
    plt.figure()
    for scores, label in series:
        plt.plot(range(1, len(scores) + 1), scores, label=label)
    plt.title(title)
    plt.xlabel('Response Index')
    plt.ylabel(ylabel)
    if any(label is not None for _, label in series):
        plt.legend()
    plt.savefig(out_path)
    # Close the figure so memory does not grow with the number of patients.
    plt.close()


# Start from empty lists so that re-running this cell does not pool the same data twice.
all_model_responses.clear()
all_reference_responses.clear()

# The scorer is stateless across pairs, so it is built once rather than once per patient.
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# os.listdir() returns entries in arbitrary order; sorting keeps the pooled order reproducible.
for patient_folder in sorted(os.listdir(root_dir)):
    patient_path = os.path.join(root_dir, patient_folder)
    if not os.path.isdir(patient_path):
        continue
    print(f"Processing {patient_folder}...")

    # Paths to the response files
    model_file = os.path.join(patient_path, 'model_responses.txt')
    reference_file = os.path.join(patient_path, 'reference_responses.txt')

    # Both files are needed to form (prediction, reference) pairs
    if not (os.path.isfile(model_file) and os.path.isfile(reference_file)):
        print(f"Warning: Missing files in {patient_folder}")
        continue

    # One response per line. The encoding is explicit so results do not depend on the
    # platform default, and the trailing newline is dropped so it is not part of the text.
    with open(model_file, 'r', encoding='utf-8') as f:
        model_texts = [line.rstrip('\n') for line in f]
    with open(reference_file, 'r', encoding='utf-8') as f:
        reference_texts = [line.rstrip('\n') for line in f]

    # Only the first min_length lines of each file can be paired
    min_length = min(len(model_texts), len(reference_texts))
    if min_length == 0:
        # Scoring an empty set would raise inside the metric libraries and abort the whole
        # run, so skip this patient instead.
        print(f"Warning: No responses to score in {patient_folder}")
        continue
    model_responses = model_texts[:min_length]
    reference_responses = reference_texts[:min_length]

    # Calculate metrics for this patient
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split())
        for ref, pred in zip(reference_responses, model_responses)
    ]
    # Score each pair once and read both ROUGE variants from the same result.
    rouge_results = [
        rouge_scorer_obj.score(ref, pred)
        for ref, pred in zip(reference_responses, model_responses)
    ]
    rouge1_scores = [result['rouge1'].fmeasure for result in rouge_results]
    rougeL_scores = [result['rougeL'].fmeasure for result in rouge_results]
    P, R, F1 = score(model_responses, reference_responses, lang='en', verbose=False)
    bert_f1_scores = F1.tolist()

    # TF-IDF is fitted on this patient's references and predictions together so both
    # halves share one feature space. norm='l2' (the default) is spelled out because the
    # row-wise dot product below equals cosine similarity only for unit-length rows.
    vectorizer = TfidfVectorizer(norm='l2')
    tfidf_matrix = vectorizer.fit_transform(reference_responses + model_responses)
    reference_vectors = tfidf_matrix[:min_length]
    model_vectors = tfidf_matrix[min_length:]
    # Only pair i against pair i is needed, so skip the full min_length x min_length matrix.
    cosine_similarities = np.asarray(
        reference_vectors.multiply(model_vectors).sum(axis=1)
    ).ravel().tolist()

    # Create plots folder if it doesn't exist
    plots_folder = os.path.join(patient_path, 'plots')
    os.makedirs(plots_folder, exist_ok=True)

    _save_line_plot(
        [(bleu_scores, None)],
        'BLEU Score Per Response', 'BLEU Score',
        os.path.join(plots_folder, 'bleu_scores.png'),
    )
    _save_line_plot(
        [(rouge1_scores, 'ROUGE-1 F1'), (rougeL_scores, 'ROUGE-L F1')],
        'ROUGE Scores Per Response', 'ROUGE F1 Score',
        os.path.join(plots_folder, 'rouge_scores.png'),
    )
    _save_line_plot(
        [(bert_f1_scores, None)],
        'BERTScore F1 Per Response', 'BERTScore F1',
        os.path.join(plots_folder, 'bert_f1_scores.png'),
    )
    _save_line_plot(
        [(cosine_similarities, None)],
        'Cosine Similarity Per Response', 'Cosine Similarity',
        os.path.join(plots_folder, 'cosine_similarities.png'),
    )

    # Pool this patient's texts for the overall statistics
    all_model_responses.extend(model_responses)
    all_reference_responses.extend(reference_responses)

In [None]:
# Compute overall metrics over the responses pooled from every patient, then save a
# box plot and a bar chart of the results under '<root_dir>/llama_11b_statistics'.

# Fail early with a clear message instead of an opaque error from a scoring library.
if not all_reference_responses:
    raise ValueError("No response pairs were collected; run the per-patient cell with a valid root_dir first")

overall_bleu_scores = [
    sentence_bleu([ref.split()], pred.split())
    for ref, pred in zip(all_reference_responses, all_model_responses)
]

# Score each pair once and read both ROUGE variants from the same result.
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
overall_rouge_results = [
    rouge_scorer_obj.score(ref, pred)
    for ref, pred in zip(all_reference_responses, all_model_responses)
]
overall_rouge1_scores = [result['rouge1'].fmeasure for result in overall_rouge_results]
overall_rougeL_scores = [result['rougeL'].fmeasure for result in overall_rouge_results]

P, R, F1 = score(all_model_responses, all_reference_responses, lang='en', verbose=False)
overall_bert_f1_scores = F1.tolist()

# TF-IDF is fitted on references and predictions together so both halves share one
# feature space. norm='l2' (the default) is spelled out because the row-wise dot product
# below equals cosine similarity only for unit-length rows.
vectorizer = TfidfVectorizer(norm='l2')
num_pairs = len(all_reference_responses)
tfidf_matrix = vectorizer.fit_transform(all_reference_responses + all_model_responses)
# Both halves are split at the same boundary, the number of references.
reference_vectors = tfidf_matrix[:num_pairs]
model_vectors = tfidf_matrix[num_pairs:]
# Only pair i against pair i is needed, so skip the full num_pairs x num_pairs matrix,
# which grows quadratically with the pooled response count.
overall_cosine_similarities = np.asarray(
    reference_vectors.multiply(model_vectors).sum(axis=1)
).ravel().tolist()

# Create dataframe for overall metrics (one row per response pair)
data = {
    'BLEU Score': overall_bleu_scores,
    'ROUGE-1 F1': overall_rouge1_scores,
    'ROUGE-L F1': overall_rougeL_scores,
    'BERTScore F1': overall_bert_f1_scores,
    'Cosine Similarity': overall_cosine_similarities
}
df = pd.DataFrame(data)

# Create 'llama_11b_statistics' folder if it doesn't exist
statistics_folder = os.path.join(root_dir, 'llama_11b_statistics')
os.makedirs(statistics_folder, exist_ok=True)

# Save Box Plot for Metric Distributions
plt.figure()
df.boxplot()
plt.title('Distribution of Metrics')
plt.savefig(os.path.join(statistics_folder, 'metric_distributions.png'))
plt.close()

# Save Bar Chart for Average Scores
averages = df.mean()
plt.figure()
averages.plot(kind='bar')
plt.title('Average Scores of Metrics')
plt.xlabel('Metrics')
plt.ylabel('Average Score')
plt.xticks(rotation=45)
# Rotated tick labels extend below the default axes area; re-fit the layout so they are
# not clipped in the saved image.
plt.tight_layout()
plt.savefig(os.path.join(statistics_folder, 'average_scores.png'))
plt.close()