In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/final-evaluation/bart_cobracorpus_predictions.csv
/kaggle/input/final-evaluation/refined_phi_counterspeech.csv
/kaggle/input/final-evaluation/refined_mistral_counterspeech.csv


In [2]:
!pip install rouge_score
!pip install bert_score
!pip install detoxify

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=03d33ba0274de694d8a6ab74feca3f927bffc666b579b7d85a1f2753fff18038
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-man

In [7]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from detoxify import Detoxify
from tqdm import tqdm

smoothie = SmoothingFunction().method4
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def evaluate_model(csv_path, model_name):
    df = pd.read_csv(csv_path)
    references = df["counterspeech"].astype(str).tolist()
    predictions = df["refined_counterspeech"].astype(str).tolist()

    bleu_scores = []
    rouge_l_scores = []

    for ref, pred in tqdm(zip(references, predictions), total=len(references), desc=f"Scoring {model_name}"):
        bleu = sentence_bleu([ref.split()], pred.split(), weights=(1, 0, 0, 0), smoothing_function=smoothie)   
        rouge_l = rouge.score(ref, pred)['rougeL'].fmeasure

        bleu_scores.append(bleu)
        rouge_l_scores.append(rouge_l)

    P,F1,R = bert_score(predictions, references, lang="en", verbose=False)
    P,F1,R = P.tolist(),F1.tolist(),R.tolist()

    # toxicity = Detoxify("original").predict(predictions)["toxicity"]

    result = {
        "Model": model_name,
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-L": sum(rouge_l_scores) / len(rouge_l_scores),
        "BERTScore-F1": sum(F1) / len(F1),
        # "Toxicity": sum(toxicity) / len(toxicity),
    }

    return result

# Example run for Mistral
mistral_result = evaluate_model("/kaggle/input/final-evaluation/refined_mistral_counterspeech.csv", "Mistral")

# Show/save results
results_df = pd.DataFrame([mistral_result])
print("\nFinal Evaluation Results:\n")
print(results_df)

results_df.to_csv("mistral_evaluation_results.csv", index=False)


Scoring Mistral: 100%|██████████| 2971/2971 [00:05<00:00, 525.63it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Final Evaluation Results:

     Model      BLEU   ROUGE-L  BERTScore-F1
0  Mistral  0.153323  0.166738      0.875891


In [14]:
import pandas as pd

df = pd.read_csv('/kaggle/input/final-evaluation/refined_mistral_counterspeech.csv')

informative = df[df['csType'] == 'Informative'].head(10)
questioning = df[df['csType'] == 'Questioning'].head(10)
denouncing = df[df['csType'] == 'Denouncing'].head(10)
positive = df[df['csType'] == 'Positive'].head(10)

combined_df = pd.concat([informative, questioning, denouncing, positive])

combined_df.to_csv('mistral_human.csv', index=False)

print(f"Saved {len(combined_df)} rows (10 per csType) to 'mistral_human.csv'")
print("csType counts in output file:")
print(combined_df['csType'].value_counts())

Saved 40 rows (10 per csType) to 'mistral_human.csv'
csType counts in output file:
csType
Informative    10
Questioning    10
Denouncing     10
Positive       10
Name: count, dtype: int64


In [15]:
import pandas as pd

df = pd.read_csv('/kaggle/input/final-evaluation/bart_cobracorpus_predictions.csv')

informative = df[df['csType'] == 'Informative'].head(10)
questioning = df[df['csType'] == 'Questioning'].head(10)
denouncing = df[df['csType'] == 'Denouncing'].head(10)
positive = df[df['csType'] == 'Positive'].head(10)

combined_df = pd.concat([informative, questioning, denouncing, positive])

combined_df.to_csv('Bartex_human.csv', index=False)

print(f"Saved {len(combined_df)} rows (10 per csType) to 'Bartex_human.csv'")
print("csType counts in output file:")
print(combined_df['csType'].value_counts())

Saved 40 rows (10 per csType) to 'Bartex_human.csv'
csType counts in output file:
csType
Informative    10
Questioning    10
Denouncing     10
Positive       10
Name: count, dtype: int64
