In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
# TODO: Fill in the path where you download the Assignment folder into
ROOT_PATH = "/content/drive/MyDrive/EPFL/Modern_NLP_2023/project-m2-ikun" # Replace with your directory to A3 folder
print(os.listdir(ROOT_PATH))

['README.md', '.gitignore', 'evaluate.py', 'model.py', 'data.json', 'm2_reward_dataset_example.json', '.gitattributes', 'M2_Report.pdf', '.DS_Store', '.idea', '.git', 'models', 'dataset', 'notebooks', 'artifacts']


In [4]:
%cd /content/drive/MyDrive/EPFL/Modern_NLP_2023/project-m2-ikun

/content/drive/MyDrive/EPFL/Modern_NLP_2023/project-m2-ikun


In [5]:
import sys
sys.path.append(ROOT_PATH)

In [6]:
!pip install transformers xformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers
  Downloading xformers-0.0.20-cp310-cp310-manylinux2014_x86_64.whl (109.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [7]:
import json
import matplotlib.pyplot as plt

from artifacts.data_processing_utils import *
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import pipeline

nli_model = pipeline('text-classification', model='facebook/bart-large-mnli')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
with open(f"{ROOT_PATH}/dataset/solution_positive.json", "r") as f:
    solutions_positive = json.load(f)

with open(f"{ROOT_PATH}/dataset/clean_interaction.json", "r") as f:
    interactions = json.load(f)

In [9]:
# Convert the keys to integers
solutions_positive = {int(k): v for k, v in solutions_positive.items()}
interactions = {int(k): {int(kk): vv for kk, vv in v.items()} for k, v in interactions.items()}

In [10]:
import pandas as pd

In [11]:
def transform_interactions(samples):
    """Index samples by sol_id and interaction_id, and also count the roles separately.

    Args:
        samples (list): A list of samples, each containing a sol_id and an interaction_id.

    Returns:
        dict, dict: Two nested dictionaries where the first level of keys are sol_id values and the second level of keys are interaction_id values.
                    The first dictionary contains the samples and the second dictionary contains the role counts.
    """

    # Initialize the dictionaries
    indexed_samples = {}
    role_freq = {}

    for sol_id, pairs in samples.items():
        for interaction_id, sample in pairs.items():

            # If this sol_id is not already in the dictionary, add it
            if sol_id not in indexed_samples:
                indexed_samples[sol_id] = {}
                role_freq[sol_id] = {}

            # Add the sample and role count to the dictionaries, indexed by sol_id and interaction_id
            indexed_samples[sol_id][interaction_id] = sample
            role_freq[sol_id][interaction_id] = count_role_freq(sample)

    # Convert role_freq to a DataFrame to facilitate analysis
    role_freq_df = pd.DataFrame.from_dict({(i, j): role_freq[i][j]
                                           for i in role_freq.keys()
                                           for j in role_freq[i].keys()},
                                          orient='index')

    return indexed_samples, role_freq_df

In [12]:
indexed_interactions, role_freq = transform_interactions(interactions)

### Deal with single round interactions

In [13]:
indices_single_round = role_freq[role_freq['assistant'] == 1].index.to_list()
interactions_single_round = [indexed_interactions[sol_id][interaction_id] for sol_id, interaction_id in indices_single_round]

In [14]:
import pandas as pd

premises_hypotheses = []
interaction_ids = []
sol_ids = []

for interaction in interactions_single_round:
    sol_id = interaction['sol_id']
    interaction_id = interaction['interaction_id']

    try:
        sol_instance = solutions_positive[sol_id]['interaction'][-1]
    except KeyError:
        continue  # Skip the rest of the loop for this iteration

    interaction_instance = interaction['interaction'][-1]

    assert sol_instance['role'] == 'assistant', "Role does not match for solution"
    assert interaction_instance['role'] == 'assistant', "Role does not match for interaction"

    premise = sol_instance['content']
    hypothesis = interaction_instance['content']

    # Combine the premise and hypothesis
    sequence = premise + ' ' + hypothesis

    # Skip sequences that have more than one </s> token
    if sequence.count("</s>") > 1:
        continue

    premises_hypotheses.append(sequence)
    interaction_ids.append(interaction_id)
    sol_ids.append(sol_id)

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

# check if CUDA is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# move the model to the device
model = model.to(device)

# define a batch size
batch_size = 16

# split premises_hypotheses into chunks of size batch_size
chunks = [premises_hypotheses[i:i + batch_size] for i in range(0, len(premises_hypotheses), batch_size)]


In [16]:
from tqdm import tqdm

all_probs = []

# process each chunk separately
for chunk in tqdm(chunks):
    # tokenize the premises and hypotheses
    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

    # remove 'token_type_ids' if present as BART does not use them
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    # move the inputs to the device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # get the model outputs
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs.logits contains the model's predictions
    logits = outputs.logits

    # apply softmax to calculate probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # move probs to cpu and convert to numpy array
    probs = probs.cpu().numpy()

    # add probabilities to the overall list
    all_probs.extend(probs)

100%|██████████| 350/350 [05:29<00:00,  1.06it/s]


In [18]:
import numpy as np

# Convert all_probs to a numpy array for easier processing
all_probs_np = np.array(all_probs)

# Take the argmax along axis 1 to get the predicted class for each input
predictions = np.argmax(all_probs_np, axis=1)

# Count the occurrences of each class
# According to https://huggingface.co/facebook/bart-large-mnli
# 0 is contradiction, 1 is Netural, 2 is Entailment
contradiction_count = np.sum(predictions == 0)
neutral_count = np.sum(predictions == 1)
entailment_count = np.sum(predictions == 2)

print(f"Contradiction: {contradiction_count}")
print(f"Neutral: {neutral_count}")
print(f"Entailment: {entailment_count}")

Contradiction: 710
Neutral: 2855
Entailment: 2020


In [19]:
# Indices of 'entailment' interactions
entailment_indices      = np.where(predictions == 2)[0]  # here, 2 is the index for 'entailment'
netural_indices         = np.where(predictions == 1)[0]  # here, 1 is the index for 'netural'
contradiction_indices   = np.where(predictions == 0)[0]  # here, 0 is the index for 'contradiction'


# Collect the sol_id and interaction_id pairs for 'entailment' interactions
entailment_pairs    = [(sol_ids[i], interaction_ids[i]) for i in entailment_indices]
netural_pairs       = [(sol_ids[i], interaction_ids[i]) for i in netural_indices]
contradiction_pairs = [(sol_ids[i], interaction_ids[i]) for i in contradiction_indices]

#### Generate Generative Model Training Dataset

In [None]:
processed_data = []
for sol_id, interaction_id in entailment_pairs:
    processed_data.append(indexed_interactions[sol_id][interaction_id])

In [None]:
processed_data[0]

{'confidence': 5,
 'interaction': [{'role': 'system',
   'content': 'You are given a question to answer. Firstly provide important information regarding the answer to the question. Explain in logical steps, and finally present the answer. Provide the entirety of the answer in as simple steps as possible. Use the formatting: <answer>\n\nConfidence:<number between 1 (low confidence) to 5 (full confidence) with your confidence in the answer provided being correct>'},
  {'role': 'user',
   'content': 'Rods (peak response at $\\lambda=507 \\mathrm{~nm}$ ) and cones (peak response at $555 \\mathrm{~nm}$ ) are the photosensitive cells in human eye. Although\n\nrods are more sensitive, they cannot register colors (unlike cones).\n\nGiven that the sensitivity of cone cells is $1 / 220$ of the rod cells, find the threshold values for cone cells.'},
  {'role': 'assistant',
   'content': "Threshold values refer to the minimum amount of light needed to activate a photosensitive cell. \n\nWe know th

In [None]:
qa_list_single_round = []

for sample in processed_data:
    interaction = sample['interaction']
    if len(interaction) == 3:
        qa_list_single_round.append({
            'question': sample['interaction'][1]['content'],
            'answer': sample['interaction'][2]['content']
        })
    elif len(interaction) == 2:
        qa_list_single_round.append({
            'question': sample['interaction'][0]['content'],
            'answer': sample['interaction'][1]['content']
        })

#### Generate Reward Model Training Pairs

In [98]:
def populate_dict(pairs):
    """
    Populates a dictionary with sol_id as keys and a list of interaction_ids as values.
    """
    result_dict = {}
    for sol_id, interaction_id in pairs:
        if sol_id in result_dict:
            result_dict[sol_id].append(interaction_id)
        else:
            result_dict[sol_id] = [interaction_id]
    return result_dict


def create_chosen_reject_pair(chosen_dict, reject_dict, interactions, solutions):
    """
    Pair reject interactions with chosen interactions,
    as well as pairs where 'chosen' is solution and 'rejected' is interactions in reject_dict.
    """
    rm_training_pairs = []

    for sol_id, reject_interaction_ids in reject_dict.items():
        if sol_id in solutions:
            for reject_interaction_id in reject_interaction_ids:
                rm_training_pairs.append({
                        "chosen": combine_interaction(solutions[sol_id]['interaction']),
                        "rejected": combine_interaction(interactions[sol_id][reject_interaction_id]['interaction']),
                    })

        if sol_id in chosen_dict:
            chosen_interaction_ids = chosen_dict[sol_id]
            for reject_interaction_id in reject_interaction_ids:
                for chosen_interaction_id in chosen_interaction_ids:
                    rm_training_pairs.append({
                        "chosen": combine_interaction(interactions[sol_id][chosen_interaction_id]['interaction']),
                        "rejected": combine_interaction(interactions[sol_id][reject_interaction_id]['interaction']),
                    })

    return rm_training_pairs

In [100]:
# Main execution
contradiction_dict = populate_dict(contradiction_pairs)
netural_dict = populate_dict(netural_pairs)
entailment_dict = populate_dict(entailment_pairs)  # Not used in this example but kept for consistency

rm_training_pairs = \
    create_chosen_reject_pair(
        chosen_dict=netural_dict, reject_dict=contradiction_dict,
        interactions=indexed_interactions, solutions=solutions_positive
    ) + \
    create_chosen_reject_pair(
        chosen_dict=entailment_dict, reject_dict=contradiction_dict,
        interactions=indexed_interactions, solutions=solutions_positive
    )


len(rm_training_pairs)

2110

### Deal with multi-rounds interactions

In [101]:
indices_multi_round_S1 = role_freq[(role_freq['assistant'] != 1) & (role_freq['system'] == 1)].index.to_list()
interactions_multi_round_S1 = [indexed_interactions[sol_id][interaction_id] for sol_id, interaction_id in indices_multi_round_S1]

In [102]:
len(indices_multi_round_S1)

2769

In [103]:
premises_hypotheses = []
interaction_ids = []
sol_ids = []
round_ids = []

for interaction in interactions_multi_round_S1:
    sol_id = interaction['sol_id']
    interaction_id = interaction['interaction_id']

    try:
        solutions_positive[sol_id]
    except KeyError:
        continue  # Skip the rest of the loop for this iteration

    # GPT Response
    hypothesis = interaction['interaction'][-1]['content']

    for idx, round in enumerate(interaction['interaction']):

        if round['role'] == 'assistant':
            premise = round['content']

            # Combine the premise and hypothesis
            sequence = premise + ' ' + hypothesis

            # Check if the sequence contains more than one "</s>" token
            if sequence.count("</s>") > 1:
                continue

            # Append the sequence, sol_id, interaction_id, and round_id to their respective lists
            premises_hypotheses.append(sequence)
            sol_ids.append(sol_id)
            interaction_ids.append(interaction_id)
            round_ids.append(idx)

In [104]:
# define a batch size
batch_size = 16

# split premises_hypotheses into chunks of size batch_size
chunks = [premises_hypotheses[i:i + batch_size] for i in range(0, len(premises_hypotheses), batch_size)]

all_probs = []

# process each chunk separately
for chunk in tqdm(chunks):
    # tokenize the premises and hypotheses
    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

    # remove 'token_type_ids' if present as BART does not use them
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    # move the inputs to the device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # get the model outputs
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs.logits contains the model's predictions
    logits = outputs.logits

    # apply softmax to calculate probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # move probs to cpu and convert to numpy array
    probs = probs.cpu().numpy()

    # add probabilities to the overall list
    all_probs.extend(probs)

100%|██████████| 478/478 [07:54<00:00,  1.01it/s]


#### Generate Generative Model Dataset

In [105]:
# Convert all_probs to numpy array for easier manipulation
all_probs_np = np.array(all_probs)

# Separate the probabilities of each class
contradiction_probs = all_probs_np[:, 0]
neutral_probs = all_probs_np[:, 1]
entailment_probs = all_probs_np[:, 2]

# Create a DataFrame
df = pd.DataFrame({
    'sol_id': sol_ids,
    'interaction_id': interaction_ids,
    'round_id': round_ids,
    'contradiction_prob': contradiction_probs,
    'neutral_prob': neutral_probs,
    'entailment_prob': entailment_probs,
})

In [None]:
df

Unnamed: 0,sol_id,interaction_id,round_id,contradiction_prob,neutral_prob,entailment_prob
0,1693719,4415287,2,0.000957,0.994739,0.004304
1,1693719,4415287,4,0.000370,0.020722,0.978908
2,2001870,4759584,2,0.000338,0.993547,0.006115
3,2001870,4759584,4,0.000163,0.998449,0.001388
4,2001870,4759584,6,0.000419,0.976042,0.023539
...,...,...,...,...,...,...
7636,2577878,4309477,4,0.025459,0.211967,0.762573
7637,1584454,4646036,2,0.094699,0.438155,0.467146
7638,1584454,4646036,4,0.008453,0.134488,0.857059
7639,2690724,4680840,2,0.226877,0.070591,0.702532


In [None]:
def get_content(row):
    sol_id = row['sol_id']
    interaction_id = row['interaction_id']
    round_id = row['round_id']

    user_content = indexed_interactions[sol_id][interaction_id]['interaction'][int(round_id)-1]['content']
    assistant_content = indexed_interactions[sol_id][interaction_id]['interaction'][int(round_id)]['content']
    system_content = indexed_interactions[sol_id][interaction_id]['interaction'][0]['content']
    question = solutions_positive[sol_id]['interaction'][1]['content']

    return pd.Series({
        'user_content': user_content,
        'assistant_content': assistant_content,
        'system_content': system_content,
        'question': question
    })

# Apply the function to each row of the DataFrame
content_df = df.apply(get_content, axis=1)

# Join the content DataFrame with the original DataFrame
df = pd.concat([df, content_df], axis=1)

In [None]:
# get rid of stupid user input asking about confidence
df = df[~df['user_content'].str.contains('confidence|scale', case=False)]

In [None]:
# This is the python code to execute the operations you've described:
df_grouped = df.groupby(['sol_id', 'interaction_id']).sum().reset_index()

# Filter the rows where 'entailment_prob' is greater than 'contradiction_prob'
df_filtered = df_grouped[(df_grouped['entailment_prob'] > df_grouped['neutral_prob']) &
                         (df_grouped['entailment_prob'] > df_grouped['contradiction_prob'])]

# Print the result
print(df_filtered[['sol_id', 'interaction_id']])

       sol_id  interaction_id
0     1001199         4039807
1     1001199         4967894
2     1002413         4195984
3     1003280         4202957
4     1005186         4320012
...       ...             ...
2669  2998595         4837798
2671  2999488         4646915
2672  2999633         4198754
2673  2999633         4405298
2674  2999633         4967912

[1888 rows x 2 columns]


  df_grouped = df.groupby(['sol_id', 'interaction_id']).sum().reset_index()


In [None]:
# Merge df with df_filtered
df_matching = pd.merge(df, df_filtered[['sol_id', 'interaction_id']], on=['sol_id', 'interaction_id'], how='inner')
df_matching

Unnamed: 0,sol_id,interaction_id,round_id,contradiction_prob,neutral_prob,entailment_prob,user_content,assistant_content,system_content,question
0,2473993,4831239,2,0.000430,0.042415,0.957155,Shifting the focus to another typical element ...,Reflection is the process by which a wavefront...,You are given a question to answer. Firstly pr...,Shifting the focus to another typical element ...
1,2473993,4831239,4,0.005739,0.025277,0.968984,Could you give an example of a system which he...,One example of a system that can help in intui...,You are given a question to answer. Firstly pr...,Shifting the focus to another typical element ...
2,2473993,4184160,2,0.003889,0.266725,0.729386,Context: Shifting the focus to another typical...,Reflection is the process by which light or an...,You are an expert in biomedical imaging. Given...,Shifting the focus to another typical element ...
3,2473993,4184160,6,0.003597,0.044169,0.952234,But you just gave the definitions without actu...,"You are correct, let me clarify the difference...",You are an expert in biomedical imaging. Given...,Shifting the focus to another typical element ...
4,1520494,4203440,2,0.001232,0.045800,0.952968,Question: Why one should interpret FRAP result...,FRAP (Fluorescence Recovery After Photobleachi...,You are an expert in biomedical imaging. Answe...,Why one should interpret FRAP results with cau...
...,...,...,...,...,...,...,...,...,...,...
4923,2577878,4309477,4,0.025459,0.211967,0.762573,"Therefore, among A through E, the answer is",(D) R >= kT/(mu*m_(mu)),You are a Physics teacher at EPFL. You are cre...,For a planet with a mean density of [mathjaxin...
4924,1584454,4646036,2,0.094699,0.438155,0.467146,Question: A star escapes with the velocity V_s...,"First, we need to calculate the relative veloc...",You are a Physics teacher at EPFL. You are cre...,A star escapes with the velocity [mathjaxinlin...
4925,1584454,4646036,4,0.008453,0.134488,0.857059,Are you sure?,"I apologize, there was an error in my previous...",You are a Physics teacher at EPFL. You are cre...,A star escapes with the velocity [mathjaxinlin...
4926,2690724,4680840,2,0.226877,0.070591,0.702532,Question: Using the definition of the centre o...,"First, we need to remember the definition of t...",You are a Physics teacher at EPFL. You are cre...,Using the definition of the centre of mass for...


In [None]:
df_filtered = df_matching[df_matching['entailment_prob'] > 0.9]

In [None]:
df_filtered

Unnamed: 0,sol_id,interaction_id,round_id,contradiction_prob,neutral_prob,entailment_prob,user_content,assistant_content,system_content,question
0,2473993,4831239,2,0.000430,0.042415,0.957155,Shifting the focus to another typical element ...,Reflection is the process by which a wavefront...,You are given a question to answer. Firstly pr...,Shifting the focus to another typical element ...
1,2473993,4831239,4,0.005739,0.025277,0.968984,Could you give an example of a system which he...,One example of a system that can help in intui...,You are given a question to answer. Firstly pr...,Shifting the focus to another typical element ...
3,2473993,4184160,6,0.003597,0.044169,0.952234,But you just gave the definitions without actu...,"You are correct, let me clarify the difference...",You are an expert in biomedical imaging. Given...,Shifting the focus to another typical element ...
4,1520494,4203440,2,0.001232,0.045800,0.952968,Question: Why one should interpret FRAP result...,FRAP (Fluorescence Recovery After Photobleachi...,You are an expert in biomedical imaging. Answe...,Why one should interpret FRAP results with cau...
7,1520494,4203440,10,0.000579,0.028848,0.970573,But the options C and D is not too related to ...,"Sure, I apologize for any confusion. Here's th...",You are an expert in biomedical imaging. Answe...,Why one should interpret FRAP results with cau...
...,...,...,...,...,...,...,...,...,...,...
4915,1153638,4047939,4,0.000172,0.015665,0.984162,"So, which options are correct?",The correct options are Solution 1 and Solutio...,You are going to answer a multiple-choice exam...,The purpose of preconditioning the function in...
4917,1733133,4618337,4,0.001738,0.020944,0.977319,"So, in short, which options are correct?",The correct options are Solution 1 and Solutio...,You are going to answer a multiple-choice exam...,If the feasible domain of a linear optimizatio...
4919,2046963,4465897,4,0.000605,0.017380,0.982015,"So, which options are correct?",The correct options are:\n\n1. Solution 1: The...,You are going to answer a multiple-choice exam...,Consider Dijkstra’s algorithm to calculate the...
4921,2713145,4398724,4,0.002034,0.023973,0.973992,"So, which options are correct?",Options 1 and 4 are correct.,You are going to answer a multiple-choice exam...,(e) Assume now that the particle carries a cha...


In [None]:
# Define a function to modify the user_content of the first row of each group
def modify_user_content(group):
    if group['round_id'].iloc[0] != 2:
        group['user_content'].iloc[0] = group['question'].iloc[0] + ' ' + group['user_content'].iloc[0]
    return group

# Sort df_filtered by sol_id, interaction_id, and round_id
df_filtered = df_filtered.sort_values(by=['sol_id', 'interaction_id', 'round_id'])

# Group by sol_id and interaction_id, then apply the modify_user_content function to each group
df_filtered = df_filtered.groupby(['sol_id', 'interaction_id']).apply(modify_user_content)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_filtered = df_filtered.groupby(['sol_id', 'interaction_id']).apply(modify_user_content)


In [None]:
df_filtered

Unnamed: 0,sol_id,interaction_id,round_id,contradiction_prob,neutral_prob,entailment_prob,user_content,assistant_content,system_content,question
1506,1001199,4039807,4,0.004514,0.057804,0.937683,Pour encoder la séquence de lettres ABCCDDDDEE...,"Pour encoder une séquence de lettres, on doit ...",QCM,Pour encoder la séquence de lettres ABCCDDDDEE...
4499,1002413,4195984,4,0.000893,0.044884,0.954223,Consider the Gaussian mixture model where $n$ ...,The number of non-zero rows in the low-rank ma...,You are going to assist a student with technic...,Consider the Gaussian mixture model where $n$ ...
3415,1005186,4320012,8,0.000461,0.024597,0.974942,Shotgun connectomics could be used to address ...,Option 1) It generates more data than multi-pa...,Which of the following answer is correct? More...,Shotgun connectomics could be used to address ...
3416,1005186,4669952,4,0.000447,0.054591,0.944961,Shotgun connectomics could be used to address ...,"Yes, I am confident that my answer is correct....",Select the correct answer and detail why you c...,Shotgun connectomics could be used to address ...
4866,1005379,4487546,4,0.000538,0.003740,0.995722,Which of the following is correct regarding pr...,The correct statement regarding prediction mod...,You are going to assist the following user in ...,Which of the following is correct regarding pr...
...,...,...,...,...,...,...,...,...,...,...
3478,2998595,4775507,4,0.000406,0.021030,0.978564,A muscle is controlled by a group of motor neu...,"Yes, I am sure about my answer. Injecting retr...",Select the correct answer and detail why you c...,A muscle is controlled by a group of motor neu...
3481,2998595,4837798,6,0.000490,0.037957,0.961554,A muscle is controlled by a group of motor neu...,Option 3 is not entirely incorrect and can be ...,Please select the correct answer and provide t...,A muscle is controlled by a group of motor neu...
1614,2999488,4646915,4,0.005939,0.049653,0.944409,On prend $1 m^3$ d'air à l'extérieur en hiver:...,Thank you for the additional information. With...,Answer in english. Show your thinking steps. W...,On prend $1 m^3$ d'air à l'extérieur en hiver:...
3719,2999633,4198754,2,0.038309,0.041200,0.920492,"Q: Parmi les séquences de lettres suivantes, l...","Sure, let's do this step by step. \n\nFirst, w...",Act as a computer science tutor.,"Parmi les séquences de lettres suivantes, laqu..."


In [None]:
# Sort df_filtered by round_id
df_filtered = df_filtered.sort_values(by='round_id')

# Group by sol_id and interaction_id, then join the user_content and assistant_content
grouped = df_filtered.groupby(['sol_id', 'interaction_id'])
user_content = grouped['user_content'].apply(' '.join)
assistant_content = grouped['assistant_content'].apply(' '.join)

# Create a dict to store the question and answer
qa_list_multi_rounds = []
for (sol_id, interaction_id), user_content, assistant_content in zip(user_content.index, user_content.values, assistant_content.values):
    qa_list_multi_rounds.append({
        'question': user_content,
        'answer': assistant_content
    })


In [None]:
save_path = f'{ROOT_PATH}/dataset/NLI_filtered_data.json'

In [None]:
with open(save_path, 'w') as file:
    json.dump(qa_list_single_round + qa_list_multi_rounds, file, indent=4, sort_keys=True)
print(f"Dataset successfully saved to path: {save_path}")

Dataset successfully saved to path: /content/drive/MyDrive/EPFL/Modern_NLP_2023/project-m2-ikun/dataset/NLI_filtered_data.json


#### Generate Reward Model training pairs

In [107]:
# Convert all_probs to a numpy array for easier processing
all_probs_np = np.array(all_probs)

# Take the argmax along axis 1 to get the predicted class for each input
predictions = np.argmax(all_probs_np, axis=1)

# Count the occurrences of each class
# According to https://huggingface.co/facebook/bart-large-mnli
# 0 is contradiction, 1 is Netural, 2 is Entailment
contradiction_count = np.sum(predictions == 0)
neutral_count = np.sum(predictions == 1)
entailment_count = np.sum(predictions == 2)

print(f"Contradiction: {contradiction_count}")
print(f"Neutral: {neutral_count}")
print(f"Entailment: {entailment_count}")

Contradiction: 364
Neutral: 2489
Entailment: 4788


In [108]:
# Indices of 'entailment' interactions
entailment_indices      = np.where(predictions == 2)[0]  # here, 2 is the index for 'entailment'
netural_indices         = np.where(predictions == 1)[0]  # here, 1 is the index for 'netural'
contradiction_indices   = np.where(predictions == 0)[0]  # here, 0 is the index for 'contradiction'

# Collect the sol_id and interaction_id pairs for 'entailment' interactions
entailment_pairs    = [(sol_ids[i], interaction_ids[i]) for i in entailment_indices]
netural_pairs       = [(sol_ids[i], interaction_ids[i]) for i in netural_indices]
contradiction_pairs = [(sol_ids[i], interaction_ids[i]) for i in contradiction_indices]

In [109]:
len(rm_training_pairs)

2110

In [112]:
# Main execution
contradiction_dict = populate_dict(contradiction_pairs)
netural_dict = populate_dict(netural_pairs)
entailment_dict = populate_dict(entailment_pairs)  # Not used in this example but kept for consistency

print(len(create_chosen_reject_pair(
        chosen_dict=netural_dict, reject_dict=contradiction_dict,
        interactions=indexed_interactions, solutions=solutions_positive
    )))

print(len(create_chosen_reject_pair(
        chosen_dict=entailment_dict, reject_dict=contradiction_dict,
        interactions=indexed_interactions, solutions=solutions_positive
    )))

rm_training_pairs += \
    create_chosen_reject_pair(
        chosen_dict=netural_dict, reject_dict=contradiction_dict,
        interactions=indexed_interactions, solutions=solutions_positive
    ) + \
    create_chosen_reject_pair(
        chosen_dict=entailment_dict, reject_dict=contradiction_dict,
        interactions=indexed_interactions, solutions=solutions_positive
    )

len(rm_training_pairs)

712
1395


4217

In [114]:
ROOT_PATH

'/content/drive/MyDrive/EPFL/Modern_NLP_2023/project-m2-ikun'

In [116]:
save_path = "/content/drive/MyDrive/EPFL/Modern_NLP_2023/project-m3-ikun/dataset/NLI_rm_data.json"

In [117]:
import json

with open(save_path, 'w') as file:
    json.dump(rm_training_pairs, file, indent=4, sort_keys=True)
print(f"Dataset successfully saved to path: {save_path}")

Dataset successfully saved to path: /content/drive/MyDrive/EPFL/Modern_NLP_2023/project-m3-ikun/dataset/NLI_rm_data.json
