In [77]:
!pip install -qr "../requirements.txt"

81728.20s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [78]:
from datasets import load_dataset

import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

In [79]:
# Load the 'ARC-Easy' configuration of the 'ai2_arc' dataset. Later cells
# index the result by split name: 'train', 'validation' and 'test'.
dataset = load_dataset('ai2_arc', 'ARC-Easy')

In [80]:
# Inspect the test split (feature names and number of rows).
dataset['test']

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 2376
})

In [81]:
# Some questions label their options '1'-'4' instead of 'A'-'D'; fold both
# labelling schemes into the letters so they are counted together.
key_mappings = {'1': 'A', '2': 'B', '3': 'C', '4': 'D'}


def count_answer_keys(split):
    """Count how often each answer letter A-D is the correct key in a split.

    Numeric keys are converted to letters via ``key_mappings``. Any other key
    (e.g. 'E') is dropped, and a letter that never occurs is reported as 0.
    The function does not modify its input, so the cell can be re-run safely.

    Args:
        split: One dataset split; it must convert to a DataFrame with an
            'answerKey' column.

    Returns:
        pandas.Series of integer counts indexed by 'A', 'B', 'C', 'D'.
    """
    answer_keys = pd.DataFrame(split)['answerKey'].replace(key_mappings)
    letter_counts = answer_keys.value_counts()
    # reindex keeps exactly A-D (dropping 'E') and fills absent letters with 0.
    return letter_counts.reindex(list(key_mappings.values()), fill_value=0)


train_counts = count_answer_keys(dataset['train'])
validation_counts = count_answer_keys(dataset['validation'])

# Combined train + validation frequency of each correct answer letter.
(train_counts + validation_counts).to_dict()

{'A': 706, 'B': 740, 'C': 720, 'D': 653}

Answer key B appears most often when the train and validation sets are combined, so always answering B is used as the first baseline.

In [82]:
def calculate_accuarcy_of_choice(dataset, choice_letter, choice_number):
    """Score the baseline that always answers with one fixed choice.

    Only questions with exactly four answer options are taken into account;
    questions with any other number of options are skipped entirely and do
    not count towards the total.

    Args:
        dataset: Iterable of questions. Each question must provide
            ``question['choices']['label']`` (the option labels) and
            ``question['answerKey']`` (the correct label).
        choice_letter: Letter form of the fixed choice, e.g. 'B'.
        choice_number: Numeric form of the same choice, e.g. '2'.

    Returns:
        Tuple ``(correct_count, accuracy)`` where ``accuracy`` is
        ``correct_count`` divided by the number of four-option questions.
        ``accuracy`` is 0.0 when there are no four-option questions.
    """
    accepted_keys = (choice_letter, choice_number)
    correct_answer_key = 0
    total_answer_keys = 0

    for question in dataset:
        if len(question['choices']['label']) != 4:
            continue
        total_answer_keys += 1
        if question['answerKey'] in accepted_keys:
            correct_answer_key += 1

    # Guard against an empty dataset or one without four-option questions.
    if total_answer_keys == 0:
        return correct_answer_key, 0.0

    return correct_answer_key, correct_answer_key / total_answer_keys

In [83]:
# "Always answer B" baseline on each split. 'B' and '2' are passed together
# because the function accepts either form of the answer key.
# Each call returns (number of correct answers, accuracy).
amount_correct_test_b, accuracy_correct_test_b = calculate_accuarcy_of_choice(dataset['test'], 'B', '2')
amount_correct_validation_b, accuracy_correct_validation_b = calculate_accuarcy_of_choice(dataset['validation'], 'B', '2')
# NOTE(review): `amounnt_correct_train_b` is misspelled (double "n"); the name is left unchanged here.
amounnt_correct_train_b, accuracy_correct_train_b = calculate_accuarcy_of_choice(dataset['train'], 'B', '2')

In [84]:
# Report the "always answer B" accuracy for every split, one line per split.
baseline_b_by_split = (
    ('test', accuracy_correct_test_b),
    ('validation', accuracy_correct_validation_b),
    ('train', accuracy_correct_train_b),
)
for split_name, split_accuracy in baseline_b_by_split:
    print(f'Accuracy of choice B in {split_name} set: {split_accuracy}')

Accuracy of choice B in test set: 0.2456659619450317
Accuracy of choice B in validation set: 0.26455026455026454
Accuracy of choice B in train set: 0.26193663543061135


In [85]:
# Test-set accuracy per approach, in the same order as x_labels.
# The three transformer entries (0.528, 0.556, 0.532) are the test-set scores;
# they must not be the validation scores (0.5168, 0.546, 0.552), because the
# chart is titled "Test Set".
accuracy_values = [accuracy_correct_test_b, 0.2773784355179704, 0.289, 0.528, 0.556, 0.532, 0.9478, 78/80]
accuracy_percentages = [f'{acc*100:.2f}%' for acc in accuracy_values]

x_labels = ['Answer B', 'Word<br>Embedding', 'Randomly<br>Initialized<br>Transformer', 'Pretrained<br>Transformer', 'Finetuned<br>Transformer', 'Ensemble<br>Transformer', 'Leaderboard<br>Leader<br>(Google Brain)', 'LLM<br>(Hugging chat)']

# One bar per approach, labelled with its accuracy as a percentage.
fig = go.Figure(data=[
    go.Bar(
        name='Accuracy',
        x=x_labels,
        y=accuracy_values,
        marker_color='burlywood',
        text=accuracy_percentages,
        textposition='outside',
    )
])
fig.update_layout(
    # The upper bound is slightly above 1 so the text above the tallest bars fits.
    yaxis=dict(title='Accuracy', range=[0, 1.05], showgrid=False),
    xaxis=dict(tickangle=0),
    plot_bgcolor='white',
    title_text="Accuracy on ARC Easy Test Set",
    autosize=False,
    width=800,
    height=500
)

# Hand-drawn dotted grid lines at 0.0, 0.2, ..., 1.0. Dividing by 5 avoids the
# float noise of i * 0.2 and keeps every line inside the visible y-range.
for y in [i / 5 for i in range(6)]:
    fig.add_shape(
        type="line",
        x0=0,
        y0=y,
        x1=1,
        y1=y,
        xref='paper',  # x0/x1 span the full plot width
        yref='y',      # y0/y1 are in data (accuracy) units
        line=dict(
            color="Black",
            width=0.5,
            dash="dot",
        ),
        layer="below"
    )

fig.show()

In [86]:
import plotly.graph_objects as go

# Accuracy of each approach on the validation and the test split; both lists
# follow the order of x_labels.
validation_scores = [accuracy_correct_validation_b, 0.2822, 0.2544, 0.5168, 0.546, 0.552]
test_scores = [accuracy_correct_test_b, 0.2773784355179704, 0.289, 0.528, 0.556, 0.532]

x_labels = ['Answer B', 'Word<br>Embedding', 'Randomly<br>Initialized<br>Transformer', 'Pretrained<br>Transformer', 'Finetuned<br>Transformer', 'Ensemble<br>Transformer']

# Percentage strings shown above the bars.
val_accuracy_percentages = [f'{acc*100:.2f}%' for acc in validation_scores]
test_accuracy_percentages = [f'{acc*100:.2f}%' for acc in test_scores]

fig = go.Figure()

fig.update_layout(
    title_text="Comparison of Validation and Test Accuracy on different models",
    plot_bgcolor='white',
    xaxis=dict(tickangle=0),
    yaxis=dict(title='Accuracy', range=[0, 1.05], showgrid=False),
    autosize=False,
    width=800,
    height=500
)

# Dotted horizontal guide lines drawn behind the bars, one every 0.2.
grid_line_style = dict(color="LightGrey", width=1, dash="dot")
for level in [step * 0.2 for step in range(10)]:
    fig.add_shape(
        type="line",
        xref="paper",
        yref="y",
        x0=0,
        x1=1,
        y0=level,
        y1=level,
        line=grid_line_style,
        layer="below"
    )

# Two bars per model: validation first, then test.
fig.add_trace(go.Bar(
    name='Validation',
    x=x_labels,
    y=validation_scores,
    marker_color='chocolate',
    text=val_accuracy_percentages,
    textposition='outside',
))
fig.add_trace(go.Bar(
    name='Test',
    x=x_labels,
    y=test_scores,
    marker_color='burlywood',
    text=test_accuracy_percentages,
    textposition='outside',
))

fig.show()


In [75]:
import plotly.graph_objects as go

# Training accuracy of the first five approaches (no value for the ensemble).
# NOTE(review): 0.2544 for the randomly initialized transformer is identical to
# its validation score - confirm this is not a copy-paste slip.
training_scores = [accuracy_correct_train_b, 0.3079, 0.2544, 0.9531, 0.7086]

# x_labels and validation_scores come from the validation-vs-test cell and hold
# six entries; trim both to the number of training scores so that x, y and the
# bar text of every trace have the same length.
model_count = len(training_scores)
train_val_labels = x_labels[:model_count]
train_val_validation_scores = validation_scores[:model_count]

# Percentage strings shown above the bars.
train_accuracy_percentages = [f'{acc*100:.2f}%' for acc in training_scores]
val_accuracy_percentages = [f'{acc*100:.2f}%' for acc in train_val_validation_scores]

fig = go.Figure()

fig.update_layout(
    yaxis=dict(title='Accuracy', range=[0, 1.05], showgrid=False),
    xaxis=dict(tickangle=0),
    plot_bgcolor='white',
    title_text="Comparison of Training and Validation Accuracy on different models",
    autosize=False,
    width=800,
    height=500
)

# Dotted horizontal guide lines at 0.0, 0.2, ..., 1.0 drawn behind the bars.
for y in [i / 5 for i in range(6)]:
    fig.add_shape(
        type="line",
        x0=0,
        y0=y,
        x1=1,
        y1=y,
        xref="paper",
        yref="y",
        line=dict(
            color="LightGrey",
            width=1,
            dash="dot",
        ),
        layer="below"
    )

# Two bars per model: training first, then validation.
fig.add_trace(go.Bar(
    name='Train',
    x=train_val_labels,
    y=training_scores,
    marker_color='cadetblue',
    text=train_accuracy_percentages,
    textposition='outside',
))
fig.add_trace(go.Bar(
    name='Validation',
    x=train_val_labels,
    y=train_val_validation_scores,
    marker_color='chocolate',
    text=val_accuracy_percentages,
    textposition='outside',
))

fig.show()


In [14]:
import plotly.graph_objects as go

# Number of questions each LLM was evaluated on (see the plot title).
TOTAL_QUESTIONS = 80

# Correct answers per LLM, in the same order as x_labels below.
correct_answers = [77, 77, 77, 77, 76, 78]
# Use a distinct loop variable so the list name is not shadowed.
accuracy_values = [n_correct / TOTAL_QUESTIONS for n_correct in correct_answers]

# Add line breaks to the x-axis labels
# NOTE: this reassigns x_labels and accuracy_values, which the model-comparison
# cells also define; re-run those cells before re-plotting them.
x_labels = ['Llama<br>70B', 'bart', 'gpt<br>3.5', 'bing<br>chat', 'perplexity<br>ai', 'hugging<br>chat']

# The bars show the raw number of correct answers, not the accuracy.
fig = go.Figure(data=[
    go.Bar(name='Accuracy', x=x_labels, y=correct_answers, marker_color='burlywood')
])

fig.update_layout(
    # The y-axis is zoomed to 75-80 so the small differences between models are visible.
    yaxis=dict(title='Correct Answers', range=[75, 80], showgrid=False),
    xaxis=dict(tickangle=0),  # keep the x-axis labels horizontal
    plot_bgcolor='white',
    title_text="LLM Evaluation: Correct Answers on 80 Questions (ARC Easy Test Set)",
    autosize=False,
    width=800,
    height=500
)

# Dotted horizontal guide line at every integer from 75 to 80.
for y in range(75, 81):
    fig.add_shape(
        type="line",
        x0=0,
        y0=y,
        x1=1,
        y1=y,
        xref='paper',
        yref='y',
        line=dict(
            color="Black",
            width=0.5,
            dash="dot",
        ),
        layer="below"
    )

# Displaying accuracy values on top of bars
accuracy_percentages = [f'{acc*100:.1f}%' for acc in accuracy_values]
fig.update_traces(text=accuracy_percentages, textposition='outside')

fig.show()
