In [1]:
import pandas as pd

# calculate the accuracy
def accuracy(data):
    """Compute, for each label, the fraction of rows whose rating matches it.

    The rows of ``data`` are split by the value of the ``label`` column into
    the three classes 'funny', 'somewhat_funny' and 'not_funny'. For each
    class the result is the number of rows whose ``rating`` equals their
    ``label`` divided by the number of rows in that class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``label`` column and a ``rating`` column.

    Returns
    -------
    dict
        Maps 'funny', 'somewhat_funny' and 'not_funny' (in that order) to the
        fraction described above. A class with no rows in ``data`` gets 0.0
        instead of raising ``ZeroDivisionError``.
    """
    per_class = {}
    for class_name in ('funny', 'somewhat_funny', 'not_funny'):
        subset = data[data['label'] == class_name]
        if len(subset) == 0:
            # No rows carry this label: the ratio is undefined, report 0.0
            # (same convention the F1 helper uses for undefined ratios).
            per_class[class_name] = 0.0
            continue
        matches = len(subset[subset['label'] == subset['rating']])
        per_class[class_name] = matches / len(subset)

    return per_class

# calculate the F1 score
def F1_score_Calculation(data, class_name):
    """Return the F1 score of ``class_name``, treated as the positive class.

    ``data`` must provide a ``label`` column (reference class) and a
    ``rating`` column (assigned class). Precision is tp / (tp + fp), recall
    is tp / (tp + fn), and the F1 score is their harmonic mean. When there
    are no true positives the score is 0.
    """
    is_labelled = data['label'] == class_name
    is_rated = data['rating'] == class_name

    # True positives: labelled as the class and rated as the class.
    true_pos = int((is_labelled & is_rated).sum())
    if true_pos == 0:
        # Without true positives precision and recall are both zero (or
        # undefined), so the F1 score is reported as 0.
        return 0

    # False positives: rated as the class although labelled otherwise.
    false_pos = int((~is_labelled & is_rated).sum())
    # False negatives: labelled as the class but rated otherwise.
    false_neg = int((is_labelled & ~is_rated).sum())

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    return 2 * precision * recall / (precision + recall)

In [5]:
# Locations of the two input CSV files.
RESULTS_CSV = 'response directly/caption_ratings_GPT4_only_test_2.csv'
TEST_CSV = '../../data/data_of_contest_866/866_test.csv'

# Load the stored results into `df` and the test file into `test`.
df = pd.read_csv(RESULTS_CSV)
test = pd.read_csv(TEST_CSV)

In [6]:
# Share of each label in the test file: per-label row count over total rows.
label_counts = test['label'].value_counts()
ratio = label_counts.div(len(test))
print(ratio)

not_funny         0.924855
somewhat_funny    0.065511
funny             0.009634
Name: label, dtype: float64


In [7]:
# Join the test rows with the result rows that share the same caption.
merged = pd.merge(test, df, on="caption")
merged.head()

Unnamed: 0,caption,mean,label,rating
0,She used to think I was the center of the univ...,1.699167,funny,funny
1,I remember the day that Mars said “You’ll regr...,1.65165,funny,funny
2,Could I get some more ice?,1.639924,funny,somewhat_funny
3,It's five o'clock somewhere.,1.637793,funny,not_funny
4,Everything I drink goes to my equator.”,1.636842,funny,somewhat_funny


In [8]:
# Calculate the per-class accuracy on the merged frame. The result gets its
# own name so the `accuracy` function is not overwritten by its return value,
# which would make this cell fail with a TypeError when run a second time.
accuracy_by_class = accuracy(merged)
print(accuracy_by_class)

{'funny': 0.2, 'somewhat_funny': 0.5441176470588235, 'not_funny': 0.10520833333333333}


In [9]:
# F1 score of every class, computed on the merged frame.
class_names = ('funny', 'somewhat_funny', 'not_funny')
f1_funny, f1_somewhat_funny, f1_not_funny = (
    F1_score_Calculation(merged, name) for name in class_names
)
F1_score = dict(zip(class_names, (f1_funny, f1_somewhat_funny, f1_not_funny)))
print(F1_score)


{'funny': 0.009237875288683603, 'somewhat_funny': 0.1302816901408451, 'not_funny': 0.18790697674418602}


In [10]:
# Calculate the final F1 score as a weighted average of the per-class scores.
# The weights look like the test-file label ratios rounded to two decimals
# (0.0096 / 0.0655 / 0.9249) -- confirm before reusing on another data set.
# The result is stored as `weighted_f1` so the per-class `F1_score` dict is
# not overwritten and this cell can be re-run.
class_weights = {'funny': 0.01, 'somewhat_funny': 0.07, 'not_funny': 0.92}
weighted_f1 = (
    class_weights['funny'] * F1_score['funny']
    + class_weights['somewhat_funny'] * F1_score['somewhat_funny']
    + class_weights['not_funny'] * F1_score['not_funny']
)
print(weighted_f1)

0.18208651566739714
