## importing libraries

In [4]:
import re
import pickle
import scipy.stats as stats
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
from getDialect import detectDialect
import matplotlib.pyplot as plt
from itertools import combinations
from qna_simulation import answer_extractor
from qna_simulation import run_simulation
from utils import extract_model_accuracy
from utils import build_politeness_classifier
from utils import predict_politeness
from utils import get_readability_score
from utils import categorize_score
from utils import calculate_entropy
from utils import create_readability_plot
from utils import create_ling_marker_df
from scipy.stats import ttest_ind, ttest_rel, pearsonr, spearmanr, zscore, norm

## Q&A simulation (one question per subject from MMLU)
#### This might take 3-5 minutes

In [None]:
## loading dataset names from pickle file
dataset_name = "bigbench"

if dataset_name == 'bigbench':
    category_names = ['navigate', 'tracking_shuffled_objects_three_objects','temporal_sequences', 'date_understanding', 'penguins_in_a_table','causal_judgement']
else:
    with open("dataset_name.pkl", "rb") as f:
        category_names = pickle.load(f)



'''
important note:
parameter "aave" = True means you want to change the whole question prompt to AAVE 
However, the "aave_instruct" = True just means you only want to change the instruction part of the question prompt to AAVE, 
the acutal question remains SAE. "aave" and "aave_instruct" can not both be True.
'''

df_regular = run_simulation(dataset_name = dataset_name, category_names =category_names,   model_name = "gpt-3.5", aave= True, n_run = 1, aave_instruct = False, converter_type = "both")

df_regular

## loading the Q&A simulation dataset from 7 different models

In [2]:

gpt3_path = "mmlu_dataset/gpt3/"
gpt4_path = "mmlu_dataset/gpt4/"
llama31_path = "mmlu_dataset/llama3.1/"
qwen_path = "mmlu_dataset/qwen2.5/"
llama32_path = "mmlu_dataset/llama3.2/"
gemma_path = "mmlu_dataset/gemma2/"
mistral_path = "mmlu_dataset/mistral/"

bigbench_gpt3_path = "bigbench_dataset/gpt-3.5/"
bigbench_gpt4_path = "bigbench_dataset/gpt-4/"
bigbench_llama31_path = "bigbench_dataset/llama3.1/"
bigbench_qwen_path = "bigbench_dataset/qwen2.5/"
bigbench_llama32_path = "bigbench_dataset/llama3.2/"
bigbench_gemma_path = "bigbench_dataset/gemma2/"
bigbench_mistral_path = "bigbench_dataset/mistral/"


df_bigbench_regular_gpt3 = pd.read_csv(bigbench_gpt3_path+'regular_bigbench_qna.csv')
df_bigbench_aave_gpt3 = pd.read_csv(bigbench_gpt3_path+'aave_llm_bigbench_qna.csv')

df_bigbench_regular_gpt4 = pd.read_csv(bigbench_gpt4_path+'regular_bigbench_qna.csv')
df_bigbench_aave_gpt4 = pd.read_csv(bigbench_gpt4_path+'aave_llm_bigbench_qna.csv')

df_bigbench_regular_llama31 = pd.read_csv(bigbench_llama31_path+'regular_bigbench_qna.csv')
df_bigbench_aave_llama31 = pd.read_csv(bigbench_llama31_path+'aave_llm_bigbench_qna.csv')

df_bigbench_regular_llama32 = pd.read_csv(bigbench_llama32_path+'regular_bigbench_qna.csv')
df_bigbench_aave_llama32 = pd.read_csv(bigbench_llama32_path+'aave_llm_bigbench_qna.csv')

df_bigbench_regular_qwen = pd.read_csv(bigbench_qwen_path+'regular_bigbench_qna.csv')
df_bigbench_aave_qwen = pd.read_csv(bigbench_qwen_path+'aave_llm_bigbench_qna.csv')

df_bigbench_regular_gemma = pd.read_csv(bigbench_gemma_path+'regular_bigbench_qna.csv')
df_bigbench_aave_gemma = pd.read_csv(bigbench_gemma_path+'aave_llm_bigbench_qna.csv')

df_bigbench_regular_mistral = pd.read_csv(bigbench_gemma_path+'regular_bigbench_qna.csv')
df_bigbench_aave_mistral = pd.read_csv(bigbench_gemma_path+'aave_llm_bigbench_qna.csv')


df_regular_gpt3 = pd.read_csv(gpt3_path+'regular_mmlu_qna.csv')
df_phonate_gpt3 = pd.read_csv(gpt3_path+'aave_phonate_mmlu_qna.csv')
df_llm_gpt3 = pd.read_csv(gpt3_path+'aave_llm_mmlu_qna.csv')
df_multivalue_gpt3 =  pd.read_csv(gpt3_path+'aave_multi_value_mmlu_qna.csv')
df_multi_phonate_gpt3 = pd.read_csv(gpt3_path+'aave_multi_phonate_mmlu_qna.csv')

df_regular_gpt4 = pd.read_csv(gpt4_path+'regular_mmlu_qna.csv')
df_phonate_gpt4 = pd.read_csv(gpt4_path+'aave_phonate_mmlu_qna.csv')
df_multivalue_gpt4 = pd.read_csv(gpt4_path+'aave_multi_value_mmlu_qna.csv')
df_llm_gpt4 = pd.read_csv(gpt4_path+'aave_llm_mmlu_qna.csv')
df_multi_phonate_gpt4 = pd.read_csv(gpt4_path+'aave_multi_phonate_mmlu_qna.csv')

df_regular_llama31 = pd.read_csv(llama31_path + "regular_mmlu_qna.csv")
df_phonate_llama31 = pd.read_csv(llama31_path + "aave_phonate_mmlu_qna.csv")
df_llm_llama31 = pd.read_csv(llama31_path + "aave_llm_mmlu_qna.csv")
df_multi_phonate_llama31 = pd.read_csv(llama31_path + "aave_multi_phonate_mmlu_qna.csv")
df_multivalue_llama31 = pd.read_csv(llama31_path + "aave_multi_value_mmlu_qna.csv")

df_regular_llama32 = pd.read_csv(llama32_path + "regular_mmlu_qna.csv")
df_phonate_llama32 = pd.read_csv(llama32_path + "aave_phonate_mmlu_qna.csv")
df_llm_llama32 = pd.read_csv(llama32_path + "aave_llm_mmlu_qna.csv")
df_multi_phonate_llama32 = pd.read_csv(llama32_path + "aave_multi_phonate_mmlu_qna.csv")
df_multivalue_llama32 = pd.read_csv(llama32_path + "aave_multi_value_mmlu_qna.csv")

df_regular_qwen = pd.read_csv(qwen_path + "regular_mmlu_qna.csv")
df_phonate_qwen = pd.read_csv(qwen_path + "aave_phonate_mmlu_qna.csv")
df_llm_qwen = pd.read_csv(qwen_path + "aave_llm_mmlu_qna.csv")
df_multi_phonate_qwen = pd.read_csv(qwen_path + "aave_multi_phonate_mmlu_qna.csv")
df_multivalue_qwen = pd.read_csv(qwen_path + "aave_multi_value_mmlu_qna.csv")

df_regular_gemma2 = pd.read_csv(gemma_path + "regular_mmlu_qna.csv")
df_phonate_gemma2 = pd.read_csv(gemma_path + "aave_phonate_mmlu_qna.csv")
df_llm_gemma2 = pd.read_csv(gemma_path + "aave_llm_mmlu_qna.csv")
df_multi_phonate_gemma2 = pd.read_csv(gemma_path + "aave_multi_phonate_mmlu_qna.csv")
df_multivalue_gemma2 = pd.read_csv(gemma_path + "aave_multi_value_mmlu_qna.csv")

df_regular_mistral = pd.read_csv(mistral_path + "regular_mmlu_qna.csv")
df_phonate_mistral = pd.read_csv(mistral_path + "aave_phonate_mmlu_qna.csv")
df_llm_mistral = pd.read_csv(mistral_path + "aave_llm_mmlu_qna.csv")
df_multi_phonate_mistral = pd.read_csv(mistral_path + "aave_multi_phonate_mmlu_qna.csv")
df_multivalue_mistral = pd.read_csv(mistral_path + "aave_multi_value_mmlu_qna.csv")

In [3]:
print(extract_model_accuracy(df_regular_llama31))
print(extract_model_accuracy(df_llm_llama31))


0.6568421052631579
0.5298245614035088


In [5]:

matches_regular_gpt4 = extract_model_accuracy(df_regular_gpt4)
matches_phonate_gpt4 = extract_model_accuracy(df_phonate_gpt4)
matches_llm_gpt4 =  extract_model_accuracy(df_llm_gpt4)
matches_multi_value_gpt4 =  extract_model_accuracy(df_multi_phonate_gpt4)
matches_multi_phonate_gpt4 =  extract_model_accuracy(df_multivalue_gpt4)

print(f"the accuracy of sae question from gpt4 is: {matches_regular_gpt4}")
print(f"the accuracy of aave phonate question from gpt4 is: {matches_phonate_gpt4}")
print(f"the accuracy of aave llm question from gpt4 is: {matches_llm_gpt4}")
print(f"the accuracy of aave multivalue question from gpt4 is: {matches_multi_value_gpt4}")
print(f"the accuracy of aave multivalue + phonate question from gpt4 is: {matches_multi_phonate_gpt4}")

the accuracy of sae question from gpt4 is: 0.8259649122807018
the accuracy of aave phonate question from gpt4 is: 0.7785964912280702
the accuracy of aave llm question from gpt4 is: 0.7228070175438597
the accuracy of aave multivalue question from gpt4 is: 0.8031578947368421
the accuracy of aave multivalue + phonate question from gpt4 is: 0.8270175438596491


## Experiment 1: Politeness Classification for LLM Answers
#### This process might take more than 5 mins 

In [None]:
clf_polite = build_politeness_classifier()
politeness_classification_gpt4= predict_politeness(clf_polite, df_regular_gpt4)
print(f"polite answer for gpt4: {politeness_classification_gpt4[0]}")
print(f"neutral answer for gpt4: {politeness_classification_gpt4[1]}")

#### To same your time, this is the complete politeness score for all models and dialects converters . 

In [None]:
df_politeness_score = pd.read_csv('result/politeness_score.csv')
df_politeness_score

## Experiment 2: Readability for LLM Answers

In [None]:
from scipy.stats import sem, t
n = len(aave_flesh_score_gpt4)
standard_error = sem(aave_flesh_score_gpt4)
confidence = 0.95
t_value = t.ppf((1 + confidence) / 2, n - 1)
margin_of_error = t_value * standard_error
margin_of_error

In [None]:
reg_flesh_score_gpt4 = get_readability_score(df_regular_mistral)
aave_flesh_score_gpt4 = get_readability_score(df_llm_mistral)

In [None]:
print(sum(reg_flesh_score_gpt4)/len(reg_flesh_score_gpt4))
print(sum(aave_flesh_score_gpt4)/len(aave_flesh_score_gpt4))

#### After we get the flesch kincaid score, we want to classify each score into the corresponding grade level

In [None]:
grade_level_regular = np.array([categorize_score(score) for score in reg_flesh_score_gpt4])
print("sae answer grade level")
print(np.unique(grade_level_regular, return_counts=True))
grade_level_aave = np.array([categorize_score(score) for score in aave_flesh_score_gpt4])
print("aave answer grade level")
print(np.unique(grade_level_aave, return_counts=True))

#### Again to same your time, I have run all the readability process and here is the result for gpt4 specifically. 

In [None]:
df_readability = pd.read_csv('result/readability_v1.csv')

create_readability_plot(df_readability, 'llama3.1')

## Experiment 3: Linguistic Marker Analysis 

### load LIWC token parser

In [6]:

import liwc
parse, category_names = liwc.load_token_parser('LIWC2007_English100131.dic')

### I have already put together the average token count for the linguistc marker 

In [12]:
df_ling = pd.read_csv('result/linguistic_marker.csv')

In [13]:
df_ling

Unnamed: 0,model,dialect,ppron,i,you,we,they,social,posemo,negemo,tentat,certain,percept
0,gpt4,sae,16.105875,0.616029,2.728548,5.184828,5.631425,46.00825,21.59431,11.945476,28.805472,15.818917,8.447137
1,gpt4,aave,23.978082,0.888688,6.629263,7.422873,6.845614,55.181119,23.346492,11.540331,31.084675,15.671372,10.698212
2,gpt3,sae,20.274755,1.812316,4.364309,6.260919,6.027004,53.242043,24.360895,12.131979,32.979932,18.540411,8.686471
3,gpt3,aave,24.720154,2.215846,6.008119,6.987354,7.399558,59.45384,25.122304,12.064497,36.89122,17.62623,10.568499
4,llama3.1,sae,26.976371,2.810486,4.046316,10.915251,7.016414,57.608893,26.491,10.645497,37.784281,18.3769,12.044673
5,llama3.1,aave,34.407346,3.785347,8.297436,12.158041,8.117716,66.624358,29.436972,9.79248,39.239561,17.340709,14.278734
6,llama3.2,sae,20.309254,1.319671,2.08549,8.285891,6.512199,50.437406,23.873049,11.465409,34.620502,18.508211,10.661299
7,llama3.2,aave,26.166607,1.885467,5.277357,9.347904,7.585069,58.363086,26.839689,11.025733,37.78599,17.810907,13.301393
8,gemma2,sae,23.744831,1.485979,5.393096,6.932512,7.836842,54.607151,32.148935,14.403512,31.394641,17.352861,12.685285
9,gemma2,aave,32.449175,2.356255,10.250553,7.602933,9.750165,64.292395,34.832877,14.386666,33.259929,16.964612,15.24387


## Experiment 4: Uncertainty in the Answer Produced by LLMs.

### Sample of the uncertainty df:
### this is one of the sample repetition of the same set of question using llama3.1 model. 
### It contains the key information such as subject, question text and answer text. More importantly, it has letter answer and correct answer

In [23]:

uncertainty_llama31_df = pd.read_csv('uncertainty_exp/llama3.1/aave_uncertainty_0.csv')
uncertainty_llama31_df.head() 

Unnamed: 0,subject,question,answer,letter_answer,pure_question,correct_answer
0,abstract_algebra,Sup? You know how to help me with this multipl...,To determine the degree of the field extension...,C,Find the degree for the given field extension ...,B
1,anatomy,You can help me with dis multiple choice quest...,I can't answer this question because it requir...,B,A lesion causing compression of the facial ner...,A
2,astronomy,"'Ay, can you help me out with dis multiple cho...","I gotcha! Alright, so let's break down this mu...",A,"What is true for a type-Ia (""type one-a"") supe...",A
3,business_ethics,"'Hey, can you help me with this multiple choic...","I'd be happy to help you out.\n\nOkay, let's b...",A,_______ such as bitcoin are becoming increasin...,C
4,clinical_knowledge,"'What's good, fam? I'm stuck on this multiple ...",I can't help with this request. I can’t provid...,B,What size of cannula would you use in a patien...,A


### this is the higher level statistics of the uncertainty estimation of the answer produced by different LLMS 

In [21]:
uncertainty_df = pd.read_csv('result/uncertainty.csv')
uncertainty_df

Unnamed: 0,model,dialect,entropy,f1 bert score,precision bert score,recall bert score,accurate percentage
0,gemma2,sae,0.489671,0.870655,0.871015,0.870442,0.775
1,gemma2,aave,0.958558,0.835147,0.835622,0.835038,0.666071
2,llama3.1,sae,0.702298,0.848292,0.848651,0.848128,0.723214
3,llama3.1,aave,1.090501,0.812461,0.811925,0.81445,0.583929
4,llama3.2,sae,0.973586,0.851286,0.85087,0.851913,0.607143
5,llama3.2,aave,1.248636,0.826271,0.827092,0.826196,0.485714
6,mistral,sae,0.714853,0.848005,0.848,0.84828,0.592857
7,mistral,aave,1.098438,0.8304,0.830605,0.830462,0.505357
8,qwen2.5,sae,0.412867,0.867275,0.867652,0.867093,0.785714
9,qwen2.5,aave,0.846436,0.84944,0.849118,0.849953,0.6625


## Extra Data: Integrating Big Bench 

In [25]:
df_big_bench = pd.read_csv('result/bigbench_hard.csv')
df_big_bench.head()

Unnamed: 0,question,answer,dataset_name,category
0,"If you follow these instructions, do you retur...",B,BigBench_hard,navigate
1,"If you follow these instructions, do you retur...",B,BigBench_hard,navigate
2,"If you follow these instructions, do you retur...",B,BigBench_hard,navigate
3,"If you follow these instructions, do you retur...",A,BigBench_hard,navigate
4,"If you follow these instructions, do you retur...",B,BigBench_hard,navigate


In [26]:
df_big_bench['category'].unique()

array(['navigate', 'tracking_shuffled_objects_three_objects',
       'temporal_sequences', 'date_understanding', 'penguins_in_a_table',
       'causal_judgement'], dtype=object)