# Load data

In [5]:
from modelscope.msdatasets import MsDataset

ms_dataset = MsDataset.load(
            'R-Bench', namespace='lcysyzxdxc',
            subset_name='default', split='test')



# Inference

In [2]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

name='MiniCPM'
path='openbmb/MiniCPM-V-2'

model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True, torch_dtype=torch.bfloat16)
# For Nvidia GPUs support BF16 (like A100, H100, RTX3090)
model = model.to(device='cuda', dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MiniCPMV(
  (llm): MiniCPMForCausalLM(
    (model): MiniCPMModel(
      (embed_tokens): Embedding(122753, 2304)
      (layers): ModuleList(
        (0-39): 40 x MiniCPMDecoderLayer(
          (self_attn): MiniCPMAttention(
            (q_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (k_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (v_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (o_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (rotary_emb): MiniCPMRotaryEmbedding()
          )
          (mlp): MiniCPMMLP(
            (gate_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (up_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (down_proj): Linear(in_features=5760, out_features=2304, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): MiniCPMRMSNorm()
          (post_attention_layernorm): MiniCPM

In [None]:
def inference(question,choice,task,byte):
    
    if task=='MCQ':
        prompt = question + "\n" + choice +"\nAnswer with the option's letter from the given choices directly."
    elif task=='VQA':
        prompt = question+". Please answer no more than 10 words"
    elif task=='CAP':
        prompt =  "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"
    else:
        raise ValueError('No task named'+task)
        return
                         
    image_file = io.BytesIO(byte)
    image = Image.open(image_file).convert('RGB')    

    answer, context, _ = model.chat(
        image=image,
        msgs=[{'role': 'user', 'content': prompt}],
        context=None,
        tokenizer=tokenizer,
        sampling=True,
        temperature=0.7
    )
                         
    return answer

In [None]:
results = pd.DataFrame(columns=['name','answer','type','distortion','strength'])

for num in range(len(val)):

    answer=inference(question=ms_dataset[num]['question'],choice=ms_dataset[num]['choice'],task=ms_dataset[num]['type'],image=ms_dataset[num]['ref_image']['bytes'])
    
    results = results.append({'name': val['name'][num], 'answer': answer, 'type': val['type'][num], 'distortion': val['distortion'][num], 'strength': val['strength'][num]}, ignore_index=True)
    if num%100==99:
        print(str(num+1) +'reference item finished!')

results.to_csv('./exp/'+model_name+'_ref.csv')

results = pd.DataFrame(columns=['name','answer','type','distortion','strength'])

for num in range(len(val)):

    answer=inference(question=ms_dataset[num]['question'],choice=ms_dataset[num]['choice'],task=ms_dataset[num]['type'],image=ms_dataset[num]['dis_image']['bytes'])
    
    results = results.append({'name': val['name'][num], 'answer': answer, 'type': val['type'][num], 'distortion': val['distortion'][num], 'strength': val['strength'][num]}, ignore_index=True)
    if num%100==99:
        print(str(num+1) +' distorted item finished!')

results.to_csv('./exp/'+model_name+'_dis.csv')

# Evaluate

In [None]:
# !pip install openai
import json

from openai import OpenAI
client = OpenAI(api_key = "Your api key")
import time

def get_completion(msg):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": msg}
        ]
    )
    return completion.choices[0].message.content

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd

name='Your model name'

ans_file=pd.read_csv("Ref_Release.csv")
dis_file=pd.read_csv('exp/'+name+'_dis.csv')
ref_file=pd.read_csv('exp/'+name+'_ref.csv')

In [None]:
ans_file['result']=0
for num in range(len(ans_file)):
    question=ans_file['question'][num]
    answers=ans_file['choice'][num]
    correct_ans=ans_file['answer'][num]
    answer=dis_file['answer'][num]

    if ans_file['type'][num]=='MCQ':
        for i in range(5):
            msg = f'''You will now be provided with a question [{question}] and a set of options [{answers}] with option [{correct_ans}] being the correct answer.
            Additionally, there will be an answer [{answer}] provided by a respondent. Please determine whether the respondent's answer is correct considering the context of the question.
            Even if the word choice is not completely the same, you can decide based on the given options and see whether the one in the answer is close enough to the given correct answer
            The result is 1 if the answer is correct and else the result is 0. Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1']:
                    tmp=int(result[-1])

                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))
            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=int(ans_file.loc[num, "result"]/5+0.5)


    elif ans_file['type'][num]=='VQA':
        for i in range(5):
            msg= f'''Given the question [{question}], evaluate whether the response [{answer}] completely matches the correct answer [{correct_ans}]. 
            First, check the response and please rate score 0 if the response is not a valid answer.
            Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 
            Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance.
            Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all.
            Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1','2']:
                    tmp=int(result[-1])
                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))     

            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=ans_file.loc[num, "result"]/10

    elif ans_file['type'][num]=='CAP':
        corrects=eval(correct_ans)
        for correct in corrects:
            msg= f'''Evaluate whether the sentence [{answer}] completely matches the correct answer [{correct}]. 
            First, check the response and please rate score 0 if the response is not a valid answer.
            Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 
            Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance.
            Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all.
            Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1','2']:
                    tmp=int(result[-1])
                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))   

            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=ans_file.loc[num, "result"]/(2*len(corrects))
    else:
        print('Error:'+str(num))

    if num%20==1:
        print(num)
        time.sleep(5)

ans_file.to_csv('./result/Result_dis_'+name+'.csv')

In [None]:
ans_file['result']=0
for num in range(len(ans_file)):
    question=ans_file['question'][num]
    answers=ans_file['choice'][num]
    correct_ans=ans_file['answer'][num]
    answer=ref_file['answer'][num]

    if ans_file['type'][num]=='MCQ':
        for i in range(5):
            msg = f'''You will now be provided with a question [{question}] and a set of options [{answers}] with option [{correct_ans}] being the correct answer.
            Additionally, there will be an answer [{answer}] provided by a respondent. Please determine whether the respondent's answer is correct considering the context of the question.
            Even if the word choice is not completely the same, you can decide based on the given options and see whether the one in the answer is close enough to the given correct answer
            The result is 1 if the answer is correct and else the result is 0. Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1']:
                    tmp=int(result[-1])

                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))
            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=int(ans_file.loc[num, "result"]/5+0.5)


    elif ans_file['type'][num]=='VQA':
        for i in range(5):
            msg= f'''Given the question [{question}], evaluate whether the response [{answer}] completely matches the correct answer [{correct_ans}]. 
            First, check the response and please rate score 0 if the response is not a valid answer.
            Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 
            Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance.
            Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all.
            Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1','2']:
                    tmp=int(result[-1])
                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))     

            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=ans_file.loc[num, "result"]/10

    elif ans_file['type'][num]=='CAP':
        corrects=eval(correct_ans)
        for correct in corrects:
            msg= f'''Evaluate whether the sentence [{answer}] completely matches the correct answer [{correct}]. 
            First, check the response and please rate score 0 if the response is not a valid answer.
            Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 
            Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance.
            Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all.
            Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1','2']:
                    tmp=int(result[-1])
                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))   

            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=ans_file.loc[num, "result"]/(2*len(corrects))
    else:
        print('Error:'+str(num))

    if num%20==1:
        print(num)
        time.sleep(5)

ans_file.to_csv('./result/Result_ref_'+name+'.csv')

In [None]:
ans_file['result']=0
for num in range(len(ans_file)):
    question=ans_file['question'][num]
    answers=ans_file['choice'][num]
    correct_ans=ref_file['answer'][num]
    answer=dis_file['answer'][num]

    if ans_file['type'][num]=='MCQ':
        for i in range(5):
            msg = f'''You will now be provided with a question [{question}] and a set of options [{answers}] with option [{correct_ans}] being the correct answer.
            Additionally, there will be an answer [{answer}] provided by a respondent. Please determine whether the respondent's answer is correct considering the context of the question.
            Even if the word choice is not completely the same, you can decide based on the given options and see whether the one in the answer is close enough to the given correct answer
            The result is 1 if the answer is correct and else the result is 0. Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1']:
                    tmp=int(result[-1])

                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))
            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=int(ans_file.loc[num, "result"]/5+0.5)


    elif ans_file['type'][num]=='VQA':
        for i in range(5):
            msg= f'''Given the question [{question}], evaluate whether the response [{answer}] completely matches the correct answer [{correct_ans}]. 
            First, check the response and please rate score 0 if the response is not a valid answer.
            Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 
            Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance.
            Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all.
            Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1','2']:
                    tmp=int(result[-1])
                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))     

            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=ans_file.loc[num, "result"]/10

    elif ans_file['type'][num]=='CAP':
        for i in range(5):
            msg= f'''Evaluate whether the sentence [{answer}] completely matches the correct answer [{correct_ans}]. 
            First, check the response and please rate score 0 if the response is not a valid answer.
            Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 
            Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance.
            Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all.
            Please only provide the result in the following format: Score:'''

            try:
                result=get_completion(msg)
                if result[-1] in ['0','1','2']:
                    tmp=int(result[-1])
                else:
                    tmp=0
            except:
                tmp=0
                print('Error '+str(num))   

            ans_file.loc[num, "result"]=ans_file.loc[num, "result"]+tmp
        ans_file.loc[num, "result"]=ans_file.loc[num, "result"]/10
    else:
        print('Error:'+str(num))

    if num%20==1:
        print(num)
        time.sleep(5)

ans_file.to_csv('./result/Result_ref_'+name+'.csv')

# Show final result

In [26]:
import pandas as pd
name="llava15"
dis=pd.read_csv("result/Result_dis_"+name+".csv")
com=pd.read_csv("result/Result_com_"+name+".csv")
ref=pd.read_csv("result/Result_ref_"+name+".csv")

In [27]:
Type=[]
Type.append(dis[(dis['type'] =='MCQ') ].index.tolist())
Type.append(dis[(dis['type'] =='VQA') ].index.tolist())
Type.append(dis[(dis['type'] =='CAP') ].index.tolist())
Strength=[]
Strength.append(dis[(dis['strength'] == 1) ].index.tolist())
Strength.append(dis[(dis['strength'] == 2) ].index.tolist())
Strength.append(dis[(dis['strength'] == 3) ].index.tolist())
Step=[]
Step.append([index for index in range(len(dis)) if dis['distortion'][index] in [1,2,3,4]])
Step.append([index for index in range(len(dis)) if dis['distortion'][index] in [5,6,7,8]])
Step.append([index for index in range(len(dis)) if dis['distortion'][index] in [9,10,11,12,13]])
Step.append([index for index in range(len(dis)) if dis['distortion'][index] in [14,15,16,17,18]])
Step.append([index for index in range(len(dis)) if dis['distortion'][index] in [19,20,21,22]])
Step.append([index for index in range(len(dis)) if dis['distortion'][index] in [23,24,25,26,27]])
Step.append([index for index in range(len(dis)) if dis['distortion'][index] in [28,29,30,31,32,33]])
Group=[]
Group.append([index for index in range(len(dis)) if dis['distortion'][index] in [1,5,28]])
Group.append([index for index in range(len(dis)) if dis['distortion'][index] in [25,26,27]])
Group.append([index for index in range(len(dis)) if dis['distortion'][index] in [14,23,24,29,30,32,33]])
Group.append([index for index in range(len(dis)) if dis['distortion'][index] in [6,13,18,19,20,21,22]])
Group.append([index for index in range(len(dis)) if dis['distortion'][index] in [9,10,11,12,31]])
Group.append([index for index in range(len(dis)) if dis['distortion'][index] in [15,16,17]])
Group.append([index for index in range(len(dis)) if dis['distortion'][index] in [2,3,4,7,8]])

In [28]:
TaS=[]
TaS.append(dis[(dis['type'] =='MCQ') & (dis['strength'] == 1)].index.tolist())
TaS.append(dis[(dis['type'] =='MCQ') & (dis['strength'] == 2)].index.tolist())
TaS.append(dis[(dis['type'] =='MCQ') & (dis['strength'] == 3)].index.tolist())
TaS.append(dis[(dis['type'] =='VQA') & (dis['strength'] == 1)].index.tolist())
TaS.append(dis[(dis['type'] =='VQA') & (dis['strength'] == 2)].index.tolist())
TaS.append(dis[(dis['type'] =='VQA') & (dis['strength'] == 3)].index.tolist())
TaS.append(dis[(dis['type'] =='CAP') & (dis['strength'] == 1)].index.tolist())
TaS.append(dis[(dis['type'] =='CAP') & (dis['strength'] == 2)].index.tolist())
TaS.append(dis[(dis['type'] =='CAP') & (dis['strength'] == 3)].index.tolist())

format:<br>
Task:&emsp;&emsp;MCQ&emsp;VQA&emsp;CAP<br>
Strength:&emsp;high&emsp;mid&emsp;low<br>
Step:&emsp;&emsp;Enviro&emsp;Camera&emsp;Analog&emsp;Source&emsp;Channel&emsp;Receive&emsp;Enhance<br>
Group:&emsp;&emsp;Blur&emsp;Lumi&emsp;Chromi&emsp;Spatial&emsp;Noise&emsp;Comp&emsp;Wild<br>

In [30]:
import numpy as np
print(name)
print('Absolute Robustness')
for tmp in [Type,Strength,Step,Group]:
    for num in range(len(tmp)):
        print('%.4f' % np.mean(dis['result'][tmp[num]]),end='\t')
    print('')


llava15
Absolute Robustness
0.4364	0.5980	0.3455	
0.4679	0.4398	0.4721	
0.4024	0.4417	0.4560	0.4964	0.5067	0.4862	0.4304	
0.4459	0.4570	0.4654	0.4781	0.4600	0.5193	0.4014	


In [31]:
import numpy as np
print(name)
print('Relative Robustness')
for tmp in [Type,Strength,Step,Group]:
    for num in range(len(tmp)):
        print('%.4f' % np.mean(com['result'][tmp[num]]*ref['result'][tmp[num]]),end='\t')
    print('')

llava15
Relative Robustness
0.3758	0.4920	0.1896	
0.3774	0.3273	0.3526	
0.3309	0.3542	0.3298	0.3677	0.4094	0.3619	0.3258	
0.3658	0.3543	0.3453	0.3772	0.3431	0.3830	0.3096	
