In [189]:
from transformers import LlamaForSequenceClassification, LlamaTokenizer, TrainingArguments, Trainer
import numpy as np
import os
import zipfile
import json

import csv
import io

import requests
import base64
from io import BytesIO
from PIL import Image
import pandas as pd
import random
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [104]:
folder_path = '/Users/hwatervalley/UiS/DAT_550_datamining/TiToHeDAT550/data/CT23_1A_checkworthy_multimodal_english_v2'
zip_file_path = '/Users/hwatervalley/UiS/DAT_550_datamining/TiToHeDAT550/dataCT23_1A_checkworthy_multimodal_english_v2.zip'

def zip_extration(folder_path, zip_file_path):
    print('Zip file extraction started')
    if not os.path.exists(folder_path):
        print('Folder does not exist, extracting zip file')
        os.makedirs(folder_path)

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(folder_path)
    
    print('Zip file extracted')

zip_extration(folder_path, zip_file_path)

train_path = folder_path + '/CT23_1A_checkworthy_multimodal_english_dev_test.jsonl'
test_path = folder_path + '/CT23_1A_checkworthy_multimodal_english_test.jsonl'

def split_json(data):
    text_data = {
        'tweet_id': data['tweet_id'],
        'tweet_url': data['tweet_url'],
        'text': data['tweet_text'] + data['ocr_text'],
        'class_label': data['class_label']
    }

    image_data = {
        'tweet_id': data['tweet_id'],
        'tweet_url': data['tweet_url'],
        'class_label': data['class_label'],
        'image_path': data['image_path'],
        'image_url': data['image_url']
    }

    return text_data, image_data


# Read data from the folder
def read_data(file_path):
    text_data = []
    image_data = []
    with open(file_path, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            text, image = split_json(json_obj)
            text_data.append(text)
            image_data.append(image)
    return text_data, image_data

train_text_data, train_image_data = read_data(train_path)
#test_text_data, train_image_data = read_data(test_path)
print(f'Text: {train_text_data[0]}')
print(f'Image: {train_image_data[0]}')


Zip file extraction started
Zip file extracted
Text: {'tweet_id': '1032686443846610944', 'tweet_url': 'https://twitter.com/user/status/1032686443846610944', 'text': 'What Is The Albedo Effect? Find out here: https://t.co/aULoitMY50\n#climatechange #climateaction \n#environment \n#energy https://t.co/XyfhOORoCw"Albedo effect: As the Arctic loses\nsnow & ice, bare rock and water\nabsorb more of the sun\'s energy,\nmaking it even warmer\'. -WWE\nPhoto. Anne-Maria Yritys. LeviFinland Httpsv/www.leadingwithpassion.org\n', 'class_label': 'No'}
Image: {'tweet_id': '1032686443846610944', 'tweet_url': 'https://twitter.com/user/status/1032686443846610944', 'class_label': 'No', 'image_path': 'images_labeled/dev_test/1032686443846610944.jpg', 'image_url': 'http://pbs.twimg.com/media/DlTW2O_WsAUufZ1.jpg'}


In [9]:
#get the relevant data pairs
def format_dataset(train_text_data):
    formatted_data = []
    for item in train_text_data:
        tweet_id = item['tweet_id']
        text = item['text']
        label = item['class_label']
        
        formatted_data.append({'tweet_id':tweet_id, 'text': text, 'label': label})
    
    return formatted_data

formatted_dataset = format_dataset(train_text_data)
print(formatted_dataset[0])
len(formatted_dataset)

# Writing formatted_dataset to a CSV file
csv_file_path = '/Users/hwatervalley/UiS/DAT_550_datamining/tweetid_text_label.csv'
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['tweet_id','text', 'label'])
    writer.writeheader()
    for data in formatted_dataset:
        writer.writerow(data)


{'tweet_id': '1032686443846610944', 'text': 'What Is The Albedo Effect? Find out here: https://t.co/aULoitMY50\n#climatechange #climateaction \n#environment \n#energy https://t.co/XyfhOORoCw"Albedo effect: As the Arctic loses\nsnow & ice, bare rock and water\nabsorb more of the sun\'s energy,\nmaking it even warmer\'. -WWE\nPhoto. Anne-Maria Yritys. LeviFinland Httpsv/www.leadingwithpassion.org\n', 'label': 'No'}


In [35]:
#Need to format the images into base64 encoding in order for the llava model to be able to describe them
folder_path = '/Users/hwatervalley/UiS/DAT_550_datamining/TiToHeDAT550/data/CT23_1A_checkworthy_multimodal_english_v2/images_labeled/dev_test'

#Changed size to reduce lenth of encoding
new_size = (44, 44) 

data = []
files = [f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg'))]

for filename in files:
    file_path = os.path.join(folder_path, filename)
    with Image.open(file_path) as image:
        #rezise and use LANCZOS to keep quality
        image = image.resize(new_size, Image.LANCZOS) 
        #BytesIO in order to convert image to bytes
        img_byte_arr = io.BytesIO()
        #default to PNG if format is none, jpeg had an error
        image_format = image.format if image.format is not None else 'PNG'
        image.save(img_byte_arr, format=image_format)
        #perform encoding
        img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')

        data.append({'filename': filename, 'base64': img_base64})

base64_df = pd.DataFrame(data)
base64_df.to_csv('~/Downloads/TiToHe_base64.csv', index=False)


In [25]:
data

[{'filename': '1033026924472201217.jpg',
  'base64': 'iVBORw0KGgoAAAANSUhEUgAAACwAAAAsCAIAAACR5s1WAAAQiUlEQVR4nG1WR68lx3mt3DncHF6aN5lDDilyOBRFCiIlQrIsyUGwDMOGl14b/hveeuPf4LXhjeGlIAkWlRiGM+SEl++7sXOs5MV7MyQlV6ML1VVAf6dPn3Oq4D/9y7/OVpmU+up232BsOvDHg1AptU6qrGhcxzg+z7jUe5OOaxl5I6OC2ybjAv7q909/+Z//rqqZ1EArDYDWAACtAQQAAK21Bn/cIIR/OgeARsfnsZCSIPj5wVwD3Qns+SrRSk0Gnm3SOKu3hh5GehnlVdt6JnENIqWyqH779Rt3vv23gPgYghcvhxACDfXXi8D/H4F+0aN1kmMAru30xn3/4HT96ZOZkJJLmeb1eOBZFCGopn0vK+pVXBZ12/cNDAFXwGbye++9c+XeXwPIEELgeTHwHAUE+nmlF2W/JEjrLxGiD966yRj59PEMIzDuueuk/M2nxw+fLX/10cF8mU5HvmcbBKNR10mSYpNkedWMQksI0bQysOQH3/9+/+YHEECMyfNvvew1gBBACCAA8GIJ6i/5gBACoC7GZNzzxv3w8GyDoF4lJQSoF9i+zbRScV47NuuFrsFagiHncpNUnGs8gls9+3RTCqkmHfLBj3/2X+mymP1GI6SVhBAADaHWCkAAAdDwBR8QIP2CGw0ARBdL8P2/+udXb+7cvDpljB7PIikVF7rf9TBGVSMRBP3QEVJ2PHO2LlZxnRaNYbD9aZcyY5nUFGMOyB8ezf7nP/5NZM+kAlqrS2FqrSGAF2X/VJH6y0m0NQwfHsz/++efHJ+tuoEzHXWv7vSzommFtEyCIJJKPz2NP3m2HHTsUcemGHEhjxdJXVd93wRAA8Vfuja5/4N/BLSLEfi6SiGAEEEI/1geX4OFGCX3Xt6rW/7z334+m68+en

In [38]:
#requires a lot if time, evaluate to split up base64_images
def describe_images(base64_images):
    #API endpoint
    url = "http://localhost:11434/api/generate"
    
    descriptions = []
    
    #content type is JSON
    headers = {'Content-Type': 'application/json'}
    
    for string in base64_images:
        base64_string = string['base64']
        filename = string['filename']
        data = json.dumps({
            "model": "llava",
            "prompt": "What is in this picture?",
            "stream": False,
            "images": [base64_string]
        })
        
        # Send POST request
        response = requests.post(url, headers=headers, data=data)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the response JSON and store the description
            result = response.json()
            descriptions.append({'filename':filename,'description':result.get('response')})
        else:
            # Handle possible HTTP errors
            descriptions.append({'filename':filename,'description':f"Failed to retrieve description: HTTP {response.status_code}"})
    
    return descriptions


image_descriptions = describe_images(data)
descriptions = pd.DataFrame(image_descriptions)
descriptions.to_csv('~/Downloads/TiToHe_descriptions_rest.csv')


In [37]:
image_descriptions

[{'filename': '1033026924472201217.jpg',
  'description': ' This is an image of Earth taken from space. The photo captures the planet with its oceans, landmasses, and atmosphere visible. It provides a perspective that we do not often see from our daily lives on Earth, showcasing the vastness of our home planet and how small it appears when viewed from space. '},
 {'filename': '1033687781468303361.jpg',
  'description': " The picture shows a sky with multiple contrails, which are the visible trails left by aircraft as they pass through the Earth's atmosphere. These lines of condensed water vapor form in the high-altitude jet streams behind an aircraft and can spread out to create complex patterns. "},
 {'filename': '1032891600425418754.jpg',
  'description': " The image shows a stylized map of the United States with various areas highlighted in different colors. The map uses a color scale to represent some sort of data, which appears to be temperature or weather conditions, given the su

In [57]:
tweets = {'filename': [item['tweet_id'] for item in train_text_data],
          'tweet': [item['text'] for item in train_text_data],
          'label': [item['class_label'] for item in train_text_data]
         }
tweets

{'filename': ['1032686443846610944',
  '1032687965372403712',
  '1032694287757176832',
  '1032758725499662337',
  '1032822100120285184',
  '1032948204239773696',
  '1032988020457254914',
  '1033007295796408320',
  '1033085428998389760',
  '1033191981411840000',
  '1033373834488045569',
  '1033386922784239616',
  '1033463814254157825',
  '1033477968612470784',
  '1033768608432889856',
  '1033788818246303747',
  '1033813920958558209',
  '1034100633551429632',
  '1034216798533476353',
  '1034269438646931458',
  '1034300820542578693',
  '1034463742824771584',
  '1034520308638732288',
  '1034636350778208256',
  '1034728664418664448',
  '1034818127702286336',
  '1034891246513336322',
  '1035028296101027840',
  '1035044040142741505',
  '1035147129541423104',
  '1035199548560482305',
  '1035200496170983428',
  '1035241197764325376',
  '1035248852427853824',
  '1035276746180898816',
  '1035284656701222914',
  '1126253138569199616',
  '1132767669348970499',
  '1132859007977316352',
  '1135072337

In [58]:
len(tweets['label'])

548

In [66]:
#Use same function to determine model label for tweet text, with contro llama3 model
def describe_tweets(tweets):
    # URL of the API endpoint
    url = "http://localhost:11434/api/generate"
    
    descriptions = []
    
    #content type is JSON
    headers = {'Content-Type': 'application/json'}
    
    # Iterate over each base64 image in the list
    for i in range(len(tweets['label'])):  # Ensures that we do not exceed the number of items or 3000
        filename = tweets['filename'][i]
        tweet = tweets['tweet'][i]
        label = tweets['label'][i]
    
        #payload with the data to send
        data = json.dumps({
            "model": "contro",
            "prompt": tweet.strip(),
            "stream": False
        })
        
        response = requests.post(url, headers=headers, data=data)
        
        if response.status_code == 200:
            result = response.json()
            descriptions.append({'filename':filename,'model_label':result.get('response')})
        else:
            descriptions.append({'filename':filename,'model_label':f"Failed to retrieve description: HTTP {response.status_code}"})
    
    return descriptions


model_tweet = describe_tweets(tweets)
model_tweet_label = pd.DataFrame(model_tweet)
model_tweet_label.to_csv('~/Downloads/TiToHe_model_tweet_labels.csv')


In [71]:
for key in image_descriptions:
    print(key['description'])

 The image shows a photograph of the Earth from space, taken by an astronaut aboard the International Space Station (ISS). You can see the curvature of the planet as viewed from orbit. The perspective is such that the horizon is visible with the landmasses forming the darker areas near the edge, and the atmosphere and clouds giving the lighter, more colorful regions in the center. This type of image provides a unique view of our home planet. 
 The picture shows a sky with contrails from an airplane. These condensation trails are visible at high altitudes and typically dissipate as they mix with the atmosphere. They appear to be crisscrossing in this image, suggesting that multiple aircraft were flying in the same area or during the same time frame. The background is slightly blurred due to the long exposure used to capture the photo, which emphasizes the contrails. 
 The image shows a map of the United States with a color gradient overlaid on it. This gradient is likely to represent so

In [74]:
#Use same function to determine model label for image, with contro llama3 model
def describe_tweets(image_descriptions):
    # URL of the API endpoint
    url = "http://localhost:11434/api/generate"
    
    descriptions = []
    
    #content type is JSON
    headers = {'Content-Type': 'application/json'}
    
    #iterate over all image descriptions
    for key in image_descriptions:  
        filename = key['filename']
        image = key['description']
    
        #payload with the data to send
        data = json.dumps({
            "model": "contro",
            "prompt": image.strip(),
            "stream": False
        })
        
        response = requests.post(url, headers=headers, data=data)
        
        if response.status_code == 200:
            result = response.json()
            descriptions.append({'filename':filename,'model_label':result.get('response')})
        else:
            descriptions.append({'filename':filename,'model_label':f"Failed to retrieve description: HTTP {response.status_code}"})
    
    return descriptions


model_image = describe_tweets(image_descriptions)
model_image_label = pd.DataFrame(model_image)
model_image_label.to_csv('~/Downloads/TiToHe_model_image_labels.csv')


In [76]:
for response in model_tweet[:1]:
    print(response)

{'filename': '1032686443846610944', 'model_label': 'no'}


In [78]:
#correction of responses, sentences were changed to Yes due to the responses being subjects the model did not want to respond to
def correction(model_tweet_label):
    corrected = []
    for response in model_tweet_label:
        filename = response['filename']
        if response['model_label'].lower() == 'no':
            corrected.append({'filename':filename,'model_label':'No'})
        elif response['model_label'].lower() == 'yes':
            corrected.append({'filename':filename,'model_label':'Yes'})
        else:
            corrected.append({'filename':filename,'model_label':'Yes'})
    return corrected
tweet_label = correction(model_tweet)
image_label = correction(model_image)

In [80]:
label = [item['class_label'] for item in train_text_data]

['No',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',


In [94]:
mod_tweet_labels = [item['model_label'] for item in tweet_label]
mod_image_labels = [item['model_label'] for item in image_label]


In [97]:
pred_df = pd.DataFrame({'Model_tweet': mod_tweet_labels,'Model_image':mod_image_labels,'Label':label})
pred_df

Unnamed: 0,Model_tweet,Model_image,Label
0,No,No,No
1,Yes,No,Yes
2,Yes,No,No
3,Yes,No,No
4,No,No,Yes
...,...,...,...
543,No,No,No
544,No,No,No
545,Yes,No,No
546,No,No,No


In [99]:
#create list of combined model responses, if one or more is yes then combined predictions is yes
Model_combined = pred_df.apply(lambda row: "Yes" if "Yes" in [row['Model_tweet'], row['Model_image']] else "No", axis=1).tolist()
pred_df = pd.DataFrame({'Model_tweet': mod_tweet_labels,'Model_image':mod_image_labels,'Model_combined':Model_combined,'Label':label})
pred_df



Unnamed: 0,Model_tweet,Model_image,Model_combined,Label
0,No,No,No,No
1,Yes,No,Yes,Yes
2,Yes,No,Yes,No
3,Yes,No,Yes,No
4,No,No,No,Yes
...,...,...,...,...
543,No,No,No,No
544,No,No,No,No
545,Yes,No,Yes,No
546,No,No,No,No


In [190]:
accuracy = accuracy_score(label, Model_combined)
precision = precision_score(label, Model_combined, pos_label='Yes')
recall = recall_score(label, Model_combined, pos_label='Yes')
f1 = f1_score(label, Model_combined, pos_label='Yes')


In [191]:
print("accuracy",accuracy)
print("precision",precision)
print("recall",recall)
print("f1",f1)

accuracy 0.6496350364963503
precision 0.45161290322580644
recall 0.4827586206896552
f1 0.4666666666666667


In [131]:
n_bootstrap_samples = 100
for _ in range(n_bootstrap_samples):
    bootstrap_sample = [random.choice(train_text_data) for _ in range(100)]

In [110]:
tweets_df = pd.DataFrame(tweets)

In [123]:
#remove ".jpg" from filename to merge dataframes
descriptions['filename'] = descriptions['filename'].str.replace('.jpg', '', case=False, regex=True)

In [122]:
complete_df = pd.merge(tweets_df, descriptions, on='filename', how='outer')
complete_df

Unnamed: 0,filename,tweet,label,description
0,1032686443846610944,What Is The Albedo Effect? Find out here: http...,No,The image shows a poster with text and an ill...
1,1032687965372403712,"""Big oil asks government to protect its Texas ...",Yes,The image shows a comic strip with three pane...
2,1032694287757176832,Why is climate change a global problem? https:...,No,The image shows a sunset over a body of water...
3,1032758725499662337,How can greenhouse gases be removed from the a...,No,The image shows a sunset or sunrise with the ...
4,1032822100120285184,"""Paris Agreement 2015/Art.2.1(c): Making finan...",Yes,The image shows a tranquil scene at sunset or...
...,...,...,...,...
543,1246044667025567744,Fintech Apps Offer Financial First Aid For Har...,No,The image shows a glass door with signs indic...
544,1246352826709594112,"Bitcoin Dips Below 6,741.3 Level, Down 5% - Ya...",No,"In the image, you can see a person standing i..."
545,1246665764410556420,True fact: while cyberpunk depicted possible f...,No,"The image shows a book titled ""The World of D..."
546,1250361549555892224,Easy game easy life wallah mobile game but som...,No,"The image shows two stylized characters, poss..."


In [173]:
def get_label(list):
    # URL of the API endpoint
    url = "http://localhost:11434/api/generate"
    
    #list for labels
    label = []
    
    # Headers to specify that the content type is JSON
    headers = {'Content-Type': 'application/json'}

    #for tweets
    for i in range(len(list)): 
        tweet = list[i]
    
        #payload with the data to send
        data = json.dumps({
            "model": "contro",
            "prompt": tweet.strip(),
            "stream": False
        })
        
        response = requests.post(url, headers=headers, data=data)
        
        if response.status_code == 200:
            result = response.json()
            label.append(result.get('response'))
        else:
            label.append(f"Failed to retrieve description: HTTP {response.status_code}")
            
    return label

In [174]:
#correction of responses, sentences were changed to Yes due to the responses being subjects the model did not want to respond to
def correction(model_tweet_label):
    corrected = []
    for response in model_tweet_label:
        if response.lower() == 'no':
            corrected.append('No')
        elif response.lower() == 'yes':
            corrected.append('Yes')
        else:
            corrected.append('Yes')
    return corrected


In [192]:
def pipeline(bootstrap_sample):
    #set for faster lookup
    tweet_ids = set([item['tweet_id'] for item in bootstrap_sample])

    # Filter the DataFrame
    filtered_df = complete_df[complete_df['filename'].isin(tweet_ids)]

    tweets = filtered_df['tweet']
    tweets = tweets.to_list()
    descriptions = filtered_df['description']
    descriptions = descriptions.to_list()
    labels = filtered_df['label']
    labels = labels.to_list()

    #lists for labels
    tweet_label = get_label(tweets)
    image_label = get_label(descriptions)

    #correction
    tweet_label = correction(tweet_label)
    image_label = correction(image_label)
    #print(len(tweet_label),len(image_label),len(labels))

    #matrices
    pred_df = pd.DataFrame({'Model_tweet': tweet_label,'Model_image':image_label,'Label':labels})
    #create list of combined model responses, if one or more is yes then combined predictions is yes
    Model_combined = pred_df.apply(lambda row: "Yes" if "Yes" in [row['Model_tweet'], row['Model_image']] else "No", axis=1).tolist()

    accuracy = accuracy_score(labels, Model_combined)
    precision = precision_score(labels, Model_combined, pos_label='Yes')
    recall = recall_score(labels, Model_combined, pos_label='Yes')
    f1 = f1_score(labels, Model_combined, pos_label='Yes')

    
    return accuracy, precision, recall, f1

In [193]:
pipeline(bootstrap_sample)

(0.7252747252747253, 0.391304347826087, 0.45, 0.4186046511627907)

In [194]:
#store the results
accuracy = []
precision = []
recall = [] 
f1_list = []

In [197]:
#run multiple times
n_bootstrap_samples = 100
for _ in range(20):
    bootstrap_sample = [random.choice(train_text_data) for _ in range(100)]

    ac, pre, rec, f1 = pipeline(bootstrap_sample)
    print(ac,pre,rec,f1)

    accuracy.append(ac)
    precision.append(pre)
    recall.append(rec)
    f1_list.append(f1)

0.6853932584269663 0.5161290322580645 0.5517241379310345 0.5333333333333333
0.6947368421052632 0.5714285714285714 0.5882352941176471 0.5797101449275363
0.7216494845360825 0.48148148148148145 0.5 0.49056603773584906
0.5795454545454546 0.30303030303030304 0.4166666666666667 0.3508771929824562
0.5869565217391305 0.42424242424242425 0.42424242424242425 0.4242424242424243
0.6263736263736264 0.4523809523809524 0.6333333333333333 0.5277777777777778
0.6739130434782609 0.53125 0.53125 0.53125
0.6702127659574468 0.42424242424242425 0.5384615384615384 0.47457627118644075
0.6022727272727273 0.3333333333333333 0.2962962962962963 0.3137254901960785
0.6136363636363636 0.5151515151515151 0.4857142857142857 0.5
0.6179775280898876 0.3076923076923077 0.3333333333333333 0.32
0.6129032258064516 0.41379310344827586 0.3870967741935484 0.39999999999999997
0.6777777777777778 0.37037037037037035 0.45454545454545453 0.40816326530612246
0.5714285714285714 0.37142857142857144 0.43333333333333335 0.4
0.692307692307

[0.6703296703296703, 0.6555555555555556, 0.5730337078651685]

In [200]:
metrics_data = {
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Mean': [
        np.mean(accuracy),
        np.mean(precision),
        np.mean(recall),
        np.mean(f1_list)
    ],
    'Standard Deviation': [
        np.std(accuracy, ddof=1),
        np.std(precision, ddof=1),
        np.std(recall, ddof=1),
        np.std(f1_list, ddof=1)
    ]
}

# Create a DataFrame
metrics_df = pd.DataFrame(metrics_data)
metrics_df.to_csv('~/Downloads/TiToHe_metrics_20.csv', index=False)