In [None]:
# Colab setup: install the `openai` package into the runtime.
# NOTE(review): the request cells below call the REST endpoint through `requests`;
# confirm this package is still needed.
!pip install openai

In [None]:
# Make sure `unzip` is installed, then extract the dataset archive.
# Presumably this produces the `cyberbullying_data_clean/` folder read later — confirm.
!sudo apt-get install unzip
!unzip /content/cyberbullying_data_clean.zip

In [None]:
from google.colab import drive

# Mount Google Drive; the result CSVs are written under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
from tqdm import tqdm
import random
import pickle

**CAREFUL WITH THE API KEY!** Never share or commit this notebook with a real key in it.

In [None]:
import base64
import requests

# OpenAI API key.
# The key is taken from the OPENAI_API_KEY environment variable; if that is not
# set, it is asked for with a non-echoing prompt. This keeps a real secret out
# of the notebook source and its saved outputs.
from getpass import getpass

api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Headers sent with every chat-completions request.
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}


In [None]:
def encode_image(image_path):
  """Return the contents of the file at `image_path` as a base64 string.

  The file is read in binary mode; the base64 bytes are decoded as UTF-8 so
  the result is a plain `str`.
  """
  with open(image_path, "rb") as handle:
    raw_bytes = handle.read()
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode('utf-8')


def response_text_to_label(response_text):
  """Map a model reply to a label based on which answer option it cites.

  The prompt offers the options "a. Yes" and "b. No". Only a stand-alone
  option marker counts: a lowercase `a` or `b` that is not glued to a
  preceding letter, digit or dot, followed by a dot that ends the token.
  A plain substring test would also fire on ordinary words that end a
  sentence ("camera.", "verb."), mislabelling the reply.

  Args:
    response_text: Text returned by the model.

  Returns:
    1 if only option "a." is cited, 0 if only option "b." is cited, and -1
    (uncertain) if neither or both are cited, or if the input is not a string.
  """
  import re

  if not isinstance(response_text, str):
    return -1

  # Set of distinct option letters that appear as stand-alone markers.
  markers = set(re.findall(r'(?<![A-Za-z0-9.])([ab])\.(?!\w)', response_text))

  if markers == {'a'}:
    return 1

  if markers == {'b'}:
    return 0

  return -1 #uncertain

In [None]:
# Root folder of the dataset and its two class sub-folders.
dataset_path = 'cyberbullying_data_clean'
cyberbullying_path = f'{dataset_path}/cyberbullying'
non_cyberbullying_path = f'{dataset_path}/non_cyberbullying'

# Entry names (not full paths) found in each class folder.
cyberbullying_images, non_cyberbullying_images = (
  os.listdir(folder) for folder in (cyberbullying_path, non_cyberbullying_path)
)

# Opening text part of every request; make_payload places it before the image.
prompt = 'Cyberbullying image content is identified by 5 factors.'



# Closing text part of every request; make_payload places it after the image.
# It asks for an answer per factor and then a final choice between "a. Yes" and
# "b. No", which response_text_to_label looks for in the reply.
CoT_component = """Please evaluate each of the 5 factors by answering the following questions: \n
1. If there is a person in the image, what is their body pose?
2. If there is a person in the image, what is their facial expression?
3. Are there any hand gestures in the image (Loser sign, middle finger, thumbs down, finger gun)?
4. Are there any threatening objects targeted at the viewer in the image, such as weapons or depictions of suicide?
5. Is the image relevant to current social issues (Anti-LGBT, racism)
State your answer for each factor and then evaluate:\n
Does the following image contain cyberbullying content?\na. Yes\nb. No"""




In [None]:
# Draw the evaluation subsets and persist them so later runs reuse the same images.
# The confirmation prompt guards against overwriting existing subset files by accident.
if input('Sure? Y/N ') == 'Y':

  SUBSET_SIZE = 100
  # A fixed seed plus a sorted population makes the draw reproducible
  # (os.listdir returns names in arbitrary order).
  subset_rng = random.Random(42)

  # Cap the size so a class with fewer images does not raise ValueError.
  cyberbullying_subset = subset_rng.sample(
      sorted(cyberbullying_images), min(SUBSET_SIZE, len(cyberbullying_images)))
  non_cyberbullying_subset = subset_rng.sample(
      sorted(non_cyberbullying_images), min(SUBSET_SIZE, len(non_cyberbullying_images)))

  with open('cyberbullying_subset.pkl', 'wb') as f:
    pickle.dump(cyberbullying_subset, f)

  with open('non_cyberbullying_subset.pkl', 'wb') as f:
    pickle.dump(non_cyberbullying_subset, f)

In [None]:
# Reload the persisted subsets; from here on the *_images names hold only the
# sampled file names. Context managers make sure both files are closed.
# NOTE: pickle.load can execute arbitrary code — only load subset files that
# were written by this notebook.
with open('cyberbullying_subset.pkl', 'rb') as f:
  cyberbullying_images = pickle.load(f)

with open('non_cyberbullying_subset.pkl', 'rb') as f:
  non_cyberbullying_images = pickle.load(f)

In [None]:
def make_payload(base64_image):
  """Build the chat-completions request body for a single image.

  The single user message carries three parts, in order: the module-level
  `prompt` text, the image as a base64 JPEG data URL (low detail), and the
  module-level `CoT_component` text.

  Args:
    base64_image: Base64 string of the image bytes.

  Returns:
    A dict with the keys "model", "messages" and "max_tokens".
  """
  intro_part = {'type': 'text', 'text': prompt}
  image_part = {
      'type': 'image_url',
      "image_url": {
          "url": f"data:image/jpeg;base64,{base64_image}",
          'detail': 'low',
      },
  }
  question_part = {'type': 'text', 'text': CoT_component}

  return {
      "model": "gpt-4-turbo",
      "messages": [
          {"role": "user", "content": [intro_part, image_part, question_part]},
      ],
      "max_tokens": 300,
  }

In [None]:
# Empty results table for the cyberbullying images.
d = {column: [] for column in ('path', 'true_label', 'prompt', 'response', 'label')}
df1 = pd.DataFrame(d)

In [None]:
# Query the model once per sampled cyberbullying image and record its verdict.
# Rows are collected in a list and added to df1 in one step, which avoids
# re-copying the whole frame on every iteration.
collected_rows = []
try:
  for image in tqdm(cyberbullying_images):
    try:
      image_path = f'{cyberbullying_path}/{image}'
      base64_image = encode_image(image_path)

      payload = make_payload(base64_image)

      # The timeout keeps one stalled request from hanging the whole run.
      response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=120)
      # Turn HTTP failures (bad key, rate limit, ...) into a descriptive error.
      response.raise_for_status()

      response_text = response.json()['choices'][0]['message']['content']
      label = response_text_to_label(response_text)

      collected_rows.append({'path': image_path, 'true_label': 1, 'prompt': prompt, 'response': response_text, 'label': label})

    except Exception as error:
      # Best effort: report the failing image and the cause, then move on.
      # Catching Exception rather than everything lets Ctrl-C stop the loop.
      print('Error: ')
      print(image)
      print(repr(error))
finally:
  # Keep whatever was gathered, even if the loop was interrupted.
  if collected_rows:
    df1 = pd.concat([df1, pd.DataFrame(collected_rows)], ignore_index=True)

In [None]:
# index=False keeps the row index out of the file, so reading the CSV back
# does not add an "Unnamed: 0" column.
df1.to_csv('/content/drive/MyDrive/CoT_exp_cyberbullying.csv', index=False) #change to CoT_from_feature_generation_cyberbullying.csv

In [None]:
# Empty results table for the non-cyberbullying images.
d = {column: [] for column in ('path', 'true_label', 'prompt', 'response', 'label')}
df = pd.DataFrame(d)

In [None]:
# Query the model once per sampled non-cyberbullying image and record its verdict.
# Rows are collected in a list and added to df in one step, which avoids
# re-copying the whole frame on every iteration.
collected_rows = []
try:
  for image in tqdm(non_cyberbullying_images):
    try:
      image_path = f'{non_cyberbullying_path}/{image}'
      base64_image = encode_image(image_path)

      payload = make_payload(base64_image)

      # The timeout keeps one stalled request from hanging the whole run.
      response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=120)
      # Turn HTTP failures (bad key, rate limit, ...) into a descriptive error.
      response.raise_for_status()

      response_text = response.json()['choices'][0]['message']['content']
      label = response_text_to_label(response_text)

      collected_rows.append({'path': image_path, 'true_label': 0, 'prompt': prompt, 'response': response_text, 'label': label})

    except Exception as error:
      # Best effort: report the failing image and the cause, then move on.
      # Catching Exception rather than everything lets Ctrl-C stop the loop.
      print('Error: ')
      print(image)
      print(repr(error))
finally:
  # Keep whatever was gathered, even if the loop was interrupted.
  if collected_rows:
    df = pd.concat([df, pd.DataFrame(collected_rows)], ignore_index=True)

In [None]:
# index=False keeps the row index out of the file, so reading the CSV back
# does not add an "Unnamed: 0" column.
df.to_csv('/content/drive/MyDrive/CoT_exp_non_cyberbullying.csv', index=False) #change to CoT_from_feature_generation_non_cyberbullying.csv

In [None]:
# Reload both result tables from Drive so the evaluation below can run on its
# own, without the request loops having been executed in this session.
df1 = pd.read_csv('/content/drive/MyDrive/CoT_exp_cyberbullying.csv')
df = pd.read_csv('/content/drive/MyDrive/CoT_exp_non_cyberbullying.csv')

In [None]:
# Keep only decisive predictions; -1 marks a reply with no recognisable answer.
cyberbullying_results = [value for value in df1["label"] if value != -1]
non_cyberbullying_results = [value for value in df["label"] if value != -1]

In [None]:
def get_stats(results, ground_truth):
  """Tally confusion-matrix counts for predictions that share one true label.

  Args:
    results: Sized iterable of predictions; a truthy value means "predicted positive".
    ground_truth: True label common to every prediction; truthy means positive.

  Returns:
    Tuple (number of predictions, tp, tn, fp, fn).
  """
  predicted_positive = sum(1 for prediction in results if prediction)
  predicted_negative = len(results) - predicted_positive

  # With a positive ground truth only tp/fn can occur; otherwise only fp/tn.
  if ground_truth:
    return len(results), predicted_positive, 0, 0, predicted_negative

  return len(results), 0, predicted_negative, predicted_positive, 0

In [None]:
# Counts for the cyberbullying images (true label 1), as (total, tp, tn, fp, fn).
a = get_stats(cyberbullying_results, 1)

In [None]:
# Counts for the non-cyberbullying images (true label 0), as (total, tp, tn, fp, fn).
b = get_stats(non_cyberbullying_results, 0)

In [None]:
# Element-wise sum of both count tuples: [total, tp, tn, fp, fn] over all images.
c = [count_a + count_b for count_a, count_b in zip(a, b)]

# NOTE(review): five results are moved by hand from false negatives (index 4)
# to true positives (index 1); the reason is not recorded here — confirm
# before relying on the metrics.
c[1] += 5
c[4] -= 5

In [None]:
# Pull the four confusion-matrix counts out of the combined list
# (index 0 holds the total and is not needed here).
tp, tn, fp, fn = c[1:5]

total_evaluated = tp + fn + fp + tn
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / total_evaluated

In [None]:
# Prints precision, recall and accuracy as fractions, in that order.
print(precision, recall, accuracy) # result noted by the author: 76% precision, 38.4% recall, 63.1% accuracy