In [1]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/2024_Unstructured_Data/

/content/drive/MyDrive/2024_Unstructured_Data


# Environment Setup

In [3]:
# Install the deep-learning stack at pinned versions (tensorboard is left unpinned).
!pip install -q -U "torch==2.1.2" tensorboard
!pip install -q -U "transformers==4.36.2" "datasets==2.16.1" "accelerate==0.26.1" "bitsandbytes==0.42.0"

In [4]:
# Install/upgrade the Hugging Face Hub client together with its CLI extra.
# NOTE(review): unlike the other installs, this one is not version-pinned.
!pip install -U "huggingface_hub[cli]"



In [5]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Report "UTF-8" as the preferred encoding, whatever ``do_setlocale`` is."""
    return "UTF-8"


# Swap the stdlib lookup for the stub above so every later caller sees UTF-8.
# NOTE(review): presumably a workaround for `!` shell commands failing in Colab
# when the detected locale is not UTF-8 -- confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

In [6]:
# Install TRL from GitHub at a fixed commit hash.
!pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [7]:
# Authenticate with the Hugging Face Hub.
# The token is taken from the HF_TOKEN environment variable, falling back to an
# interactive prompt, so the secret is never written into the notebook source.
import os
from getpass import getpass

from huggingface_hub import login

login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Library Import & Load Model

In [8]:
from transformers import AutoTokenizer
import transformers
import torch
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from trl import setup_chat_format


In [9]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load Dataset & Prepare Training

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [11]:
# Load the misclassification cases for the mixed-emotion set.
# Alternative input file: "./data/depression_missclassification_cases.csv"
df_miss = pd.read_csv("./data/mixed_emotion_set_missclassification_cases.csv")
df_miss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   text               22 non-null     object 
 1   label              22 non-null     object 
 2   prediction         22 non-null     object 
 3   max_prob           22 non-null     float64
 4   prob_distribution  22 non-null     object 
dtypes: float64(1), object(4)
memory usage: 1008.0+ bytes


In [12]:
# Preview the first rows to check the loaded columns.
df_miss.head()

Unnamed: 0,text,label,prediction,max_prob,prob_distribution
0,I am a housewife in my 20s. Through early marr...,Depression,Happy,0.903991,"[0.09373518079519272, 0.002274321625009179, 0...."
1,Despite my successful career and the admiratio...,Depression,Happy,0.576257,"[0.41973501443862915, 0.004007911309599876, 0...."
2,Mom and Dad had another big fight about who ge...,Depression,Happy,0.609991,"[0.38730886578559875, 0.0027002866845577955, 0..."
3,Despite my professional success and recognitio...,Neutral,Happy,0.960916,"[0.0369863286614418, 0.002097291639074683, 0.9..."
4,As a teenager with a passion for music and dre...,Neutral,Happy,0.861305,"[0.13400466740131378, 0.004690449219197035, 0...."


In [13]:
# Prompt inputs: the raw text and the label *predicted* by the classifier
# (the "prediction" column, not the ground-truth "label" column).
X_train = df_miss["text"]
y_train = df_miss["prediction"]

# LLM Load

In [None]:
def load_LLM_huggingface(model_name, device, compute_dtype, quantize=False):
    """Load a causal language model and its tokenizer from the Hugging Face Hub.

    Args:
        model_name: Hub identifier of the checkpoint to load.
        device: Passed to ``from_pretrained`` as ``device_map``.
        compute_dtype: Passed as ``torch_dtype``; also used as the 4-bit compute
            dtype when ``quantize`` is true.
        quantize: When true, load the weights with 4-bit NF4 double quantization
            (bitsandbytes). Defaults to ``False``, i.e. the model is loaded
            unquantized in ``compute_dtype``.

    Returns:
        list: ``[tokenizer, model]``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse EOS as the padding token and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    model_kwargs = {"torch_dtype": compute_dtype, "device_map": device}
    if quantize:
        # Built only on request: the config was previously created on every call
        # but never handed to the model.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=True,
        )

    model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
    return [tokenizer, model]
# Run on the first CUDA device when one is available, otherwise on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

models = [
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Meta-Llama-3-8B-Instruct",
]  # Need login
compute_dtype = torch.float16

# Each entry is a [tokenizer, model] pair.
LLaMA2 = load_LLM_huggingface(models[0], device, compute_dtype)
LLaMA3 = load_LLM_huggingface(models[1], device, compute_dtype)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Inference

In [None]:
from tqdm.auto import tqdm

In [None]:
def LLaMA3_Chat_completion(model, data, label, requests):
    """Run a scripted multi-turn chat and return the whole transcript.

    Args:
        model: ``[tokenizer, model]`` pair; index 0 is the tokenizer and
            index 1 the language model.
        data: Text sample inserted into the data prompt.
        label: AI-predicted label inserted into the data prompt.
        requests: Prompt script laid out as
            ``[system, intro, question_1, ..., question_n, build_data_prompt]``,
            where ``build_data_prompt(data, label)`` returns the user message
            that carries the sample.

    Returns:
        list[dict]: Chat messages (``{"role": ..., "content": ...}``) in order:
        system, intro, reply, data prompt, reply, then one user/assistant pair
        per question.
    """
    tokenizer, llm = model[0], model[1]

    messages = [
        {"role": "system", "content": requests[0]},
        {"role": "user", "content": requests[1]},
    ]

    # Stop on EOS, plus the Llama-3 end-of-turn token when the tokenizer really
    # has it. For tokenizers without "<|eot_id|>" the lookup returns the unk id
    # (or None), which must not be treated as a stop token.
    terminators = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if (
        eot_id is not None
        and eot_id != getattr(tokenizer, "unk_token_id", None)
        and eot_id not in terminators
    ):
        terminators.append(eot_id)

    def chatting(role, content):
        """Append one turn to the running conversation."""
        messages.append({"role": role, "content": content})

    def reply():
        """Generate the assistant's answer to the conversation so far."""
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(llm.device)

        outputs = llm.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
        # Keep only the newly generated tokens (drop the echoed prompt).
        response = outputs[0][input_ids.shape[-1]:]
        return tokenizer.decode(response, skip_special_tokens=True)

    # First step: let the model answer the intro, then send the sample itself.
    chatting("assistant", reply())
    chatting("user", requests[-1](data, label))
    chatting("assistant", reply())

    # Follow-up questions, one user/assistant exchange each.
    for question in requests[2:-1]:
        chatting("user", question)
        chatting("assistant", reply())
    return messages

In [None]:
def LLaMA3_parser(messages):
    """Collect the assistant replies found after the first five messages.

    Args:
        messages: Chat transcript as a list of ``{"role", "content"}`` dicts.

    Returns:
        list: ``content`` of every message from index 5 onward whose role is
        ``"assistant"``, in order.
    """
    return [
        turn["content"]
        for turn in messages[5:]
        if turn["role"] == "assistant"
    ]

In [None]:
def question_func(text, label):
    """Build the user message carrying one sample and its AI-predicted label."""
    return f"Text: {text}\n\nAI_Predicted_label: {label}"


# Prompts shared by every version. The line break and indentation inside the
# intro string are kept exactly as before so the text sent to the models is
# unchanged.
_INTRO_PROMPT = """We'll send you the text data and the label classified by the corresponding AI analysis. The data is a mix of one or more of the three emotions: 'Depression', 'Neutral', and 'Happy'.
                Using this data and the labels, please answer my questions"""
_PERCENTAGE_PROMPT = """Finally, please provide the percentages of emotions in the aforementioned data in the order of 'depressed', 'neutral', and 'happy'. No other explanation is needed, just the percentages separated by commas."""

# Each script is [system role, intro, question 1, question 2, question 3, data-prompt builder].
# Every script must contain exactly three questions: the downstream code reads
# three answers per sample. (Versions 2 and 3 previously lacked a comma between
# their first two questions, which fused them into a single string.)
version_request = [
    # Version 1: [yes or no / objective analysis / percentage]
    [
        "You are a psychologist.",  # system role
        _INTRO_PROMPT,
        "Is the label analyzed by AI right? Please answer as concisely as possible.",
        "I see, please even make an objective analysis of that.",
        _PERCENTAGE_PROMPT,
        question_func,
    ],
    # Version 2: [objective analysis / yes or no / percentage]
    [
        "You are a psychologist.",  # system role
        _INTRO_PROMPT,
        "Perform an objective analysis of that data",
        "Does your analysis match the labels predicted by the AI?",
        _PERCENTAGE_PROMPT,
        question_func,
    ],
    # Version 3: same questions as version 2, with a different system role
    [
        "You're a psychotherapist. Help me analyze",  # system role
        _INTRO_PROMPT,
        "Perform an objective analysis of that data",
        "Does your analysis match the labels predicted by the AI?",
        _PERCENTAGE_PROMPT,
        question_func,
    ],
]

In [None]:
# Version running
import os

# Create the output folder up front so a missing directory cannot discard the
# results after the (slow) generation has finished.
os.makedirs("./data/log", exist_ok=True)


def _answer_columns(prefix, parsed_answers, n_questions):
    """Turn per-sample answer lists into ``{f"{prefix}_{k}": column}`` (k starts at 1)."""
    return {
        f"{prefix}_{k + 1}": [answers[k] for answers in parsed_answers]
        for k in range(n_questions)
    }


for version, requests in enumerate(version_request, start=1):
    # Number of follow-up questions == number of parsed answers per sample.
    n_questions = len(requests[2:-1])

    llama2_answers = []
    llama3_answers = []
    for X, y in tqdm(zip(X_train.values, y_train), total=len(X_train), desc = "Generate Reasoning"):
        llama2_answers.append(LLaMA3_parser(LLaMA3_Chat_completion(LLaMA2, X, y, requests)))
        llama3_answers.append(LLaMA3_parser(LLaMA3_Chat_completion(LLaMA3, X, y, requests)))

    # One row per sample: input text, true label, classifier prediction ...
    df1 = pd.concat([X_train.reset_index(drop=True), df_miss[["label", "prediction"]]], axis = 1)
    # ... followed by one column per model and question. No placeholder rows
    # are inserted, so every column has exactly one entry per sample.
    df2 = pd.DataFrame({
        **_answer_columns("LLaMA2", llama2_answers, n_questions),
        **_answer_columns("LLaMA3", llama3_answers, n_questions),
    })
    df = pd.concat([df1, df2], axis = 1)
    df.to_csv(f"./data/log/result_ver3_0524_mixed_miscalssification_ver{version}_real.csv",index = False)