### Install the model:
`! pip install ollama llama-index-llms-ollama`

`! sudo snap install ollama`

`! ollama pull llama3`

### After downloading the base model and writing the Modelfile, create the parametrised model for this task (the notebook below loads it under the name `ParametrisedLlama3`):
`! ollama create [model name] -f [modelfile name]`

In [1]:
from llama_index.llms.ollama import Ollama
from lib.dataset_utils import load_twitter_data_cleaned, load_goemotions_cleaned
from lib.plot_utils import *
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.metrics import accuracy_score, jaccard_score, f1_score, classification_report
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import re


### Loading Twitter

In [2]:
# Emotion labels offered to the model for the Twitter dataset.
twitter_emotions = ("joy", "sadness", "anger", "fear", "love", "surprise")
# The loader returns three values; only the third (the test split) is used here.
_, _, twitter_test = load_twitter_data_cleaned()

### Loading GoEmotions

In [3]:
import json

# JSON file whose values are used as the GoEmotions label names
# (presumably a label-id -> emotion-name mapping; verify against the dataset).
label_mapping_path = "./dataset/GoEmotionsSplit/label_mapping.json"
# The loader returns three values; only the third (the test split) is used here.
_, _, goemotions_test = load_goemotions_cleaned()
# Read through a context manager so the file handle is always closed.
with open(label_mapping_path, encoding="utf-8") as json1_file:
    json1_str = json1_file.read()
json1_data = json.loads(json1_str)
# Keep the labels as a tuple (like `twitter_emotions`). Wrapping the dict view
# in str() produced the literal text "dict_values([...])", which turned
# `label in emotions` into a substring test and leaked into the prompt.
goemotions_emotions = tuple(json1_data.values())

In [4]:
def flatten(xss):
    """Concatenate the items of every inner iterable of `xss` into one flat list."""
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

def accuracy(targets, predictions):
    """Return sklearn's `accuracy_score` of `predictions` against `targets`."""
    return accuracy_score(y_true=targets, y_pred=predictions)
def jaccard(targets, predictions):
    """Return the micro-averaged Jaccard score (0 where the score is undefined)."""
    return jaccard_score(
        y_true=targets,
        y_pred=predictions,
        average='micro',
        zero_division=0,
    )
def jaccard_samples(targets, predictions):
    """Return the sample-averaged Jaccard score (0 where the score is undefined)."""
    return jaccard_score(
        y_true=targets,
        y_pred=predictions,
        average='samples',
        zero_division=0,
    )
def f1(targets, predictions):
    """Return the macro-averaged F1 score (0 where the score is undefined)."""
    return f1_score(
        y_true=targets,
        y_pred=predictions,
        average='macro',
        zero_division=0,
    )
def f1_micro(targets, predictions):
    """Return the micro-averaged F1 score (0 where the score is undefined)."""
    return f1_score(
        y_true=targets,
        y_pred=predictions,
        average='micro',
        zero_division=0,
    )

# Metric registry: display name -> callable(targets, predictions).
SCORES = dict(
    accuracy=accuracy,
    jaccard=jaccard,
    jaccard_samples=jaccard_samples,
    f1=f1,
    f1_micro=f1_micro,
)

# Few-shot examples appended to the system prompt. The answer must be valid
# JSON, because the model is asked to reply in that same format: the comma
# between the two entries is required.
SAMPLES = """ 
text: 1. i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived
2. im updating my blog because i feel shitty

answer:{
    "1": "joy",
    "2": "sadness"
    }
"""
# System instruction for single-label classification; the list of allowed
# emotions is appended right after it.
SINGLE_BASE_PROMPT = """<|start_header_id|>system<|end_header_id|> Classify the sentences. Choose ONLY ONE EMOTION among the following: """

# System instruction for multi-label classification (up to three emotions).
MULTI_BASE_PROMPT = """<|start_header_id|>system<|end_header_id|> Classify the sentences. Choose a maximum of three emotions among the following: """

# Introduces the few-shot examples inside the prompt.
SAMPLES_STRING = """Here are some samples:"""

# Llama 3 end-of-turn token, used to close each section of the prompt.
TERMINATOR_STRING = """<|eot_id|>"""

# Stand-alone example input, handy for manual experiments with the model.
CUSTOM_PROMPT = "1. i don t follow too many people and i don t have too many followers however i have a feeling that the people that i am talking about may know who they are i m not trying to be rude i m just being real"

In [5]:
class Llama3():
    """Prompt-based emotion classifier backed by a local Ollama model.

    Texts are sent to the model in numbered batches. The model is expected to
    answer with a JSON object that maps each number to an emotion (or, in
    multi-label mode, to a list of emotions).
    """

    def __init__(self, name, timeout = 1000.0, scores=None, json_mode =True):
        """Create the Ollama client.

        name: name of the Ollama model to query.
        timeout: forwarded to Ollama as `request_timeout`.
        scores: mapping `metric name -> callable(targets, predictions)` used by
            `evaluate`; defaults to an empty mapping.
        json_mode: forwarded to Ollama as `json_mode`.
        """
        self.model = Ollama(model=name, request_timeout=timeout, json_mode = json_mode)
        # Build the default per instance: a `{}` default argument would be
        # shared (and mutated) across every instance.
        self.scores = {} if scores is None else scores

    def predict(self, emotions, test, samples = None, batch_dim = 8, single_label = True, progress_bar = False):
        """Classify every text of `test`, then evaluate and print the scores.

        emotions: emotions the model may choose from.
        test: dataset yielding `(text, target)` pairs and exposing `.targets`.
        samples: few-shot examples to show in the prompt (optional).
        batch_dim: number of texts sent to the model per request.
        single_label: one emotion per text if True, several otherwise.
        progress_bar: show a tqdm progress bar over the batches.

        Returns `(predictions, results)`, where `results` is the dict produced
        by `evaluate`.
        """
        test_loader = DataLoader(test, batch_size = batch_dim, shuffle = False)
        # Built once: the instructions are identical for every batch.
        base_prompt = self.generate_base_prompt(emotions, samples, single_label)
        predictions = []
        for data in tqdm(test_loader, disable=not progress_bar):
            texts = data[0]
            batch_prompt = self.add_test_data_to_prompt(base_prompt, texts)
            # Use the real batch size: the last batch may hold fewer than
            # `batch_dim` texts and must not be padded up to `batch_dim`.
            predictions.extend(self.classify_batch(batch_prompt, emotions, len(texts)))
        if len(predictions) != len(test):
            print(f"Error: predictions and test data do not match: pred: {len(predictions)} vs test:{len(test)}")

        results = self.evaluate(test.targets, predictions, self.scores, emotions, single_label)
        print(results)
        return predictions, results

    def generate_base_prompt(self, emotions, samples, single_label = True):
        """Return the instruction part of the prompt (task, emotions, optional samples)."""
        instruction = SINGLE_BASE_PROMPT if single_label else MULTI_BASE_PROMPT
        prompt = instruction + str(emotions)
        if samples:
            prompt += SAMPLES_STRING + samples
        return prompt + TERMINATOR_STRING

    def add_test_data_to_prompt(self, prompt, test):
        """Append the texts to classify, one numbered line each, to `prompt`."""
        # Numbering starts at 1 so that it matches the few-shot samples, which
        # label their sentences "1." and "2.".
        numbered = ''.join(f"{index}. {row}\n" for index, row in enumerate(test, start=1))
        return prompt + numbered + TERMINATOR_STRING

    def classify_batch(self, prompt, emotions, batch_dim):
        """Query the model and return exactly `batch_dim` predictions, in answer order."""
        # The response is expected to be formatted as JSON.
        response = self.model.complete(prompt).text
        predictions = self.extract_emotions(response, emotions, batch_dim)
        return list(predictions.values())

    def evaluate(self, targets, predictions, scores, emotions, single_label = True):
        """Score `predictions` against `targets`, print a report and draw the plots.

        targets: binary indicator matrix with one column per emotion.
            NOTE(review): the columns are assumed to follow the order of
            `emotions`, as `target_names=emotions` below already implies.
            Confirm against the dataset loader.
        predictions: one entry per text, either a label or a list of labels.
        scores: mapping `metric name -> callable(targets, predictions)`.
        emotions: allowed emotions; fixes the columns of the prediction matrix.
        single_label: keep at most one label per text if True.

        Returns a dict `metric name -> value`.
        """
        allowed = list(emotions)
        label_sets = []
        for prediction in predictions:
            labels = [prediction] if isinstance(prediction, str) else list(prediction)
            # 'other' and any label outside `emotions` contribute no column.
            labels = [label for label in labels if label in allowed]
            label_sets.append(labels[:1] if single_label else labels)
        # Fixing the classes keeps one column per emotion, in the order of
        # `emotions`, even when some emotion is never predicted. Fitting on the
        # predictions instead gave alphabetically ordered columns whose number
        # depended on what the model happened to answer.
        binarizer = MultiLabelBinarizer(classes=allowed)
        bin_predictions = pd.DataFrame(binarizer.fit_transform(label_sets), columns=allowed)
        results = {name: score(targets, bin_predictions) for name, score in scores.items()}
        plot_score_barplot(targets, bin_predictions, emotions)
        print(classification_report(targets, bin_predictions, target_names=allowed))
        if not single_label:
            plot_multilabel_confusion_heatmap(targets, bin_predictions, emotions)
        return results

    def clean_response(self, response):
        """Strip text before the first '{' and anything after the first ',' on each line."""
        start = response.find('{')
        if start != -1:  # a failed find() returns -1, which would keep only the last character
            response = response[start:]
        cleaned_lines = []
        for line in response.splitlines():
            # Drops the "comments" the model sometimes adds after an entry.
            head, sep, _ = line.partition(',')
            cleaned_lines.append(head + sep + '\n')
        return ''.join(cleaned_lines)

    def _parse_json_object(self, answers):
        """Return the JSON object found in `answers`, or {} when none can be parsed."""
        # Try the raw text first: the cleaner would truncate a valid answer
        # that is written on a single line.
        for candidate in (answers, self.clean_response(answers)):
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed
        print("Warning: model response is not a JSON object, batch marked as 'other'")
        return {}

    @staticmethod
    def _normalise_label(value, allowed):
        """Map one answer value to an allowed label, a list of allowed labels, or 'other'."""
        if isinstance(value, str):
            return value if value in allowed else 'other'
        if isinstance(value, list):
            # Multi-label answer: keep only the emotions that are allowed.
            return [label for label in value if isinstance(label, str) and label in allowed]
        return 'other'

    def extract_emotions(self, answers, emotions, batch_dim):
        """Parse the model response into exactly `batch_dim` predictions.

        Returns a dict in answer order. Extra answers are dropped, missing
        ones are padded with 'other', and values outside `emotions` are
        replaced by 'other'. A response that cannot be parsed yields only
        'other', so one bad reply does not abort a long run.
        """
        allowed = list(emotions)
        answers_dict = self._parse_json_object(answers)
        cleaned = {
            key: self._normalise_label(value, allowed)
            for key, value in list(answers_dict.items())[:batch_dim]
        }
        # Padding keys are ints, so they never clash with the string keys of
        # the JSON object.
        cleaned.update({i: 'other' for i in range(batch_dim - len(cleaned))})
        return cleaned


class Llama_EmotionsData(Dataset):
    """Dataset of `(text, target)` pairs built from a dataframe.

    The dataframe must hold a 'text' column; every other column is stacked
    into the `targets` array (one row per text).
    """

    def __init__(self, dataframe) -> None:
        self.text = dataframe['text']
        self.targets = dataframe.drop(columns=['text']).to_numpy()

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional access: `self.text[index]` is label-based, so it fails or
        # returns the wrong row when the dataframe index is not 0..n-1 (for
        # example after filtering or splitting), while `targets` is always
        # positional.
        return self.text.iloc[index], self.targets[index]

# Twitter 2-Shot

In [6]:
# Wrap the Twitter test split and run the 2-shot, single-label evaluation.
twitter_test_dataset = Llama_EmotionsData(twitter_test)
llama3 = Llama3("ParametrisedLlama3", json_mode=True, scores=SCORES)
predictions, results = llama3.predict(
    twitter_emotions,
    twitter_test_dataset,
    samples=SAMPLES,
    single_label=True,
    progress_bar=True,
)

  0%|          | 1/250 [01:21<5:36:25, 81.06s/it]


KeyboardInterrupt: 