In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from sklearn.metrics import normalized_mutual_info_score
from huggingface_hub import login

import asyncio
import tensorflow as tf
import json
from hugchat import hugchat

# Read the Hugging Face token from the environment (load_dotenv is expected to
# populate it from a local .env file) and authenticate with the Hub.
# NOTE(review): os.getenv returns None when "LLAMA" is unset — confirm how
# login() behaves in that case.
load_dotenv()
LLAMA = os.getenv("LLAMA")
login(token=LLAMA)

# Load the HuggingChat session cookies from `cookies.json`, which is looked up
# in the parent directory of the current working directory.
base_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
json_file_path = os.path.join(base_path, 'cookies.json')
with open(json_file_path, 'r') as file:
    cookies = json.load(file)


# The dataset is read relative to the current working directory.
seven_newsgroups_data = pd.read_csv('fetch_7newsgroups.csv')

def split_train_test(data):
    """Split the dataset into a test set and one example document per label.

    Rows containing any missing value are dropped first. The remaining rows
    are split 95% / 5% with a fixed random seed; from the training part only
    the first document of every ``label`` group is kept (these serve as the
    in-prompt examples).

    Args:
        data: DataFrame with at least a ``label`` column.

    Returns:
        A tuple ``(test, train_first_doc)`` where ``test`` is the 5% held-out
        frame and ``train_first_doc`` holds one row per label. Both carry a
        ``div`` column (``'test'`` / ``'train'``).
    """
    data = data.dropna(how="any")
    train, test = train_test_split(data, test_size=0.05, random_state=42)

    train = train.sort_values(by='label')
    # groupby(...).first() yields one row per label; reset_index turns the
    # group key back into a regular `label` column.
    train_first_doc = train.groupby('label').first().reset_index()

    train_first_doc['div'] = 'train'
    # `test` is derived from `data` by indexing, so writing a column into it
    # in place can raise SettingWithCopyWarning. assign() returns a new frame
    # that is safe to mutate later (the notebook adds a `name` column to it).
    test = test.assign(div='test')

    return test, train_first_doc

# test_df: held-out documents to classify; instruction_df: one example per label.
test_df, instruction_df = split_train_test(seven_newsgroups_data)
# Give every test document a 1-based running id; it is used as the prompt key
# and later to join model answers back to the correct label.
test_df["name"] = range(1, len(test_df) + 1)
# Sanity checks: one sample row and the sizes of each split.
print(test_df.iloc[5])
print("test Length: ",len(test_df))
print("instruction Length: ", len(instruction_df))
print("original length: ", len(seven_newsgroups_data))


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0                                                    5545
text             From: sp1henhj@edit (Henrik Balthazar Hjort)\n...
label20                                                          5
labels_name20                                       comp.windows.x
label                                                   Technology
div                                                           test
name                                                             6
Name: 5545, dtype: object
test Length:  893
instruction Length:  7
original length:  18846


In [None]:
def make_prompts(bbc_instructions, bbc_data):
    """Build one few-shot classification prompt per document.

    Every prompt consists of a general instruction, the list of topics, one
    worked example per row of ``bbc_instructions`` and finally the text that
    has to be classified.

    Args:
        bbc_instructions: DataFrame with ``text`` and ``label`` columns; each
            row becomes one in-prompt example.
        bbc_data: DataFrame with ``text`` and ``name`` columns; each row
            yields one prompt.

    Returns:
        A list of single-entry dicts ``{str(name): prompt}``, in the row
        order of ``bbc_data``.
    """
    general_instruction = (
        "You are a perfect topic modeling machine. Given a text and the different topics, "
        "you will classify the texts to the correct topic. First you will receive the topics, "
        "afterwards an example and finally the text you have to assign one of the before mentioned topics to."
    )
    topics = "The topics are sport, technology, politics, vehicles, religion, miscellaneous and science. Please make sure, you know the topics and their meaning."
    transition_to_examples = "Now an example for each of the categories will follow."
    transition_to_text_to_classify = (
        "Now the text, you have to classify will follow. Please assess its topic and answer only the topic of it."
    )

    # The example section is the same for every prompt, so it is assembled
    # once here instead of being rebuilt for each document.
    examples = "".join(
        f"For the following text: \n{example_text}\nThe correct answer would be: {category}\n"
        for example_text, category in zip(bbc_instructions['text'], bbc_instructions['label'])
    )
    prompt_head = (
        general_instruction + "\n" + topics + "\n" + transition_to_examples + "\n"
        + examples
        + transition_to_text_to_classify + "\n"
    )

    return [
        {f"{name}": prompt_head + text_to_classify + "\n"}
        for name, text_to_classify in zip(bbc_data['name'], bbc_data['text'])
    ]

# One prompt per test document, each embedding the per-label example rows.
prompts = make_prompts(instruction_df, test_df)
print(len(prompts))

893


In [None]:
# Shared by every call so that the cap of 5 concurrent requests really
# applies; a semaphore created inside the coroutine would be private to that
# single call and never limit anything.
_HUGGINGCHAT_SEMAPHORE = asyncio.Semaphore(5)


async def huggingchat(prompt):
    """Send a single prompt to HuggingChat and return its answer.

    Args:
        prompt: Single-entry dict ``{name: prompt_text}``.

    Returns:
        ``{name: answer_text}``; the answer is an empty string when the text
        of the response could not be read.
    """
    async with _HUGGINGCHAT_SEMAPHORE:
        chatbot = hugchat.ChatBot(cookies)
        name = list(prompt.keys())[0]
        prompt_text = prompt[name]
        # Use a fresh conversation so earlier prompts do not leak into this one.
        conversation_id = chatbot.new_conversation()
        chatbot.change_conversation(conversation_id)

        # NOTE(review): this call is synchronous and therefore blocks the
        # event loop while the request is running.
        answer_from_chatbot = chatbot.chat(prompt_text)
        try:
            answer: str = answer_from_chatbot['text']
        except Exception:
            # Best effort: an unreadable response counts as "no answer".
            # Unlike a bare `except`, this lets KeyboardInterrupt and task
            # cancellation propagate.
            answer = ""
        return {name: answer}

In [None]:

# Making sure the prompting rate does not exceed the rate limit of the API.
async def prompt_orchestrator(prompts, batch_size=5, max_retries=5,
                              prompt_delay=6, batch_delay=8, retry_delay=15):
    """Send all prompts to HuggingChat sequentially, throttled and with retries.

    Prompts are processed in batches. After every prompt the coroutine sleeps
    ``prompt_delay`` seconds and after every batch ``batch_delay`` seconds.
    When a batch raises, it is retried from its first prompt after
    ``retry_delay`` seconds; answers of earlier, completed batches are kept.

    Args:
        prompts: List of single-entry dicts ``{name: prompt_text}``.
        batch_size: Number of prompts per batch.
        max_retries: Maximum number of consecutive failed attempts before
            giving up. The counter is reset after every successful batch.
        prompt_delay: Seconds to wait after each prompt.
        batch_delay: Seconds to wait after each batch.
        retry_delay: Seconds to wait before retrying a failed batch.

    Returns:
        List of ``{name: answer}`` dicts for all batches that completed. It is
        shorter than ``prompts`` when the retries were exhausted, and empty
        for an empty ``prompts`` list.
    """
    results = []
    # Slicing already yields a shorter final batch when len(prompts) is not a
    # multiple of batch_size, and no batch at all for empty input.
    batches = [prompts[i:i + batch_size] for i in range(0, len(prompts), batch_size)]

    retry_count = 0
    current_batch_index = 0

    while current_batch_index < len(batches):
        batch = batches[current_batch_index]
        try:
            # Collected separately so that a failure in the middle of a batch
            # does not leave partial results behind before the retry.
            batch_results = []
            for prompt in batch:
                response = await huggingchat(prompt)
                batch_results.append(response)
                await asyncio.sleep(prompt_delay)
        except Exception as e:
            retry_count += 1
            if retry_count > max_retries:
                print(f"Failed after {max_retries} retries. Returning the accumulated results.")
                return results
            print(f"An error occurred: {e}. Retrying in {retry_delay} seconds... (Attempt {retry_count}/{max_retries})")
            await asyncio.sleep(retry_delay)
            continue

        results.extend(batch_results)
        await asyncio.sleep(batch_delay)
        # Report the real number of answers (the last batch may be partial).
        print(f"{len(results)} prompts processed.")
        current_batch_index += 1
        retry_count = 0

    return results

# Top-level `await` is only valid where the environment supports it (such as a
# notebook cell); a plain Python script would need asyncio.run(...).
results = await prompt_orchestrator(prompts)

print(results)

In [5]:
def add_correct_anser(results, test_df):
    """Pair every model answer with the correct label of its document.

    The raw answer is normalised by removing newlines, spaces and ``*``
    characters and lower-casing it. The correct label is looked up in
    ``test_df`` via ``name == int(key)`` and lower-cased as well.

    Args:
        results: List of dicts ``{name: raw_answer}`` where ``name`` is the
            string form of an integer document id.
        test_df: DataFrame with ``name`` and ``label`` columns.

    Returns:
        List of single-entry dicts ``{name: (answer, label)}``. When no row
        matches, ``label`` is ``None``. Documents whose label is a float
        (i.e. NaN) are reported on stdout and left out.
    """
    # Build the name -> label lookup once instead of scanning the whole frame
    # for every answer. setdefault keeps the FIRST row when a name is repeated.
    label_by_name = {}
    for name, label in zip(test_df['name'].tolist(), test_df['label'].tolist()):
        label_by_name.setdefault(name, label)

    result_with_correct_answer = []

    for result in results:
        for key, value in result.items():
            value = value.replace("\n", "").replace(" ", "").replace("*", "").lower()
            lookup_key = int(key)

            if lookup_key not in label_by_name:
                print(f"No matching category found for key: {key}")
                result_with_correct_answer.append({key: (value, None)})
                continue

            category_value = label_by_name[lookup_key]
            # A missing label shows up as a float NaN; isinstance also covers
            # numpy float scalars, which an exact type comparison would miss.
            if isinstance(category_value, float):
                print("Nan value", category_value)
                continue

            result_with_correct_answer.append({key: (value, category_value.lower())})

    return result_with_correct_answer
# Attach the correct label to every model answer and compare the number of
# paired entries with the number of raw answers that were passed in.
results_with_correct_answer = add_correct_anser(results, test_df)
print(len(results_with_correct_answer))
print(results_with_correct_answer)
print("to forward",len(results))

893
[{'1': ('miscellaneous', 'vehicles')}, {'2': ('religion', 'religion')}, {'3': ('politics', 'science')}, {'4': ('miscellaneous', 'religion')}, {'5': ('thetopicofthistextis:technology', 'technology')}, {'6': ('science', 'science')}, {'7': ('science', 'technology')}, {'8': ('vehicles', 'miscellaneous')}, {'9': ('thecorrectanswerwouldbe:politics', 'politics')}, {'10': ('technology', 'technology')}, {'11': ('thetopicofthistextis:sport', 'sport')}, {'12': ('politics', 'religion')}, {'13': ('technology', 'religion')}, {'14': ('technology', 'science')}, {'15': ('thecorrectanswerwouldbe:technology', 'science')}, {'16': ('technology', 'technology')}, {'17': ('vehicles', 'vehicles')}, {'18': ('politics', 'science')}, {'19': ('miscellaneous', 'science')}, {'20': ('science', 'politics')}, {'21': ('thecorrectansweris:vehicles', 'miscellaneous')}, {'22': ('politics', 'politics')}, {'23': ('politics', 'politics')}, {'24': ('science', 'science')}, {'25': ('thecorrectanswerwouldbe:miscellaneous', 's

In [None]:
#Matching the results with the correct answers
import difflib
from thefuzz import fuzz
def post_process(results,
                 output_path=r"E:\uni\NLP group project\llama_7newsgroups.csv",
                 threshold=60):
    """Map free-text model answers onto the seven known topic names.

    Every answer is compared with each topic via ``fuzz.partial_ratio``; the
    best-scoring topic is accepted when its score is strictly greater than
    ``threshold``, otherwise the prediction becomes an empty string. The
    prediction is also written to the ``result`` column of the module-level
    ``test_df``, which is finally saved as CSV.

    Args:
        results: List of single-entry dicts ``{name: (answer, label)}``.
        output_path: Destination of the CSV export of ``test_df``.
        threshold: Minimum (exclusive) similarity score for a match.

    Returns:
        List of single-entry dicts ``{name: (matched_topic_or_empty, label)}``.
    """
    string_list = ["religion", "politics", "sport", "technology", "vehicles", "science", "miscellaneous"]

    # Create the result COLUMN. (`test_df.loc['result'] = None` would instead
    # append a row labelled 'result' to the frame.)
    test_df['result'] = None

    final_results = []
    for result in results:
        key = list(result.keys())[0]
        answer, correct_label = list(result.values())[0]

        # Tuples compare by score first and topic name second, so ties are
        # resolved towards the alphabetically larger topic name.
        best_score, best_topic = max(
            (fuzz.partial_ratio(answer, topic), topic) for topic in string_list
        )
        prediction = best_topic if best_score > threshold else ""

        final_results.append({key: (prediction, correct_label)})
        # Keys are strings while `name` holds integers; without the cast the
        # mask never matches and no prediction would be stored.
        test_df.loc[test_df['name'] == int(key), 'result'] = prediction

    test_df.to_csv(output_path, index=False)
    return final_results
# Keep a reference to the un-matched answers for inspection, then rebind
# `results_with_correct_answer` to the fuzzy-matched version.
# NOTE(review): because the name is rebound, running this cell a second time
# post-processes the already post-processed results.
result_with_correct_answer_pre_post_process = results_with_correct_answer
print(result_with_correct_answer_pre_post_process)
results_with_correct_answer = post_process(results_with_correct_answer)
print(results_with_correct_answer)

[{'1': ('miscellaneous', 'vehicles')}, {'2': ('religion', 'religion')}, {'3': ('politics', 'science')}, {'4': ('miscellaneous', 'religion')}, {'5': ('thetopicofthistextis:technology', 'technology')}, {'6': ('science', 'science')}, {'7': ('science', 'technology')}, {'8': ('vehicles', 'miscellaneous')}, {'9': ('thecorrectanswerwouldbe:politics', 'politics')}, {'10': ('technology', 'technology')}, {'11': ('thetopicofthistextis:sport', 'sport')}, {'12': ('politics', 'religion')}, {'13': ('technology', 'religion')}, {'14': ('technology', 'science')}, {'15': ('thecorrectanswerwouldbe:technology', 'science')}, {'16': ('technology', 'technology')}, {'17': ('vehicles', 'vehicles')}, {'18': ('politics', 'science')}, {'19': ('miscellaneous', 'science')}, {'20': ('science', 'politics')}, {'21': ('thecorrectansweris:vehicles', 'miscellaneous')}, {'22': ('politics', 'politics')}, {'23': ('politics', 'politics')}, {'24': ('science', 'science')}, {'25': ('thecorrectanswerwouldbe:miscellaneous', 'scien

In [10]:
def extract_ground_truth_and_predictions(results_with_correct_answer):
    """Split the paired results into two parallel lists.

    Args:
        results_with_correct_answer: Iterable of dicts whose values are
            ``(prediction, correct_label)`` pairs.

    Returns:
        ``(ground_truth, predictions)`` in the iteration order of the input.
    """
    pairs = [
        pair
        for entry in results_with_correct_answer
        for pair in entry.values()
    ]
    predictions = [pair[0] for pair in pairs]
    ground_truth = [pair[1] for pair in pairs]
    return ground_truth, predictions
# Parallel lists of correct labels and predictions, consumed by the metric cells.
ground_truth, predictions = extract_ground_truth_and_predictions(results_with_correct_answer)


In [11]:
def calculate_NMI(ground_truth, predictions):
    """Print and return the normalized mutual information of two labelings.

    Args:
        ground_truth: Sequence of correct labels.
        predictions: Sequence of predicted labels, aligned with ``ground_truth``.

    Returns:
        The value computed by ``normalized_mutual_info_score``.
    """
    score = normalized_mutual_info_score(ground_truth, predictions)
    message = f"Normalized Mutual Information Score: {score}"
    print(message)
    return score


In [None]:
import numpy as np
from collections import Counter

def calculate_purity(predicted_labels, true_labels):
    """Print and return the purity of a predicted labeling.

    Every distinct predicted label is treated as a cluster. Purity is the sum
    over all clusters of the size of the most frequent true label inside the
    cluster, divided by the total number of instances.

    Args:
        predicted_labels: Sequence of predicted labels.
        true_labels: Sequence of correct labels, aligned with
            ``predicted_labels``.

    Returns:
        Purity as a float.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # Same guard as calculate_accuracy: misaligned inputs would otherwise
    # fail with an obscure IndexError or silently give a wrong score.
    if len(predicted_labels) != len(true_labels):
        raise ValueError("The length of predicted and true labels must be the same.")

    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    total_instances = len(true_labels)

    correctly_classified = 0
    for cluster in np.unique(predicted_labels):
        labels_in_cluster = true_labels[predicted_labels == cluster]
        # Size of the dominant true label within this cluster.
        majority_label_count = Counter(labels_in_cluster).most_common(1)[0][1]
        correctly_classified += majority_label_count

    purity = correctly_classified / total_instances
    print(f"Purity: {purity}")
    return purity





In [None]:
def calculate_accuracy(predicted_labels, true_labels):
    """Print and return the share of predictions equal to the true label.

    Args:
        predicted_labels: Sequence of predicted labels.
        true_labels: Sequence of correct labels, aligned with
            ``predicted_labels``.

    Returns:
        Accuracy as a float.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    total = len(true_labels)
    if len(predicted_labels) != total:
        raise ValueError("The length of predicted and true labels must be the same.")

    hits = 0
    for guess, truth in zip(predicted_labels, true_labels):
        if guess == truth:
            hits += 1

    accuracy = hits / total
    print(f"Accuracy: {accuracy}")
    return accuracy





In [None]:
from sklearn.metrics import f1_score
def calculate_f1_score(ground_truth, predictions):
    """Print and return the micro-averaged F1 score.

    Args:
        ground_truth: Sequence of correct labels.
        predictions: Sequence of predicted labels, aligned with ``ground_truth``.

    Returns:
        The value computed by ``f1_score`` with ``average='micro'``.
    """
    f1 = f1_score(ground_truth, predictions, average='micro')
    # Report the score like the other metric helpers do.
    print(f"F1 Score: {f1}")
    return f1



In [15]:
# Evaluate the predictions. Note the argument order: purity and accuracy take
# (predictions, ground_truth), NMI and F1 take (ground_truth, predictions).
# The value of the last expression is displayed as the cell's output.
calculate_NMI(ground_truth, predictions)
calculate_purity(predictions, ground_truth)
calculate_accuracy(predictions, ground_truth)
calculate_f1_score(ground_truth, predictions)

Normalized Mutual Information Score: 0.5015054134451503
Purity: 0.7066069428891377
Accuracy: 0.6786114221724524
F1 Score: 0.6786114221724524


0.6786114221724524