In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from sklearn.metrics import normalized_mutual_info_score
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import asyncio
import tensorflow as tf
import torch
import json
from hugchat import hugchat
# Load variables from a .env file (if present) into the process environment.
load_dotenv()
# Token read from the LLAMA environment variable; os.getenv returns None when it is unset.
LLAMA = os.getenv("LLAMA")
# Authenticate against the Hugging Face Hub with that token.
login(token=LLAMA)

# Load the HuggingChat session cookies from cookies.json, which is looked up in the
# parent of the current working directory. They are passed to hugchat.ChatBot later.
base_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
json_file_path = os.path.join(base_path, 'cookies.json')
with open(json_file_path, 'r') as file:
    cookies = json.load(file)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Root folder of the corpus handed to process_folders(); it is read as one
# sub-folder per category. The path is machine-specific: enable the line that
# matches the machine the notebook runs on.
# Laptop path
#source_path_for_bbc = r"C:\Users\elias\Documents\NLP and speech processing"
# Desktop path
source_path_for_bbc = r"C:\Users\super161\Documents\NLP and speech processing"


def process_folders(folder_path):
    """Load a folder-per-category corpus and derive test data plus few-shot examples.

    Every sub-folder of ``folder_path`` is treated as one category and every
    file inside it as one UTF-8 encoded document.

    Args:
        folder_path: Directory containing one sub-folder per category.

    Returns:
        A tuple ``(test, train_first_doc)``:
        * ``test`` - 20% hold-out split with the columns ``category``,
          ``content``, ``name`` and ``div`` (always ``'test'``).
        * ``train_first_doc`` - one document per category taken from the
          training split, with ``div`` set to ``'train'``.
    """
    document_list = []

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order, and therefore the seeded split below, independent of the filesystem.
    for folder_name in sorted(os.listdir(folder_path)):
        folder_full_path = os.path.join(folder_path, folder_name)
        if not os.path.isdir(folder_full_path):
            # Stray files next to the category folders are not categories.
            continue

        for document_name in sorted(os.listdir(folder_full_path)):
            document_full_path = os.path.join(folder_full_path, document_name)
            if not os.path.isfile(document_full_path):
                # Nested directories cannot be read as a document.
                continue

            with open(document_full_path, 'r', encoding='utf-8') as file:
                document_content = file.read()
            document_list.append((folder_name, document_content, f"bbc_{folder_name}_{document_name}"))

    bbc_news = pd.DataFrame(document_list, columns=['category', 'content', 'name'])
    bbc_news = bbc_news.dropna(how="any")
    train, test = train_test_split(bbc_news, test_size=0.2, random_state=42)

    # One example document per category for the few-shot part of the prompt.
    train = train.sort_values(by='category')
    train_first_doc = train.groupby('category').first().reset_index()
    train_first_doc['div'] = 'train'

    # Work on an explicit copy so the column assignment does not write into a
    # slice of bbc_news (avoids pandas' SettingWithCopyWarning).
    test = test.copy()
    test['div'] = 'test'

    return test, train_first_doc

# Both frames carry the columns: category (topic), content (article text),
# name (document identifier) and div ('test' / 'train').

test_df, instruction_df = process_folders(source_path_for_bbc)
print(test_df.head())


      category                                            content  \
414   business  UK house prices dip in November\n\nUK house pr...   
420   business  LSE 'sets date for takeover deal'\n\nThe Londo...   
1644     sport  Harinordoquy suffers France axe\n\nNumber eigh...   
416   business  Barclays shares up on merger talk\n\nShares in...   
1232  politics  Campaign 'cold calls' questioned\n\nLabour and...   

                      name   div  
414   bbc_business_415.txt  test  
420   bbc_business_421.txt  test  
1644     bbc_sport_332.txt  test  
416   bbc_business_417.txt  test  
1232  bbc_politics_337.txt  test  


In [None]:
def make_prompts(bbc_instructions, bbc_data):
    """Build one few-shot classification prompt per document in ``bbc_data``.

    Every prompt consists of the general instruction, the list of topics, one
    worked example per row of ``bbc_instructions`` and finally the text that
    has to be classified.

    Args:
        bbc_instructions: DataFrame with the columns ``category`` and
            ``content`` holding the example documents.
        bbc_data: DataFrame with the columns ``content`` and ``name`` holding
            the documents to classify.

    Returns:
        A list with one single-entry dict ``{name: prompt}`` per row of
        ``bbc_data``, in row order.
    """
    general_instruction = (
        "You are a perfect topic modeling machine. Given a text and the different topics, "
        "you will classify the texts to the correct topic. First you will receive the topics, "
        "afterwards an example and finally the text you have to assign one of the before mentioned topics to."
    )
    topics = "The topics are business, entertainment, politics, sport and tech. Please make sure, you know the topics and their meaning."
    transition_to_examples = "Now an example for each of the categories will follow."
    transition_to_text_to_classify = (
        "Now the text, you have to classify will follow. Please assess its topic and answer only the topic of it. The answer must be a string."
    )

    # Everything up to the text to classify is identical for all prompts, so it
    # is assembled once instead of once per test document.
    examples = "".join(
        f"For the following text: \n{example_text}\nThe correct answer would be: {category}\n"
        for category, example_text in zip(bbc_instructions['category'], bbc_instructions['content'])
    )
    prompt_prefix = (
        general_instruction + "\n" + topics + "\n" + transition_to_examples + "\n"
        + examples
        + transition_to_text_to_classify + "\n"
    )

    return [
        {f"{name}": prompt_prefix + text_to_classify + "\n"}
        for name, text_to_classify in zip(bbc_data['name'], bbc_data['content'])
    ]

# Build one prompt per test document.
prompts = make_prompts(instruction_df, test_df)
# Drops the first 90 prompts. NOTE(review): presumably these were already
# answered in an earlier, interrupted run - confirm, since metrics computed
# later only cover the remaining documents.
prompts = prompts[90:]
print(len(prompts))

355


In [None]:
# Shared by all huggingchat() calls. A semaphore created inside the coroutine
# would be private to that single call and therefore never limit anything.
_HUGGINGCHAT_SEMAPHORE = asyncio.Semaphore(5)


async def huggingchat(prompt):
    """Send a single prompt to HuggingChat in a fresh conversation.

    Args:
        prompt: Single-entry dict ``{name: prompt_text}``.

    Returns:
        Single-entry dict ``{name: answer_text}`` with the chatbot's reply.

    NOTE(review): ``chatbot.chat`` is called without ``await``, so it appears
    to be a synchronous call that holds the event loop while it runs - confirm
    before relying on concurrent calls.
    """
    async with _HUGGINGCHAT_SEMAPHORE:
        chatbot = hugchat.ChatBot(cookies)
        name = list(prompt.keys())[0]
        promptAI = prompt[name]

        # Start every prompt in its own conversation so earlier prompts
        # cannot influence the answer.
        conversation = chatbot.new_conversation()
        chatbot.change_conversation(conversation)

        answer_from_chatbot = chatbot.chat(promptAI)
        answer: str = answer_from_chatbot['text']

        return {name: answer}

In [None]:
# Making sure that the prompting frequency does not exceed the rate limit of the API
async def prompt_orchestrator(prompts):
    """Send all prompts sequentially in throttled batches, retrying failed batches.

    Prompts are processed in batches of five with a pause after every prompt
    and after every batch. If a batch raises, the whole batch is retried after
    15 seconds; only results of completed batches are kept, so a retry never
    produces duplicates. After more than ``max_retries`` consecutive failures
    the results collected so far are returned.

    Args:
        prompts: List of single-entry dicts ``{name: prompt_text}``.

    Returns:
        List of single-entry dicts ``{name: answer_text}`` in prompt order
        (possibly truncated if the retries were exhausted).
    """
    batch_size = 5
    max_retries = 5
    avatiables = []
    # Slicing already yields a shorter final batch when len(prompts) is not a
    # multiple of batch_size, and an empty list for empty input.
    batches = [prompts[i:i + batch_size] for i in range(0, len(prompts), batch_size)]

    retry_count = 0
    current_batch_index = 0

    while current_batch_index < len(batches):
        batch = batches[current_batch_index]
        try:
            avatiables_batch = []
            for prompt in batch:
                response = await huggingchat(prompt)
                avatiables_batch.append(response)
                await asyncio.sleep(6)
        except Exception as e:
            retry_count += 1
            if retry_count > max_retries:
                print(f"Failed after {max_retries} retries. Returning the accumulated results.")
                return avatiables
            print(f"An error occurred: {e}. Retrying in 15 seconds... (Attempt {retry_count}/{max_retries})")
            await asyncio.sleep(15)
            continue

        # The batch completed: commit its results and reset the retry budget.
        avatiables.extend(avatiables_batch)
        current_batch_index += 1
        retry_count = 0
        await asyncio.sleep(8)
        print(f"{len(avatiables)} prompts processed.")

    return avatiables

# Top-level await: this relies on the notebook kernel's running event loop.
results = await prompt_orchestrator(prompts)

print(results)

In [26]:
def add_correct_anser(results, test_df):
    """Pair every normalised model answer with the true category of its document.

    The answer is normalised by removing all newlines and spaces and
    lower-casing it. The true category is looked up in ``test_df`` by the
    document name and lower-cased as well.

    Args:
        results: List of dicts ``{name: answer_text}``.
        test_df: DataFrame with the columns ``name`` and ``category``.

    Returns:
        List of dicts ``{name: (normalised_answer, category)}``; ``category``
        is ``None`` (and a message is printed) when the name is unknown.
    """
    # Build the name -> category lookup once instead of scanning the frame for
    # every result. setdefault keeps the first occurrence of a duplicated name.
    category_by_name = {}
    for name, category in zip(test_df['name'], test_df['category']):
        category_by_name.setdefault(name, category)

    result_with_correct_answer = []
    for result in results:
        for key, value in result.items():
            value = value.replace("\n", "").replace(" ", "").lower()

            if key in category_by_name:
                category_value = category_by_name[key].lower()
                result_with_correct_answer.append({key: (value, category_value)})
            else:
                print(f"No matching category found for key: {key}")
                result_with_correct_answer.append({key: (value, None)})

    return result_with_correct_answer

# Pair each normalised model answer with the true category of its document.
results_with_correct_answer = add_correct_anser(results, test_df)
print(results_with_correct_answer)
print(len(results_with_correct_answer))

[{'bbc_tech_246.txt': ('tech', 'tech')}, {'bbc_tech_099.txt': ('tech', 'tech')}, {'bbc_entertainment_376.txt': ('entertainment', 'entertainment')}, {'bbc_business_068.txt': ('business', 'business')}, {'bbc_entertainment_195.txt': ('entertainment', 'entertainment')}, {'bbc_politics_290.txt': ('politics', 'politics')}, {'bbc_business_365.txt': ('business', 'business')}, {'bbc_entertainment_314.txt': ('entertainment', 'entertainment')}, {'bbc_business_069.txt': ('business', 'business')}, {'bbc_business_402.txt': ('business', 'business')}, {'bbc_sport_194.txt': ('sport', 'sport')}, {'bbc_tech_021.txt': ('tech', 'tech')}, {'bbc_tech_244.txt': ('entertainment', 'tech')}, {'bbc_sport_423.txt': ('sport', 'sport')}, {'bbc_entertainment_289.txt': ('thecorrectansweris:entertainment', 'entertainment')}, {'bbc_politics_190.txt': ('politics', 'politics')}, {'bbc_politics_011.txt': ('politics', 'politics')}, {'bbc_tech_204.txt': ('thecorrectanswerwouldbethefinalanswerreceived10,000peryear.', 'tech')}

In [None]:
# Map each raw model answer onto one of the known topics (fuzzy matching), keeping the correct answer next to it
import difflib
from thefuzz import fuzz
def post_process(results, output_path=r"E:\uni\NLP group project\llama_bbc.csv"):
    """Map free-text answers onto the known topics and export the predictions.

    Each answer is compared with every topic via ``fuzz.partial_ratio``. The
    best-scoring topic is used when its score is above 60, otherwise the
    prediction is the empty string. The prediction is also written to the
    ``result`` column of the module-level ``test_df``, which is then saved as
    CSV.

    Args:
        results: List of single-entry dicts ``{name: (answer, category)}``.
        output_path: Destination of the CSV export of ``test_df``.

    Returns:
        List of single-entry dicts ``{name: (predicted_topic, category)}``.
    """
    # Create the *column*. ``test_df.loc['result'] = None`` would instead append
    # a row labelled 'result' filled with None, which then ends up in the CSV.
    test_df['result'] = None
    string_list = ["business", "entertainment", "politics", "sport", "tech"]
    final_results = []

    for result in results:
        name = list(result.keys())[0]
        answer_pair = list(result.values())[0]
        raw_answer = answer_pair[0]
        correct_answer = answer_pair[1]

        # [score, topic] pairs: max() picks the highest score and, on ties,
        # the alphabetically last topic.
        max_similarity = max([fuzz.partial_ratio(raw_answer, string), string] for string in string_list)

        predicted_topic = max_similarity[1] if max_similarity[0] > 60 else ""
        final_results.append({name: (predicted_topic, correct_answer)})
        test_df.loc[test_df['name'] == name, 'result'] = predicted_topic

    test_df.to_csv(output_path, index=False)
    return final_results
# Keep a reference to the un-processed answers for inspection before the name is rebound below.
result_with_correct_answer_pre_post_process = results_with_correct_answer
print(result_with_correct_answer_pre_post_process)
# From here on results_with_correct_answer holds the fuzzy-matched predictions.
results_with_correct_answer = post_process(results_with_correct_answer)
print(results_with_correct_answer)

[{'bbc_tech_246.txt': ('tech', 'tech')}, {'bbc_tech_099.txt': ('tech', 'tech')}, {'bbc_entertainment_376.txt': ('entertainment', 'entertainment')}, {'bbc_business_068.txt': ('business', 'business')}, {'bbc_entertainment_195.txt': ('entertainment', 'entertainment')}, {'bbc_politics_290.txt': ('politics', 'politics')}, {'bbc_business_365.txt': ('business', 'business')}, {'bbc_entertainment_314.txt': ('entertainment', 'entertainment')}, {'bbc_business_069.txt': ('business', 'business')}, {'bbc_business_402.txt': ('business', 'business')}, {'bbc_sport_194.txt': ('sport', 'sport')}, {'bbc_tech_021.txt': ('tech', 'tech')}, {'bbc_tech_244.txt': ('entertainment', 'tech')}, {'bbc_sport_423.txt': ('sport', 'sport')}, {'bbc_entertainment_289.txt': ('thecorrectansweris:entertainment', 'entertainment')}, {'bbc_politics_190.txt': ('politics', 'politics')}, {'bbc_politics_011.txt': ('politics', 'politics')}, {'bbc_tech_204.txt': ('thecorrectanswerwouldbethefinalanswerreceived10,000peryear.', 'tech')}

In [28]:
def extract_ground_truth_and_predictions(results_with_correct_answer):
    """Split ``{name: (prediction, truth)}`` entries into two parallel lists.

    Returns:
        ``(ground_truth, predictions)`` in the order of the input entries.
    """
    ground_truth, predictions = [], []
    for entry in results_with_correct_answer:
        for pair in entry.values():
            # Each value is (prediction, truth).
            ground_truth.append(pair[1])
            predictions.append(pair[0])
    return ground_truth, predictions
# Parallel lists of true and predicted topics, used by all metric functions.
ground_truth, predictions = extract_ground_truth_and_predictions(results_with_correct_answer)

In [29]:
def calculate_NMI(ground_truth, predictions):
    """Print and return the normalized mutual information of the two labelings."""
    score = normalized_mutual_info_score(ground_truth, predictions)
    print(f"Normalized Mutual Information Score: {score}")
    return score


In [None]:
import numpy as np
from collections import Counter

def calculate_purity(predicted_labels, true_labels):
    """Print and return the purity of the predicted grouping.

    Every distinct predicted label is treated as a cluster. The sizes of the
    most frequent true label within each cluster are summed and divided by the
    number of instances.
    """
    predicted = np.array(predicted_labels)
    truth = np.array(true_labels)

    majority_total = 0
    for cluster_label in np.unique(predicted):
        members = np.nonzero(predicted == cluster_label)[0]
        label_counts = Counter(truth[members])
        # Size of the dominant true class inside this cluster.
        (_, dominant_count), = label_counts.most_common(1)
        majority_total += dominant_count

    purity = majority_total / len(truth)
    print(f"Purity: {purity}")
    return purity





In [None]:
def calculate_accuracy(predicted_labels, true_labels):
    """Print and return the share of positions where prediction and truth agree.

    Raises:
        ValueError: If the two label sequences differ in length.
    """
    n_true = len(true_labels)
    if len(predicted_labels) != n_true:
        raise ValueError("The length of predicted and true labels must be the same.")

    hits = 0
    for predicted, expected in zip(predicted_labels, true_labels):
        if predicted == expected:
            hits += 1

    accuracy = hits / n_true
    print(f"Accuracy: {accuracy}")
    return accuracy





In [None]:
from sklearn.metrics import f1_score
def calculate_f1_score(ground_truth, predictions, average='micro'):
    """Print and return the F1 score of the predictions.

    Args:
        ground_truth: True labels.
        predictions: Predicted labels, aligned with ``ground_truth``.
        average: Averaging mode forwarded to ``f1_score``. The default
            ``'micro'`` keeps the previous behaviour. For this single-label
            task it yields the same number as the accuracy, so pass e.g.
            ``'macro'`` for a per-class view.

    Returns:
        The F1 score computed by ``f1_score``.
    """
    f1 = f1_score(ground_truth, predictions, average=average)
    print(f"F1 Score: {f1}")
    return f1



In [33]:
# Each call prints its metric; the value displayed after the prints is the
# return value of the last call (the F1 score).
# Note the argument order: purity and accuracy take (predicted, true),
# NMI and F1 take (true, predicted).
calculate_NMI(ground_truth, predictions)
calculate_purity(predictions, ground_truth)
calculate_accuracy(predictions, ground_truth)
calculate_f1_score(ground_truth, predictions)

Normalized Mutual Information Score: 0.806337057133388
Purity: 0.923943661971831
Accuracy: 0.9211267605633803
F1 Score: 0.9211267605633803


0.9211267605633803