Imports

In [7]:
import os
import csv
import pandas as pd
import asyncio
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from sklearn.metrics import normalized_mutual_info_score

# Load variables from a .env file (python-dotenv) so the key is not hard-coded in the notebook.
load_dotenv()
# os.getenv returns None when GEMINI_API_KEY is not set; the value is later passed to genai.configure.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")



Data preprocessing

In [8]:
# Root folder of the BBC corpus that is handed to process_folders below.
# NOTE(review): machine-specific absolute path - swap the active line when changing machines.
# Laptop path:
#source_path_for_bbc = r"C:\Users\elias\Documents\NLP and speech processing"
# Desktop path:
source_path_for_bbc = r"C:\Users\super161\Documents\NLP and speech processing"


def process_folders(folder_path, test_size=0.2, random_state=None):
    """Load the corpus from disk and split it into test rows and few-shot examples.

    ``folder_path`` is expected to contain one sub-directory per category; every
    file inside a category directory is read as one UTF-8 encoded document.

    Args:
        folder_path: Directory whose sub-directories are named after the categories.
        test_size: Fraction of documents held out as the test set (forwarded to
            ``train_test_split``). Defaults to the original 0.2.
        random_state: Seed forwarded to ``train_test_split``. The default ``None``
            keeps the original unseeded behaviour; pass an int to make the
            split reproducible.

    Returns:
        A ``(test, train_first_doc)`` tuple of DataFrames with the columns
        ``category``, ``content``, ``name`` and ``div``. ``test`` holds the
        held-out documents (``div == 'test'``); ``train_first_doc`` holds one
        training document per category (``div == 'train'``).
    """
    document_list = []
    # Sorted so the row order does not depend on the file system's listing order.
    for folder_name in sorted(os.listdir(folder_path)):
        folder_full_path = os.path.join(folder_path, folder_name)
        # Skip stray files next to the category folders (e.g. a README);
        # calling os.listdir on them would raise NotADirectoryError.
        if not os.path.isdir(folder_full_path):
            continue

        for document_name in sorted(os.listdir(folder_full_path)):
            document_full_path = os.path.join(folder_full_path, document_name)
            if not os.path.isfile(document_full_path):
                continue

            # Open and read the content of each document
            with open(document_full_path, 'r', encoding='utf-8') as file:
                document_content = file.read()
            document_list.append((folder_name, document_content, f"bbc_{folder_name}_{document_name}"))

    bbc_news = pd.DataFrame(document_list, columns=['category', 'content', 'name'])
    # Remove rows containing NaN
    bbc_news = bbc_news.dropna(how="any")
    train, test = train_test_split(bbc_news, test_size=test_size, random_state=random_state)

    # One example document per category, used as the few-shot examples in the prompts.
    train_first_doc = (
        train.sort_values(by='category')
        .groupby('category')
        .first()
        .reset_index()
    )
    train_first_doc['div'] = 'train'

    # assign() returns a new frame, so no value is written into a slice of
    # bbc_news (which would trigger pandas' SettingWithCopyWarning).
    test = test.assign(div='test')

    return test, train_first_doc

# Row layout of both frames: category (topic), content (article text), name, div.

# test_df: held-out documents to classify; instruction_df: one example document per category.
test_df, instruction_df = process_folders(source_path_for_bbc)
print(test_df.head())


      category                                            content  \
389   business  Wal-Mart to pay $14m in gun suit\n\nThe world'...   
935   politics  Lords wrong on detainees - Straw\n\nJack Straw...   
77    business  Deadline nears for Fiat-GM deal\n\nFiat and Ge...   
1044  politics  Blair returns from peace mission\n\nPrime Mini...   
2032      tech  Man auctions ad space on forehead\n\nA 20-year...   

                      name   div  
389   bbc_business_390.txt  test  
935   bbc_politics_040.txt  test  
77    bbc_business_078.txt  test  
1044  bbc_politics_149.txt  test  
2032      bbc_tech_209.txt  test  


Make Prompt

In [9]:
def make_prompts(bbc_instructions, bbc_data):
    """Build one few-shot classification prompt per row of ``bbc_data``.

    Every prompt consists of the fixed instructions, one worked example per row
    of ``bbc_instructions`` (its ``content`` followed by its ``category``) and
    finally the ``content`` of the row that has to be classified.

    Args:
        bbc_instructions: DataFrame with the columns ``category`` and ``content``.
        bbc_data: DataFrame with the columns ``content`` and ``name``.

    Returns:
        A list with one single-entry dict ``{name: prompt}`` per row of
        ``bbc_data``, in row order.
    """
    # Fixed parts of the prompt that precede the examples, one entry per line.
    header_lines = [
        "You are a perfect topic modeling machine. Given a text and the different topics, "
        "you will classify the texts to the correct topic. First you will receive the topics, "
        "afterwards an example and finally the text you have to assign one of the before mentioned topics to.",
        "The topics are business, entertainment, politics, sport and tech. Please make sure, you know the topics and their meaning.",
        "Now an example for each of the categories will follow.",
    ]
    closing_instruction = (
        "Now the text, you have to classify will follow. Please assess its topic and answer only the topic of it."
    )

    # The worked examples are identical for every prompt, so render them once.
    example_parts = []
    for _, instruction_row in bbc_instructions.iterrows():
        category = instruction_row['category']
        example_text = instruction_row['content']
        example_parts.append(
            f"For the following text: \n{example_text}\nThe correct answer would be: {category}\n"
        )
    shared_prefix = "\n".join(header_lines) + "\n" + "".join(example_parts)

    # One {document name: prompt} dict per document to classify.
    return [
        {f"{row['name']}": shared_prefix + closing_instruction + "\n" + row['content'] + "\n"}
        for _, row in bbc_data.iterrows()
    ]

# One {document name: prompt} dict per test document; read as a global by prompt_orchestrator.
prompts = make_prompts(instruction_df, test_df)

# Sanity check: should equal the number of rows in test_df.
print(len(prompts))

445


In [10]:


# Upper bound on simultaneously running Gemini requests across all prompt_llm calls.
_MAX_CONCURRENT_REQUESTS = 5
_request_semaphore = None
_request_semaphore_loop = None


def _get_request_semaphore():
    """Return the semaphore shared by every ``prompt_llm`` call on the running event loop."""
    global _request_semaphore, _request_semaphore_loop
    loop = asyncio.get_running_loop()
    # Created lazily (and re-created for a new loop) because an asyncio
    # semaphore must only be used from the loop it was first awaited on.
    if _request_semaphore is None or _request_semaphore_loop is not loop:
        _request_semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS)
        _request_semaphore_loop = loop
    return _request_semaphore


async def prompt_llm(prompt):
    """Send a single prompt to Gemini and return the raw text answer.

    Args:
        prompt: Single-entry dict ``{name: prompt_text}``; only the first key is used.

    Returns:
        A single-entry dict ``{name: text_response}`` where ``text_response`` is
        the text of the first part of the first candidate.

    Raises:
        IndexError: If the response contains no candidate or no content part
            (same as before; nothing is swallowed here).
    """
    name = list(prompt.keys())[0]
    promptAI = prompt[name]

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 64,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }
    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro",
        generation_config=generation_config,
        safety_settings={
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        },
    )

    # Bug fix 1: the semaphore used to be created inside every call, so each
    # call acquired its own private semaphore and nothing was ever limited.
    # Bug fix 2: the blocking send_message() stalled the event loop, so calls
    # started through asyncio.gather ran one after another; the async variant
    # yields to the loop while waiting for the API.
    async with _get_request_semaphore():
        chat_session = model.start_chat(history=[])
        answer = await chat_session.send_message_async(promptAI)

    # Public accessor instead of the private `_result` attribute.
    text_response = answer.candidates[0].content.parts[0].text

    return {name: text_response}

In [11]:
# Register the API key with the SDK before any request is sent.
# NOTE(review): GEMINI_API_KEY is None when the environment variable is missing - confirm it is set.
genai.configure(api_key=GEMINI_API_KEY)

async def prompt_orchestrator(prompt_list=None, batch_size=5, pause_seconds=2):
    """Send all prompts to the model in small concurrent batches.

    Args:
        prompt_list: List of single-entry ``{name: prompt}`` dicts. Defaults to
            the module-level ``prompts`` list, which is what the original
            zero-argument call used.
        batch_size: Number of prompts awaited together with ``asyncio.gather``.
        pause_seconds: Pause between two consecutive batches.

    Returns:
        A list with one ``{name: text_response}`` dict per prompt, in prompt
        order. An empty prompt list yields an empty result list.
    """
    if prompt_list is None:
        prompt_list = prompts

    # Plain slicing already produces a shorter final batch, so the former
    # "fix the last batch" code was a no-op - and its `batches[-1]` access
    # raised IndexError when there were no prompts at all.
    batches = [prompt_list[i:i + batch_size] for i in range(0, len(prompt_list), batch_size)]

    answers = []
    for batch_number, batch in enumerate(batches, start=1):
        answers.extend(await asyncio.gather(*(prompt_llm(prompt) for prompt in batch)))
        # Pause only between batches; there is nothing to wait for after the last one.
        if batch_number < len(batches):
            print("sleeping")
            await asyncio.sleep(pause_seconds)

    # The complete result list is returned rather than also dumped to stdout.
    return answers

# Top-level await: relies on the notebook kernel's already-running event loop.
results = await prompt_orchestrator()

# List of {document name: raw model answer} dicts.
print(results)


sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
[{'bbc_business_390.txt': 'business \n'}, {'bbc_politics_040.txt': 'politics \n'}, {'bbc_business_078.txt': 'business \n'}, {'bbc_politics_149.txt': 'politics \n'}, {'bbc_tech_209.txt': 'business \n'

In [21]:
def add_correct_anser(results, test_df):
    """Pair every cleaned model answer with the true category of its document.

    Args:
        results: List of dicts mapping a document name to the raw model answer.
        test_df: DataFrame with the columns ``name`` and ``category``.

    Returns:
        A list of single-entry dicts ``{name: (answer, category)}``. The answer
        has all newlines and spaces removed and is lower-cased; the category is
        lower-cased, or ``None`` when ``test_df`` has no row with that name.
    """
    paired = []

    for entry in results:
        for key, raw_answer in entry.items():
            # Normalise the answer: no newlines, no spaces, lower case.
            cleaned = raw_answer.replace("\n", "").replace(" ", "").lower()
            categories = test_df.loc[test_df['name'] == key, 'category']

            if categories.empty:
                print(f"No matching category found for key: {key}")
                paired.append({key: (cleaned, None)})
                continue

            # Several rows may share a name; the first match is used.
            paired.append({key: (cleaned, categories.values[0].lower())})

    return paired

# List of {document name: (cleaned model answer, true category)} dicts.
results_with_correct_answer = add_correct_anser(results, test_df)
print(test_df.head())
#print(results)
print(results_with_correct_answer)
# Should equal the number of model answers in `results`.
print(len(results_with_correct_answer))

      category                                            content  \
389   business  Wal-Mart to pay $14m in gun suit\n\nThe world'...   
935   politics  Lords wrong on detainees - Straw\n\nJack Straw...   
77    business  Deadline nears for Fiat-GM deal\n\nFiat and Ge...   
1044  politics  Blair returns from peace mission\n\nPrime Mini...   
2032      tech  Man auctions ad space on forehead\n\nA 20-year...   

                      name   div    result  
389   bbc_business_390.txt  test  business  
935   bbc_politics_040.txt  test  politics  
77    bbc_business_078.txt  test  business  
1044  bbc_politics_149.txt  test  politics  
2032      bbc_tech_209.txt  test  business  
[{'bbc_business_390.txt': ('business', 'business')}, {'bbc_politics_040.txt': ('politics', 'politics')}, {'bbc_business_078.txt': ('business', 'business')}, {'bbc_politics_149.txt': ('politics', 'politics')}, {'bbc_tech_209.txt': ('business', 'tech')}, {'bbc_business_267.txt': ('business', 'business')}, {'bbc_busi

In [34]:
import difflib
from thefuzz import fuzz
def post_process(results, threshold=60, output_path=r"E:\uni\NLP group project\gemini_bbc.csv"):
    """Snap every model answer to the closest known topic and record it in ``test_df``.

    Each answer is compared with the five topic names using
    ``fuzz.partial_ratio``. The best-scoring topic replaces the answer when its
    score is above ``threshold``; otherwise the answer becomes ``""``.

    Side effects: writes the final answers into the ``result`` column of the
    module-level ``test_df`` and saves that frame as CSV to ``output_path``.

    Args:
        results: List of single-entry dicts ``{name: (answer, category)}``.
        threshold: Score the best match has to exceed to be accepted.
        output_path: Destination of the CSV export; defaults to the path the
            original implementation had hard-coded.

    Returns:
        A list of single-entry dicts ``{name: (final_answer, category)}`` in
        the order of ``results``.
    """
    string_list = ["business", "entertainment", "sport", "tech", "politics"]

    # Bug fix: `test_df.loc['result'] = None` appended a ROW labelled 'result'
    # (every cell None) instead of creating the column, which put a spurious
    # empty line into the exported CSV.
    test_df['result'] = None

    final_results = []
    for result in results:
        name = list(result.keys())[0]
        answer_pair = list(result.values())[0]
        raw_prediction, correct_category = answer_pair[0], answer_pair[1]

        # [score, topic] lists compare by score first; ties fall back to the topic name.
        best_score, best_topic = max(
            [fuzz.partial_ratio(raw_prediction, topic), topic] for topic in string_list
        )
        final_prediction = best_topic if best_score > threshold else ""

        final_results.append({name: (final_prediction, correct_category)})
        test_df.loc[test_df['name'] == name, 'result'] = final_prediction

    test_df.to_csv(output_path, index=False)
    return final_results
# Keep a reference to the list as it was before post-processing (same object, not a copy).
result_with_correct_answer_pre_post_process = results_with_correct_answer
print(result_with_correct_answer_pre_post_process)
# Rebinds the name to the new list returned by post_process; the reference above still holds the old list.
results_with_correct_answer = post_process(results_with_correct_answer)
print(results_with_correct_answer)

[{'bbc_business_390.txt': ('business', 'business')}, {'bbc_politics_040.txt': ('politics', 'politics')}, {'bbc_business_078.txt': ('business', 'business')}, {'bbc_politics_149.txt': ('politics', 'politics')}, {'bbc_tech_209.txt': ('business', 'tech')}, {'bbc_business_267.txt': ('business', 'business')}, {'bbc_business_286.txt': ('business', 'business')}, {'bbc_tech_100.txt': ('tech', 'tech')}, {'bbc_sport_470.txt': ('sport', 'sport')}, {'bbc_entertainment_125.txt': ('entertainment', 'entertainment')}, {'bbc_entertainment_370.txt': ('entertainment', 'entertainment')}, {'bbc_sport_172.txt': ('sport', 'sport')}, {'bbc_business_324.txt': ('business', 'business')}, {'bbc_entertainment_087.txt': ('entertainment', 'entertainment')}, {'bbc_entertainment_042.txt': ('entertainment', 'entertainment')}, {'bbc_business_137.txt': ('business', 'business')}, {'bbc_entertainment_150.txt': ('entertainment', 'entertainment')}, {'bbc_business_023.txt': ('business', 'business')}, {'bbc_sport_289.txt': ('sp

In [35]:
def extract_ground_truth_and_predictions(results_with_correct_answer):
    """Split the paired results into two parallel label lists.

    Args:
        results_with_correct_answer: List of dicts whose values are
            ``(prediction, ground_truth)`` pairs.

    Returns:
        A ``(ground_truth, predictions)`` tuple of equally long lists, in the
        order the pairs appear in the input.
    """
    pairs = [
        pair
        for result in results_with_correct_answer
        for pair in result.values()
    ]
    # Index 1 of a pair is the true label, index 0 the predicted one.
    ground_truth = [pair[1] for pair in pairs]
    predictions = [pair[0] for pair in pairs]
    return ground_truth, predictions
# Parallel lists: ground_truth[i] and predictions[i] belong to the same document.
ground_truth, predictions = extract_ground_truth_and_predictions(results_with_correct_answer)

In [36]:
def calculate_NMI(ground_truth, predictions):
    """Print and return the normalized mutual information of the two labelings.

    Args:
        ground_truth: Sequence of true labels.
        predictions: Sequence of predicted labels, parallel to ``ground_truth``.

    Returns:
        The score computed by scikit-learn's ``normalized_mutual_info_score``.
    """
    nmi_score = normalized_mutual_info_score(
        labels_true=ground_truth,
        labels_pred=predictions,
    )
    print(f"Normalized Mutual Information Score: {nmi_score}")
    return nmi_score


In [37]:
import numpy as np
from collections import Counter

def calculate_purity(predicted_labels, true_labels):
    """Print and return the purity of the predicted labelling.

    Every distinct predicted label is treated as one cluster. For each cluster
    the count of its most frequent true label is taken; purity is the sum of
    those counts divided by the total number of instances.

    Args:
        predicted_labels: Sequence of predicted labels (the clusters).
        true_labels: Sequence of true labels, parallel to ``predicted_labels``.

    Returns:
        The purity as a float.

    Raises:
        ValueError: If the two sequences differ in length. Without this check
            a shorter ``predicted_labels`` silently produced a wrong score.
        ZeroDivisionError: If both sequences are empty.
    """
    # Same precondition (and message) as calculate_accuracy.
    if len(predicted_labels) != len(true_labels):
        raise ValueError("The length of predicted and true labels must be the same.")

    # Arrays allow selecting the members of a cluster with a boolean mask.
    predicted_labels = np.asarray(predicted_labels)
    true_labels = np.asarray(true_labels)

    total_instances = len(true_labels)

    correctly_classified = 0
    for cluster in np.unique(predicted_labels):
        # True labels of the instances assigned to the current cluster
        labels_in_cluster = true_labels[predicted_labels == cluster]
        # Size of the majority class inside this cluster
        correctly_classified += Counter(labels_in_cluster).most_common(1)[0][1]

    purity = correctly_classified / total_instances
    print(f"Purity: {purity}")
    return purity





In [38]:
def calculate_accuracy(predicted_labels, true_labels):
    """Print and return the fraction of predictions that equal the true label.

    Args:
        predicted_labels: Sequence of predicted labels.
        true_labels: Sequence of true labels, parallel to ``predicted_labels``.

    Returns:
        Number of matching positions divided by the number of true labels.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    total = len(true_labels)
    if len(predicted_labels) != total:
        raise ValueError("The length of predicted and true labels must be the same.")

    # Tally the positions where prediction and truth agree.
    hits = 0
    for guess, target in zip(predicted_labels, true_labels):
        if guess == target:
            hits += 1

    accuracy = hits / total
    print(f"Accuracy: {accuracy}")
    return accuracy





In [39]:
from sklearn.metrics import f1_score
def calculate_f1_score(ground_truth, predictions):
    """Print and return the micro-averaged F1 score.

    Args:
        ground_truth: Sequence of true labels.
        predictions: Sequence of predicted labels, parallel to ``ground_truth``.

    Returns:
        The score computed by scikit-learn's ``f1_score`` with ``average='micro'``.
    """
    # Micro averaging was chosen because some classes might be over- or
    # under-represented.
    f1 = f1_score(y_true=ground_truth, y_pred=predictions, average='micro')
    print(f"F1 Score: {f1}")
    return f1



In [40]:
# Argument order differs: NMI and F1 take (ground_truth, predictions),
# purity and accuracy take (predicted_labels, true_labels).
calculate_NMI(ground_truth, predictions)
calculate_purity(predictions, ground_truth)
calculate_accuracy(predictions, ground_truth)
# Last expression of the cell: its return value is echoed below the printed scores.
calculate_f1_score(ground_truth, predictions)

Normalized Mutual Information Score: 0.8904901758105461
Purity: 0.9595505617977528
Accuracy: 0.9595505617977528
F1 Score: 0.9595505617977528


0.9595505617977528