Imports

In [119]:
import os
import csv
import pandas as pd
import asyncio
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv) before reading the key.
load_dotenv()
# API key for the Gemini client; os.getenv returns None when the variable is unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")



Data preprocessing

In [110]:
# Root folder of the corpus: one sub-folder per category, each holding the articles.
# Set the BBC_DATA_DIR environment variable to use a different copy instead of
# editing this cell, e.g. on the laptop:
#   C:\Users\elias\Documents\NLP and speech processing
# The desktop path below remains the default when the variable is not set.
source_path_for_bbc = os.getenv(
    "BBC_DATA_DIR",
    r"C:\Users\super161\Documents\NLP and speech processing",
)


def process_folders(folder_path, random_state=None):
    """Read a folder-per-category corpus and split it into test rows and few-shot examples.

    Every sub-directory of ``folder_path`` is treated as one category and every
    regular file inside it as one document of that category.

    Args:
        folder_path: Directory whose sub-directories are named after the categories.
        random_state: Optional seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split random, as before.

    Returns:
        tuple: ``(test, train_first_doc)``. ``test`` is the 20% hold-out with the
        columns ``category``, ``content``, ``name`` and ``div`` (always ``'test'``).
        ``train_first_doc`` holds one row per category taken from the training
        part, with the same columns and ``div`` set to ``'train'``.

    Raises:
        UnicodeDecodeError: If a document cannot be decoded as UTF-8.
    """
    document_list = []
    # Sorted listings make the row order (and therefore a seeded split) independent
    # of the order in which the OS happens to return directory entries.
    for folder_name in sorted(os.listdir(folder_path)):
        folder_full_path = os.path.join(folder_path, folder_name)
        # Stray files next to the category folders (README, notebooks, ...) are not categories.
        if not os.path.isdir(folder_full_path):
            continue

        for document_name in sorted(os.listdir(folder_full_path)):
            document_full_path = os.path.join(folder_full_path, document_name)
            if not os.path.isfile(document_full_path):
                continue

            # Open and read the content of each document
            with open(document_full_path, 'r', encoding='utf-8') as file:
                document_content = file.read()
            document_list.append((folder_name, document_content, document_name))

    bbc_news = pd.DataFrame(document_list, columns=['category', 'content', 'name'])

    train, test = train_test_split(bbc_news, test_size=0.2, random_state=random_state)

    # One example document per category to be used as few-shot instruction.
    train = train.sort_values(by='category')
    train_first_doc = train.groupby('category').first().reset_index()
    train_first_doc['div'] = 'train'

    # assign() returns a new frame, so no column is written into the frame that
    # train_test_split derived from bbc_news (avoids SettingWithCopyWarning).
    test = test.assign(div='test')

    return test, train_first_doc

# test_df: held-out rows to classify; instruction_df: one training example per category.
# Both frames carry the columns category, content, name and div.

test_df, instruction_df = process_folders(source_path_for_bbc)


Build prompts

In [111]:
def make_prompts(bbc_instructions, bbc_data):
    """Build one few-shot classification prompt per row of ``bbc_data``.

    Each prompt consists of the general instruction, the list of topics, one
    worked example per row of ``bbc_instructions`` and finally the text that has
    to be classified.

    Args:
        bbc_instructions: DataFrame with the columns ``category`` and ``content``;
            every row becomes one worked example.
        bbc_data: DataFrame with the columns ``category``, ``content`` and ``name``;
            every row becomes one prompt.

    Returns:
        list[dict]: One single-entry dict per row of ``bbc_data``, mapping
        ``"bbc_<category>_<name>"`` to the full prompt text, in row order.
    """
    # General instructions and fixed texts
    general_instruction = (
        "You are a perfect topic modeling machine. Given a text and the different topics, "
        "you will classify the texts to the correct topic. First you will receive the topics, "
        "afterwards an example and finally the text you have to assign one of the before mentioned topics to."
    )
    topics = "The topics are business, entertainment, politics, sport and tech. Please make sure, you know the topics and their meaning."
    transition_to_examples = "Now an example for each of the categories will follow."
    transition_to_text_to_classify = (
        "Now the text, you have to classify will follow. Please assess its topic and answer only the topic of it."
    )

    # Everything up to the text to classify is identical for all prompts, so it
    # is assembled once instead of once per test row.
    examples = "".join(
        f"For the following text: \n{instruction_row['content']}\n"
        f"The correct answer would be: {instruction_row['category']}\n"
        for _, instruction_row in bbc_instructions.iterrows()
    )
    prompt_prefix = (
        general_instruction + "\n" + topics + "\n" + transition_to_examples + "\n"
        + examples
        + transition_to_text_to_classify + "\n"
    )

    prompts = []
    for _, test_row in bbc_data.iterrows():
        # The true category is part of the key so answers can be scored later.
        name = f"bbc_{test_row['category']}_{test_row['name']}"
        prompts.append({name: prompt_prefix + test_row['content'] + "\n"})
    return prompts

prompts = make_prompts(instruction_df, test_df)
# Keep only the first 11 prompts for the run below.
prompts = prompts[:11]
# Show the key (name) of the first prompt as a quick sanity check.
print(prompts[0].keys())

dict_keys(['bbc_tech_203.txt'])


In [117]:
# Shared by all prompt_llm calls: at most 10 of them may hold a slot at the same time.
semaphore = asyncio.Semaphore(10)

async def prompt_llm(prompt,  time_interval):
    """Send one prompt to Gemini and return the answer keyed by the prompt's name.

    Args:
        prompt: Single-entry dict mapping the prompt name to the prompt text
            (only the first entry is used).
        time_interval: Seconds to sleep after the request while the semaphore
            slot is still held, which spaces out consecutive requests.

    Returns:
        dict: ``{name: text_response}`` where ``text_response`` is the text of
        the first part of the first candidate.

    Raises:
        IndexError: If ``prompt`` is empty or the response contains no
            candidate / no content part.
    """
    async with semaphore:
        name, prompt_text = list(prompt.items())[0]

        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        model = genai.GenerativeModel(
            model_name="gemini-1.5-pro",
            generation_config=generation_config,
            safety_settings={
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            },
        )

        chat_session = model.start_chat(history=[])
        # Await the async variant: the synchronous send_message() would block the
        # event loop, so the calls gathered by the caller would run one after another.
        answer = await chat_session.send_message_async(prompt_text)
        # Public accessor instead of the private `_result` attribute.
        text_response = answer.candidates[0].content.parts[0].text
        await asyncio.sleep(time_interval)

        return {name: text_response}

In [118]:
# Hand the API key to the google.generativeai client before the requests below are sent.
genai.configure(api_key=GEMINI_API_KEY)
async def prompt_orchestrator(batch_size=5, requests_per_minute=300):
    """Send the module-level ``prompts`` to the LLM batch by batch.

    The prompts of one batch are awaited together with ``asyncio.gather``; the
    next batch starts only after the previous one has finished.

    Args:
        batch_size: Number of prompts gathered at once (default 5).
        requests_per_minute: Used to derive the pause of
            ``60 / requests_per_minute`` seconds that is passed to
            ``prompt_llm`` (default 300, i.e. 0.2 s).

    Returns:
        list[dict]: The ``prompt_llm`` results in the order of ``prompts``.
        An empty ``prompts`` list yields an empty result.
    """
    # Slicing already makes the final batch shorter when len(prompts) is not a
    # multiple of batch_size, and an empty list simply produces no batches.
    batches = [prompts[i:i + batch_size] for i in range(0, len(prompts), batch_size)]

    time_interval = 60 / requests_per_minute
    responses = []

    for batch in batches:
        batch_responses = await asyncio.gather(
            *(prompt_llm(prompt, time_interval) for prompt in batch)
        )
        responses.extend(batch_responses)
    print(responses)
    return responses
# Top-level await works in a notebook cell (IPython); a plain script would need asyncio.run().
results = await prompt_orchestrator()
print(results)


[{'bbc_tech_203.txt': 'entertainment \n'}, {'bbc_sport_121.txt': 'sport \n'}, {'bbc_politics_114.txt': 'politics \n'}, {'bbc_sport_168.txt': 'sport \n'}, {'bbc_business_144.txt': 'business \n'}, {'bbc_business_360.txt': 'business \n'}, {'bbc_entertainment_385.txt': 'entertainment \n'}, {'bbc_entertainment_012.txt': 'entertainment \n'}, {'bbc_sport_272.txt': 'sport \n'}, {'bbc_business_240.txt': 'business \n'}, {'bbc_tech_140.txt': 'sport \n'}]
[{'bbc_tech_203.txt': 'entertainment \n'}, {'bbc_sport_121.txt': 'sport \n'}, {'bbc_politics_114.txt': 'politics \n'}, {'bbc_sport_168.txt': 'sport \n'}, {'bbc_business_144.txt': 'business \n'}, {'bbc_business_360.txt': 'business \n'}, {'bbc_entertainment_385.txt': 'entertainment \n'}, {'bbc_entertainment_012.txt': 'entertainment \n'}, {'bbc_sport_272.txt': 'sport \n'}, {'bbc_business_240.txt': 'business \n'}, {'bbc_tech_140.txt': 'sport \n'}]
