# Notebook pour effectuer les résumés

In [1]:
# Common libraries
import pandas as pd
import numpy as np
from IPython.display import Markdown
import requests

import asyncio
import async_timeout

import getpass
import os

from tqdm import tqdm
# Enable tqdm's pandas integration on the `tqdm` class imported just above.
tqdm.pandas()

# NOTE: this rebinds the name `tqdm`; from here on `tqdm` refers to tqdm.asyncio.tqdm,
# which is the one summarize() uses for its progress bar.
from tqdm.asyncio import tqdm
# Module-level lock that summarize() acquires around its DataFrame writes.
lock = asyncio.Lock()
import time

# LangChain and LLM clients
import json
from langchain_community.chat_models import ChatOllama
from langchain.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import StrOutputParser,JsonOutputParser
from langchain_core.messages import HumanMessage, SystemMessage,AIMessage
from langchain_core.prompt_values import StringPromptValue

from langchain_openai import ChatOpenAI

## Prompts utilisés

In [2]:
# Example JSON payload and its serialized form.
# NOTE(review): neither name is referenced anywhere else in the visible notebook.
example_json={"topic":"internal politic","explanation":"It's about Trump."}
example_json_string=json.dumps(example_json)

# Minimal prompt: asks for a summary of {document} with no further guidance.
short_prompt = PromptTemplate(
        template="""
        Please summarize the following text:
        {document}
        """,
        input_variables=["document"],
    )
# Detailed prompt: asks for a short, clear summary formatted as markdown bullet points.
elaborated_prompt_random = PromptTemplate(
        template="""
        Read the following text and give a short, clear summary of the main facts or ideas about it.
        Use markdown style and bulletpoints.

        Text to summarize:
        {document}
        """,
        input_variables=["document"],
    )
# Same as the detailed prompt, with an additional {topic} hint.
# NOTE(review): this template is not registered in `prompts` below, and the visible
# callers only supply the `document` variable, so it is currently unused.
elaborated_prompt_topic = PromptTemplate(
        template="""
        Read the following text and give a short, clear summary of the main facts or ideas about it.
        Consider this text is related to the topic : {topic}
        Use markdown style and bulletpoints.

        Text to summarize:
        {document}
        """,
        input_variables=["document","topic"],
    )

# Lookup table used by summarize(): keys are the lower-cased `prompt_type` value of each row.
prompts={"short":short_prompt,"elaborate":elaborated_prompt_random}

## Fonctions utilisées

Fonction utilisée pour initialiser la clé API OpenAI

In [None]:
def initiateChatGPT(forceReset=False):
    """Make sure ``OPENAI_API_KEY`` is available in the process environment.

    Args:
        forceReset: when true, any key currently stored in the environment is
            discarded first, so the user is always prompted again.

    Returns:
        None. The key is stored in ``os.environ["OPENAI_API_KEY"]``.

    Side effects:
        Prints whether a key was removed (only when ``forceReset`` is true) and,
        if no non-empty key remains, asks for one interactively via ``getpass``
        so the value is not echoed.
    """
    key_name = "OPENAI_API_KEY"

    if forceReset:
        # Drop the stored key, reporting whether there was one to drop.
        if key_name in os.environ:
            del os.environ[key_name]
            print("API key removed from environment.")
        else:
            print("API key was not set.")

    # A non-empty key is already present: nothing left to do.
    if os.environ.get(key_name):
        return

    os.environ[key_name] = getpass.getpass("Enter your OpenAI API key: ")

Fonction utilisée pour calculer le prix d'utilisation OpenAI

In [None]:
def gptPrice(response, input_token_price_per_million=0.5, output_token_price_per_million=1.5):
    """Compute the cost of one chat completion from its token usage.

    Args:
        response: object exposing a ``usage_metadata`` mapping with the keys
            ``'input_tokens'`` and ``'output_tokens'``.
        input_token_price_per_million: price charged for one million input tokens.
            Defaults to 0.5, the value that used to be hard-coded here
            (presumably USD for gpt-3.5-turbo -- confirm against current pricing).
        output_token_price_per_million: price charged for one million output tokens.
            Defaults to 1.5, the value that used to be hard-coded here.

    Returns:
        Total price (input + output) as a float, in the same unit as the
        per-million prices.

    Raises:
        KeyError: if ``usage_metadata`` lacks one of the two token counts.
    """
    tokens_per_million = 1e6
    usage = response.usage_metadata

    # Convert the per-million prices to per-token prices before multiplying,
    # which keeps the arithmetic identical to the previous implementation.
    input_price = usage['input_tokens'] * (input_token_price_per_million / tokens_per_million)
    output_price = usage['output_tokens'] * (output_token_price_per_million / tokens_per_million)
    return input_price + output_price

Fonction utilisée pour effectuer un appel asynchrone à un LLM, avec gestion du timeout

In [6]:
async def call_llm(chain,doc,timeout=200):
    """Invoke ``chain`` asynchronously on one document, bounded by a timeout.

    Args:
        chain: object exposing an awaitable ``ainvoke(dict)`` method.
        doc: text passed to the chain under the ``"document"`` key.
        timeout: maximum number of seconds to wait for the answer
            (``None`` waits indefinitely).

    Returns:
        The chain's response on success. On failure a sentinel string is
        returned instead of raising, so the caller can move on to the next item:
        ``"Error timeout"`` when the timeout elapsed, or
        ``"Error processing request"`` for any other exception.
    """
    try:
        # asyncio.wait_for cancels the pending call and raises
        # asyncio.TimeoutError once `timeout` seconds have elapsed; it is the
        # standard-library equivalent of the async_timeout context manager.
        return await asyncio.wait_for(chain.ainvoke({"document": doc}), timeout=timeout)
    except asyncio.TimeoutError:
        print("Timeout!")
        # Giving up after the deadline avoids waiting on a model that keeps
        # generating; the summary becomes "Error timeout" and processing continues.
        return "Error timeout"
    except Exception as e:
        print(f"An error occurred: {e}")
        return "Error processing request"


Fonction permettant d'initier l'appel à un llm pour effectuer un résumé

In [48]:
async def summarize(df,prompts,models=None,start_idx=0,end_idx=None,remote_url=None):
    """Generate a summary for each selected row of ``df`` and record the outcome in place.

    Rows are visited sequentially by position, from ``start_idx`` (inclusive) to
    ``end_idx`` (exclusive). A row is skipped when its model is not selected by
    ``models`` or when its ``done`` cell is already true.

    Columns read: ``model_name``, ``prompt_type``, ``clustered_text``,
    ``token_number`` and, when present, ``done``.
    Columns written: ``summary``, ``error``, ``done``, ``duration``, ``ratio``,
    ``input_tokens`` and, for the ``"gpt3.5"`` model only, ``price``.

    Args:
        df: DataFrame to process; it is modified in place.
            Assumes index labels are unique -- TODO confirm for filtered inputs.
        prompts: mapping from lower-cased prompt type to a prompt object that can
            be piped (``|``) into an LLM client.
        models: ``None`` (every model), a list of model names, or a single name.
        start_idx: first row position to process.
        end_idx: position after the last row to process; ``None`` or any value
            larger than ``len(df)`` means "up to the end".
        remote_url: base URL of a remote Ollama server. When set, calls are made
            synchronously and therefore without timeout protection.

    Returns:
        The same ``df`` object, updated.

    Notes:
        A timeout marks the row as done with the summary ``"Error timeout"``; any
        other failure leaves ``done`` false so the row is retried on a later run.
    """
    # Clamp the upper bound so an over-sized end_idx cannot run past the last row.
    if end_idx is None or end_idx > len(df):
        end_idx = len(df)

    # Passed as `num_ctx` to ChatOllama and as `max_tokens` to ChatOpenAI.
    num_ctx=4096
    # Sampling temperature used for every LLM call.
    temperature=0
    # Seconds to wait for an answer before recording a timeout (not applied to remote_url calls).
    timeout=200

    # Rate-limiting state for the "gpt3.5" model: after `limit_per_minute` calls,
    # wait until 60 seconds have passed since the most recent one.
    last_execution_time = None
    call_count = 0
    limit_per_minute = 3

    # One client per model name, built lazily and reused across rows.
    llm_by_model = {}

    for position in tqdm(range(start_idx, end_idx, 1), desc="Processing posts"):
        # Translate the position into the row label so that every access below
        # targets the same row, whatever the DataFrame index looks like.
        i = df.index[position]
        current_model=df.loc[i,'model_name']

        # Skip rows whose model is not among the requested ones.
        selected = models is None or (isinstance(models, list) and current_model in models) or current_model == models
        if not selected:
            continue

        # Skip rows already completed (a missing or empty `done` cell means "to do").
        if 'done' in df.columns:
            done_flag = df.loc[i, 'done']
            if not pd.isna(done_flag) and done_flag == True:
                continue

        current_prompt_type=df.loc[i,'prompt_type']
        current_doc=df.loc[i,'clustered_text']
        token_Initial=df.loc[i,'token_number']

        # Pick (or build once) the client matching the current model.
        if current_model not in llm_by_model:
            if current_model=="gpt3.5":
                llm_by_model[current_model] = ChatOpenAI(model="gpt-3.5-turbo-0125",streaming=False,temperature=temperature,max_tokens=num_ctx)
            elif remote_url:
                llm_by_model[current_model] = ChatOllama(base_url=remote_url, model=current_model,streaming=False,safe_mode=False,verbose=True,temperature=temperature,num_ctx=num_ctx)
            else:
                llm_by_model[current_model] = ChatOllama(model=current_model,streaming=False,safe_mode=False,verbose=True,temperature=temperature,num_ctx=num_ctx)
        llm = llm_by_model[current_model]

        # Build the call chain: prompt template piped into the model.
        chain = prompts[current_prompt_type.lower()] | llm

        try:
            if current_model=="gpt3.5":
                if call_count >= limit_per_minute:
                    time_elapsed = time.perf_counter() - last_execution_time
                    if time_elapsed < 60:
                        print(f"Rate Limit for gpt3.5 turbo reached => sleep {60 - time_elapsed} seconds")
                        # Non-blocking pause: time.sleep would freeze the whole event loop.
                        await asyncio.sleep(60 - time_elapsed)
                    call_count = 0

                last_execution_time = time.perf_counter()
                call_count += 1

            # Actual LLM call: local Ollama, remote Ollama or OpenAI.
            start_time = time.perf_counter()
            if remote_url:
                # Async calls behave poorly against the remote host, so the call is
                # synchronous and NOT covered by the timeout: keep an eye on progress.
                response = chain.invoke({"document": current_doc})
            else:
                response = await call_llm(chain,current_doc,timeout)
            end_time = time.perf_counter()
            execution_time = end_time - start_time

            async with lock:  # Acquire the lock before modifying the DataFrame
                if isinstance(response, str) and response == "Error timeout":
                    # Timed-out rows are marked done so they are not retried.
                    error_message=f"Timeout error at : {i} for model {current_model} prompt type {current_prompt_type}"
                    print(error_message)
                    df.at[i, 'summary']="Error timeout"
                    df.at[i, 'error'] = error_message
                    df.at[i, 'done'] = True
                elif isinstance(response, str) and response =="Error processing request":
                    # Other failures stay "not done" so a later run can retry them.
                    error_message=f"Processing error at : {i} for model {current_model} prompt type {current_prompt_type}"
                    print(error_message)
                    df.at[i, 'error'] = error_message
                    df.at[i, 'done'] = False
                else:
                    df.at[i, 'summary'] = response.content
                    if current_model!="gpt3.5":
                        # Ollama metadata: total_duration is divided by 1e9 to store seconds.
                        df.at[i, 'duration'] = response.response_metadata['total_duration']/1e9
                        df.at[i, 'ratio'] = response.response_metadata['eval_count']/token_Initial*100
                        df.at[i, 'input_tokens'] = response.response_metadata['prompt_eval_count']
                    else:
                        # OpenAI path: wall-clock duration measured around the call.
                        df.at[i, 'duration'] = execution_time
                        df.at[i, 'ratio'] = response.response_metadata['token_usage']['completion_tokens']/token_Initial*100
                        df.at[i, 'input_tokens'] = response.usage_metadata['input_tokens']
                        df.at[i, 'price'] = gptPrice(response)
                    df.at[i, 'done'] = True
                    df.at[i, 'error'] = np.nan
        except Exception as e:
            # Record the failure on the row and carry on with the next one.
            error_message = f"Error processing rows {i} : {e}"
            print(error_message)
            async with lock:
                df.at[i, 'error'] = error_message
                df.at[i, 'done'] = False
    return df

## On effectue les résumés

On définit ici les chemins du fichier d'entrée (CSV, Excel ou JSON) et du fichier CSV de sortie

In [None]:
# Input dataset; the loading cell below picks the pandas reader from this path's extension.
input_file_path="../data/processed/preclassified_clusters_V2.xlsx"
# Destination CSV written by the last cell of the notebook.
output_file_path="../data/processed/temps/df_with_summaries.csv"

On récupère le dataframe

In [3]:
# Pick the pandas reader matching the (case-insensitive) extension of the input path.
_, file_extension = os.path.splitext(input_file_path)
file_extension = file_extension.lower()

# Supported extensions and the pandas loader used for each of them.
readers_by_extension = {
    '.csv': pd.read_csv,
    '.xlsx': pd.read_excel,
    '.xls': pd.read_excel,
    '.json': pd.read_json,
}

# Fail early on anything that is not a known tabular format.
if file_extension not in readers_by_extension:
    raise ValueError(f"Unsupported file extension: {file_extension}")

df = readers_by_extension[file_extension](input_file_path)

On définit ici l'URL de la machine distante qui héberge le LLM (optionnel)

In [4]:
# pod_id="nq1t0ojmvfh9th"
# remote_url=f"https://{pod_id}-11434.proxy.runpod.net/"

On initialise la clé API OpenAI si elle n'est pas déjà définie dans l'environnement

In [None]:
# Ask for the OpenAI API key only when none (or an empty one) is set in the environment.
if not os.environ.get("OPENAI_API_KEY"):
        initiateChatGPT(True)

On définit la liste des modèles que l'on souhaite utiliser pour générer les résumés

In [5]:

# Only rows whose `model_name` appears in this list are summarized; summarize() routes
# "gpt3.5" to ChatOpenAI and every other name to ChatOllama.
use_this_models=["llama3.1","mistral","qwen2","gemma2","phi3","gpt3.5"]


On effectue les résumés

In [54]:
# Run the summaries for loop indices 0 to 2399 with local models (remote_url=None);
# summarize() updates the DataFrame in place and returns it.
df = await summarize(df,prompts=prompts,models=use_this_models,start_idx=0,end_idx=2400,remote_url=None)

go


Processing posts:   6%|▌         | 137/2400 [00:20<05:30,  6.84it/s]

Timeout!
Timeout error at : 136 for model phi3 prompt type Elaborate


Processing posts:  11%|█▏        | 275/2400 [00:46<06:45,  5.25it/s]

Timeout!
Timeout error at : 274 for model phi3 prompt type Short


Processing posts:  22%|██▏       | 533/2400 [01:06<03:34,  8.70it/s]

Timeout!
Timeout error at : 532 for model phi3 prompt type Elaborate


Processing posts:  31%|███       | 737/2400 [01:26<02:59,  9.29it/s]

Timeout!
Timeout error at : 736 for model phi3 prompt type Elaborate


Processing posts:  36%|███▌      | 857/2400 [01:46<03:10,  8.11it/s]

Timeout!
Timeout error at : 856 for model phi3 prompt type Elaborate


Processing posts:  36%|███▌      | 863/2400 [02:06<04:41,  5.46it/s]

Timeout!
Timeout error at : 862 for model phi3 prompt type Short


Processing posts:  37%|███▋      | 893/2400 [02:26<06:01,  4.17it/s]

Timeout!
Timeout error at : 892 for model phi3 prompt type Elaborate


Processing posts:  50%|█████     | 1205/2400 [03:02<02:51,  6.96it/s]

Timeout!
Timeout error at : 1204 for model phi3 prompt type Elaborate


Processing posts:  52%|█████▏    | 1241/2400 [03:22<03:37,  5.32it/s]

Timeout!
Timeout error at : 1240 for model phi3 prompt type Elaborate


Processing posts:  55%|█████▌    | 1324/2400 [03:42<03:37,  4.95it/s]

Timeout!
Timeout error at : 1323 for model mistral prompt type Elaborate


Processing posts:  57%|█████▋    | 1361/2400 [04:02<04:19,  4.00it/s]

Timeout!
Timeout error at : 1360 for model phi3 prompt type Elaborate


Processing posts:  59%|█████▉    | 1426/2400 [04:22<04:18,  3.77it/s]

Timeout!
Timeout error at : 1425 for model mistral prompt type Short


Processing posts:  60%|██████    | 1445/2400 [04:42<05:28,  2.91it/s]

Timeout!
Timeout error at : 1444 for model phi3 prompt type Elaborate


Processing posts:  61%|██████    | 1463/2400 [05:02<06:47,  2.30it/s]

Timeout!
Timeout error at : 1462 for model phi3 prompt type Short


Processing posts:  65%|██████▍   | 1553/2400 [05:22<04:45,  2.96it/s]

Timeout!
Timeout error at : 1552 for model phi3 prompt type Elaborate


Processing posts:  68%|██████▊   | 1625/2400 [05:42<04:05,  3.16it/s]

Timeout!
Timeout error at : 1624 for model phi3 prompt type Elaborate


Processing posts:  68%|██████▊   | 1631/2400 [06:02<05:34,  2.30it/s]

Timeout!
Timeout error at : 1630 for model phi3 prompt type Short


Processing posts:  69%|██████▉   | 1661/2400 [06:22<05:59,  2.06it/s]

Timeout!
Timeout error at : 1660 for model phi3 prompt type Elaborate


Processing posts:  70%|██████▉   | 1672/2400 [06:43<07:33,  1.60it/s]

Timeout!
Timeout error at : 1671 for model mistral prompt type Elaborate


Processing posts:  70%|██████▉   | 1673/2400 [07:02<10:39,  1.14it/s]

Timeout!
Timeout error at : 1672 for model phi3 prompt type Elaborate


Processing posts:  71%|███████   | 1697/2400 [07:22<10:08,  1.16it/s]

Timeout!
Timeout error at : 1696 for model phi3 prompt type Elaborate


Processing posts:  86%|████████▌ | 2057/2400 [08:45<01:56,  2.93it/s]

Timeout!
Timeout error at : 2056 for model phi3 prompt type Elaborate


Processing posts:  88%|████████▊ | 2117/2400 [09:05<01:35,  2.96it/s]

Timeout!
Timeout error at : 2116 for model phi3 prompt type Elaborate


Processing posts:  94%|█████████▎| 2249/2400 [09:42<00:44,  3.41it/s]

Timeout!
Timeout error at : 2248 for model phi3 prompt type Elaborate


Processing posts:  96%|█████████▌| 2309/2400 [10:15<00:35,  2.57it/s]

Timeout!
Timeout error at : 2308 for model phi3 prompt type Elaborate


Processing posts: 100%|██████████| 2400/2400 [10:35<00:00,  3.78it/s]

Timeout!
Timeout error at : 2392 for model phi3 prompt type Elaborate





On enregistre les modifications dans un fichier CSV

In [62]:
# Create the destination folder if needed, so the results of a long run are not
# lost because the output directory is missing.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_file_path,index=False)