In [1]:
'''
This script will read in clustered taxi ride data from the clustered_data_{date}.json file and then 
use the OpenAI API to generate a summary of the text where the clustering returned a label of '-1' (i.e. an outlier).

Once the summary is generated, it will be saved to a file called 'clustered_summarized_{date}.json' in the same AWS S3 bucket.

The textual data to be summarized is in the 'traffic', 'weather' and 'news' columns of the dataframe.

The prompt will be created using Langchain and will have the following format:

"
The following information describes conditions relevant to taxi journeys through a single day in Glasgow, Scotland.

News: {df['news'][i]}
Weather: {df['weather'][i]}
Traffic: {df['traffic'][i]}

Summarise the above information in 3 sentences or less.
"

The returned text will then be added to the pandas dataframe as df["summary"] and then saved to the clustered_summarized_{date}.json file in AWS S3.
'''

from utils.extractor import Extractor
from textwrap import dedent
import datetime
import openai
import boto3
import os

# Read the OpenAI API key from the environment. Indexing os.environ raises
# KeyError when OPENAI_API_KEY is unset, which aborts this cell before the
# class below is defined.
openai.api_key = os.environ['OPENAI_API_KEY']

class LLMSummarizer:
    """Summarise outlier taxi rides with the OpenAI chat API and upload the result to S3.

    Rows whose ``label`` equals ``-1`` receive a generated summary of their
    ``news``, ``weather`` and ``traffic`` text; every other row keeps an empty
    ``summary``. The augmented frame is written back to the same bucket as
    ``clustered_summarized_{YYYYMMDD}.json``.
    """

    # Dedented once, *before* any values are substituted, so that multi-line
    # field values cannot change how much indentation gets stripped.
    _PROMPT_TEMPLATE = dedent("""
        The following information describes conditions relevant to taxi journeys through a single day in Glasgow, Scotland.

        News: {news}
        Weather: {weather}
        Traffic: {traffic}

        Summarise the above information in 3 sentences or less.
        """)

    def __init__(self, bucket_name: str, file_name: str) -> None:
        """Remember which S3 bucket and object hold the clustered ride data."""
        self.bucket_name = bucket_name
        self.file_name = file_name

    def summarize(self) -> None:
        """Generate summaries for outlier rows and upload the frame to S3.

        The data is loaded through ``Extractor``; one chat completion is
        requested per row labelled ``-1``. The result is serialised with
        ``orient='records'`` and stored under today's date.
        """
        df = Extractor(self.bucket_name, self.file_name).extract_data()

        # Build the whole column in one pass and assign it directly. The
        # previous ``df.loc[i]['summary'] = ...`` wrote to a temporary copy of
        # the row, so no summary ever reached ``df``; iterating the columns
        # also removes the reliance on a default 0..n-1 index.
        df['summary'] = [
            self.generate_summary(self.format_prompt(news, weather, traffic))
            if label == -1 else ''
            for label, news, weather, traffic in zip(
                df['label'], df['news'], df['weather'], df['traffic']
            )
        ]

        date = datetime.datetime.now().strftime("%Y%m%d")
        boto3.client('s3').put_object(
            Body=df.to_json(orient='records'),
            Bucket=self.bucket_name,
            Key=f"clustered_summarized_{date}.json"
        )

    def format_prompt(self, news: str, weather: str, traffic: str) -> str:
        """Return the summarisation prompt for one ride's contextual text.

        Args:
            news: News text for the ride.
            weather: Weather text for the ride.
            traffic: Traffic text for the ride.

        Returns:
            The prompt, starting and ending with a newline, with no leading
            indentation on the template lines.
        """
        return self._PROMPT_TEMPLATE.format(
            news=news, weather=weather, traffic=traffic
        )

    def generate_summary(
        self,
        prompt: str,
        max_retries: int = 3,
        backoff_seconds: float = 2.0,
    ) -> str:
        """Send ``prompt`` to gpt-3.5-turbo and return the reply text.

        Args:
            prompt: The user message to send.
            max_retries: How many times to retry after a rate-limit error
                before giving up.
            backoff_seconds: Base delay; doubled after every failed attempt.

        Returns:
            The content of the first choice in the response.

        Raises:
            openai.error.RateLimitError: If the API is still rate limited
                after ``max_retries`` retries.
        """
        import time  # local import: only needed for the retry delay

        retries = max(max_retries, 0)
        for attempt in range(retries + 1):
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    temperature=0.3,
                    messages=[{"role": "user", "content": prompt}]
                )
            except openai.error.RateLimitError:
                # The API reports transient overload through this error and
                # asks callers to retry; give up only once retries run out.
                if attempt == retries:
                    raise
                time.sleep(backoff_seconds * 2 ** attempt)
            else:
                return response.choices[0].message['content']

KeyError: 'OPENAI_API_KEY'

In [6]:
from tests.test_config import test_config
# Today's date stamp (YYYYMMDD) selects which clustered-data file to load.
date = f"{datetime.datetime.now():%Y%m%d}"

# Load the clustered rides from the bucket named in test_config.
extractor = Extractor(
    test_config['bucket_name'],
    "clustered_data_{}.json".format(date),
)
df = extractor.extract_data()

# Every row starts with an empty summary; outliers are filled in later cells.
df['summary'] = ''

In [8]:
# Show only the rides that clustering labelled as outliers (label == -1).
df.loc[df['label'] == -1]

Unnamed: 0,ride_dist,ride_time,ride_speed,ride_id,selection_idx,news,weather,traffic,label,summary
85,5.429792,0.355478,15.274618,2023052485,1,It is expected to be a busy shopping day today...,The weather is expected to be sunny and dry ov...,Traffic is expected to be heavy on the M8 moto...,-1,
135,7.279135,0.446765,16.292967,20230524135,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,-1,
151,4.651962,0.317474,14.653052,20230524151,2,Economic conditions are slowly improving for t...,The forecast for the Greater Glasgow Area toda...,Traffic is expected to be normal today in the ...,-1,
174,5.815598,0.395991,14.686175,20230524174,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,-1,
200,9.801099,0.432704,22.650809,20230524200,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,-1,
280,9.607755,0.511916,18.768209,20230524280,2,Economic conditions are slowly improving for t...,The forecast for the Greater Glasgow Area toda...,Traffic is expected to be normal today in the ...,-1,
286,9.680332,0.470108,20.591705,20230524286,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,-1,
370,28.660163,0.813905,35.213149,20230524370,2,Economic conditions are slowly improving for t...,The forecast for the Greater Glasgow Area toda...,Traffic is expected to be normal today in the ...,-1,
371,26.102442,0.844849,30.895986,20230524371,1,It is expected to be a busy shopping day today...,The weather is expected to be sunny and dry ov...,Traffic is expected to be heavy on the M8 moto...,-1,
372,23.208065,0.945982,24.533292,20230524372,2,Economic conditions are slowly improving for t...,The forecast for the Greater Glasgow Area toda...,Traffic is expected to be normal today in the ...,-1,


In [14]:
def format_prompt(news: str, weather: str, traffic: str) -> str:
    """Return the summarisation prompt for one ride's contextual text.

    The template is dedented *before* the values are substituted. Dedenting
    the already-interpolated text (as before) stripped nothing whenever a
    value contained a line break, leaving the prompt fully indented.

    Args:
        news: News text for the ride.
        weather: Weather text for the ride.
        traffic: Traffic text for the ride.

    Returns:
        The prompt, starting and ending with a newline, with no leading
        indentation on the template lines.
    """
    template = dedent("""
        The following information describes conditions relevant to taxi journeys through a single day in Glasgow, Scotland.

        News: {news}
        Weather: {weather}
        Traffic: {traffic}

        Summarise the above information in 3 sentences or less.
        """)
    return template.format(news=news, weather=weather, traffic=traffic)

def generate_summary(prompt: str, max_retries: int = 3, backoff_seconds: float = 2.0) -> str:
    """Send ``prompt`` to gpt-3.5-turbo and return the reply text.

    Args:
        prompt: The user message to send.
        max_retries: How many times to retry after a rate-limit error before
            giving up.
        backoff_seconds: Base delay; doubled after every failed attempt.

    Returns:
        The content of the first choice in the response.

    Raises:
        openai.error.RateLimitError: If the API is still rate limited after
            ``max_retries`` retries.
    """
    import time  # local import: only needed for the retry delay

    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0.3,
                messages=[{"role": "user", "content": prompt}]
            )
        except openai.error.RateLimitError:
            # The API reports transient overload through this error and asks
            # callers to retry; give up only once the retries run out.
            if attempt == retries:
                raise
            time.sleep(backoff_seconds * 2 ** attempt)
        else:
            return response.choices[0].message['content']

# Write the generated summaries back with a single .loc call. The previous
# form, df[mask]['summary'] = ..., assigned into a temporary copy of the
# filtered frame, so df itself was never updated.
outlier_mask = df['label'] == -1
if outlier_mask.any():  # skip the API calls and assignment when there are no outliers
    df.loc[outlier_mask, 'summary'] = df.loc[outlier_mask].apply(
        lambda row: generate_summary(
            format_prompt(row['news'], row['weather'], row['traffic'])
        ),
        axis=1,
    )

RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 3a8aae543e1cffe7498e0d8d3b172f18 in your message.)

In [23]:
# Preview the prompt built for each outlier ride; no API call is made here.
df.loc[df['label'] == -1].apply(
    lambda row: format_prompt(row['news'], row['weather'], row['traffic']),
    axis=1,
)

85     \nThe following information describes conditio...
135    \nThe following information describes conditio...
151    \nThe following information describes conditio...
174    \nThe following information describes conditio...
200    \nThe following information describes conditio...
280    \nThe following information describes conditio...
286    \nThe following information describes conditio...
370    \nThe following information describes conditio...
371    \nThe following information describes conditio...
372    \nThe following information describes conditio...
373    \nThe following information describes conditio...
374    \nThe following information describes conditio...
376    \nThe following information describes conditio...
378    \nThe following information describes conditio...
379    \nThe following information describes conditio...
388    \nThe following information describes conditio...
391    \nThe following information describes conditio...
394    \nThe following informat

In [24]:
# Write a placeholder into the summary of every outlier row, selecting the
# rows and the column in one .loc call so the assignment lands on df itself.
df.loc[df['label'].eq(-1), 'summary'] = 'test summary'

In [25]:
# Display the frame to check that only rows with label == -1 carry the
# placeholder summary set in the previous cell.
df

Unnamed: 0,ride_dist,ride_time,ride_speed,ride_id,selection_idx,news,weather,traffic,label,summary
0,3.216267,0.111048,28.962835,202305240,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,0,
1,1.299590,0.059145,21.973118,202305241,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,0,
2,1.865692,0.076264,24.463596,202305242,2,Economic conditions are slowly improving for t...,The forecast for the Greater Glasgow Area toda...,Traffic is expected to be normal today in the ...,0,
3,4.170645,0.210408,19.821738,202305243,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,0,
4,0.339891,0.010708,31.740434,202305244,2,Economic conditions are slowly improving for t...,The forecast for the Greater Glasgow Area toda...,Traffic is expected to be normal today in the ...,0,
...,...,...,...,...,...,...,...,...,...,...
395,0.793904,0.081639,9.724565,20230524395,1,It is expected to be a busy shopping day today...,The weather is expected to be sunny and dry ov...,Traffic is expected to be heavy on the M8 moto...,0,
396,2.947778,0.235698,12.506576,20230524396,1,It is expected to be a busy shopping day today...,The weather is expected to be sunny and dry ov...,Traffic is expected to be heavy on the M8 moto...,-1,test summary
397,4.128424,0.200634,20.576911,20230524397,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,0,
398,9.338724,1.334156,6.999724,20230524398,0,Reports are that there has been an accident on...,The forecast for the West of Scotland over the...,There is a traffic jam on the M8 motorway near...,-1,test summary
