# Extract Knowledge from ChatGPT (Azure for example)

### Package Install

In [None]:
!pip install -q openai
!pip install -q salesforce-lavis
!pip install -q easyocr

In [6]:
import pandas as pd
import torch
from tqdm import tqdm

### Read Data

In [2]:
#data=pd.read_pickle('/data/twitter.pkl')
data = pd.DataFrame() #given a sample here
event = [' 2020 Summer Olympics','2014 Hong Kong protests']
image = ['/data/dataset/datakey/images/2014 Hong Kong protests/94073.jpg','/data/dataset/datakey/images/2020 Summer Olympics/29365.jpg']
text = ['Only this side of Lennon Wall left #LennonWallHK',"Jeev Milkha Singh emotionally thanks Neeraj Chopra for fulfilling his Father's wish of getting a #Gold in #Athetics at the 🙏 #Tokyo2020 #Cheer4India #TeamIndia #IND"]
data['event'] = event
data['image'] = image
data['text'] = text

In [12]:
data

Unnamed: 0,event,image,text,caption
0,2020 Summer Olympics,/data/dataset/datakey/images/2014 Hong Kong pr...,Only this side of Lennon Wall left #LennonWallHK,a man and a woman standing in front of a wall ...
1,2014 Hong Kong protests,/data/dataset/datakey/images/2020 Summer Olymp...,Jeev Milkha Singh emotionally thanks Neeraj Ch...,a man sitting on top of a bench holding a medal


### Extract Caption from Image 

In [None]:
from lavis.models import load_model_and_preprocess
from PIL import Image

In [None]:
# setup device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, vis_processors, _ = load_model_and_preprocess(name="blip_caption", model_type="base_coco", is_eval=True, device=device)

In [10]:
BATCH_SIZE = 5 #Using batch
caption_list=[]
def generate_captions(model, vis_processors, images_path_list) -> torch.Tensor():

    global BATCH_SIZE
    pbar = tqdm(total = len(images_path_list)+1)
    while len(images_path_list) != 0:
        if len(images_path_list) > BATCH_SIZE:
            tmp_img_path_list = images_path_list[:BATCH_SIZE]
            images_path_list = images_path_list[BATCH_SIZE:]
            pbar.update(BATCH_SIZE)
        else:
            tmp_img_path_list = images_path_list
            images_path_list = []
        image_list = [Image.open(image_path).convert('RGB') for image_path in tmp_img_path_list]
        image_list = [vis_processors["eval"](image).unsqueeze(0).to(device) for image in image_list]
        batch_image = torch.cat(image_list, dim=0)
        # first generate the object as a part of the prompt
        caption = model.generate({"image": batch_image}) #, "prompt": [prompt for _ in range(batch_image.shape[0])]
        caption_list.extend(caption)
  
    return caption_list

In [None]:
caption_list = generate_captions(model, vis_processors, list(data['image']))
data['caption'] = caption_list

### Extract OCR from Image 

In [None]:
import easyocr

In [None]:
reader = easyocr.Reader(['en'])

In [None]:
ocr_list=[]
for i in tqdm(range(len(data))):
    result = reader.readtext(data['image'].iloc[i],detail=0,paragraph=1, batch_size = 8)#, detail = 0
    ocr_list.append(result)

In [16]:
data['ocr'] = ocr_list
data['ocr'] = data['ocr'].map(lambda x: x[0] if x else '')

In [17]:
data

Unnamed: 0,event,image,text,caption,ocr
0,2020 Summer Olympics,/data/dataset/datakey/images/2014 Hong Kong pr...,Only this side of Lennon Wall left #LennonWallHK,a man and a woman standing in front of a wall ...,Ball nuJnnk CALL;
1,2014 Hong Kong protests,/data/dataset/datakey/images/2020 Summer Olymp...,Jeev Milkha Singh emotionally thanks Neeraj Ch...,a man sitting on top of a bench holding a medal,IVDIA


## Implicit Knowledge Extraction

### 1) Obtaining an API key from OpenAI (Azure https://azure.microsoft.com/en-us/products/ai-services/openai-service)

In [20]:
import openai

In [None]:
openai.api_key = 'xx'
openai.api_base = 'https://YOUR_RESOURCE_NAME.openai.azure.com/' # your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/
openai.api_type = 'azure'
openai.api_version = "2023-08-01-preview" # this may change in the future

### 2) Formulating an appropriate prompt

In [23]:
prompt_list="Context:"+data['text']+'\nCaption:'+data['caption'].iloc[i]+'\nOCR:'+data['ocr'].iloc[i]+'\n'+\
"Question: What's the news event occur?\nAnswer:"
prompt_list

0    Context:Only this side of Lennon Wall left #Le...
1    Context:Jeev Milkha Singh emotionally thanks N...
Name: text, dtype: object

### 3) Extracting generated answers

In [58]:
import nest_asyncio
from aiohttp import ClientSession
import asyncio
from tqdm import tqdm
nest_asyncio.apply()

In [74]:
async def async_completion(prompt,progress_bar):
    progress_bar.update(1)
    try:
        response = await openai.ChatCompletion.acreate(
            engine='xxx',#according to your azure
            messages=[
                {"role": "assistant", "content": prompt},
            ],
            temperature=0,#the result will be same
        )
        output=response['choices'][0]['message']['content']
    except:
        output='Failed' # !!!!api error: should check and retry if contain!!!!!
    return output

In [None]:
# Using asyncio for concurrent requests: speed up the api (should lower than the limitation from OpenAI or Azure)
SEM_LIMIT = 30  # Number of concurrent requests allowed at the same time
RPM_LIMIT = 500  # Requests per minute limit

response_list=[]
progress_bar = tqdm(total=len(data))
sem = asyncio.Semaphore(SEM_LIMIT)
async with ClientSession() as session:
    openai.aiosession.set(session)
    tasks = []
    for i in range(0,len(data)):
        async with sem:
            task = asyncio.ensure_future(async_completion(prompt_list[i], progress_bar))
            tasks.append(task)
            if len(tasks) % SEM_LIMIT == 0:
                response=await asyncio.gather(*tasks)
                response_list.extend(response)
                tasks = []
                await asyncio.sleep(60 / RPM_LIMIT)  # Introducing delays
    if tasks:
        response = await asyncio.gather(*tasks)
        response_list.extend(response)
    await openai.aiosession.get().close()

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.ensure_future(async_completion(prompt_list[i], progress_bar)))

In [90]:
data['answer']=response_list

### 4) Extracting generated explanations

In [79]:
prompt_list="Context:"+data['text']+'\nCaption:'+data['caption'].iloc[i]+'\nOCR:'+data['ocr'].iloc[i]+'\n'+\
"Question: What's the news event occur?\nAnswer:"+data['answer'].iloc[i]+'\nThis is because'

In [None]:
response_list=[]
progress_bar = tqdm(total=len(data))
sem = asyncio.Semaphore(SEM_LIMIT)
async with ClientSession() as session:
    openai.aiosession.set(session)
    tasks = []
    for i in range(0,len(data)):
        async with sem:
            task = asyncio.ensure_future(async_completion(prompt_list[i], progress_bar))
            tasks.append(task)
            if len(tasks) % SEM_LIMIT == 0:
                response=await asyncio.gather(*tasks)
                response_list.extend(response)
                tasks = []
                await asyncio.sleep(60 / RPM_LIMIT)  # Introducing delays
    if tasks:
        response = await asyncio.gather(*tasks)
        response_list.extend(response)
    await openai.aiosession.get().close()

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.ensure_future(async_completion(prompt_list[i], progress_bar)))

In [88]:
data['gpt']=data['answer']+response_list
data

Unnamed: 0,event,image,text,caption,ocr,answer,gpt
0,2020 Summer Olympics,/data/dataset/datakey/images/2014 Hong Kong pr...,Only this side of Lennon Wall left #LennonWallHK,a man and a woman standing in front of a wall ...,Ball nuJnnk CALL;,The news event is about the remaining side of ...,The news event is about the remaining side of ...
1,2014 Hong Kong protests,/data/dataset/datakey/images/2020 Summer Olymp...,Jeev Milkha Singh emotionally thanks Neeraj Ch...,a man sitting on top of a bench holding a medal,IVDIA,The news event is about Jeev Milkha Singh emot...,The news event is about Jeev Milkha Singh emot...


### 5) Utilizing the BERT model for feature extraction

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

In [None]:
def extract_text_features(text_list, max_length=200, batch_size=16):
    encoded_inputs = tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = encoded_inputs['input_ids'].to(device)
    attention_mask = encoded_inputs['attention_mask'].to(device)

    # batch
    num_samples = len(text_list)
    features = []
    for i in tqdm(range(0, num_samples, batch_size)):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]

        # extract feature
        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            last_layer_features = outputs.last_hidden_state
            pooled_output = torch.mean(last_layer_features, dim=1)

        features.extend(pooled_output.cpu().detach().numpy().tolist())

    return features

In [None]:
gpt_feature = extract_text_features(list(data['gpt']))
data['gpt_feature']= gpt_feature

In [None]:
#save data
data.to_pickle('/data/twitter_gpt.pkl')