In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/capstone-2')

In [3]:
import json
import requests
from PIL import Image
import pickle
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [9]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import random_split, DataLoader
from transformers import AutoProcessor, AutoTokenizer
import pickle

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

Downloading the Hugging face Instruct150K data

https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K

In [5]:
# Opening JSON file - instruct150k
f = open('llava_instruct_150k.json')

# returns JSON object as
# a dictionary
data = json.load(f)
data[0]

{'id': '000000033471',
 'image': '000000033471.jpg',
 'conversations': [{'from': 'human',
   'value': '<image>\nWhat are the colors of the bus in the image?'},
  {'from': 'gpt', 'value': 'The bus in the image is white and red.'},
  {'from': 'human',
   'value': 'What feature can be seen on the back of the bus?'},
  {'from': 'gpt', 'value': 'The back of the bus features an advertisement.'},
  {'from': 'human',
   'value': 'Is the bus driving down the street or pulled off to the side?'},
  {'from': 'gpt',
   'value': 'The bus is driving down the street, which is crowded with people and other vehicles.'}]}

Preparing Data

In [6]:
# create input pickle file by flattening the data
data_instruct150_flatten = []
r = 0

for a_idx,d in enumerate(data):
    image = d['image']
    image_url = 'http://images.cocodataset.org/train2017/' + image
    conv_iter = iter( d['conversations'])
    for i in conv_iter:
      gpt_ans = next(conv_iter)
      if len(gpt_ans['value']) > 100: # filter out too long answers
          continue
      if i['from'] == 'human' and gpt_ans['from'] == 'gpt':
        data_instruct150_flatten.append((image_url, i['value'].replace('<image>\n','').replace('<image>',''),gpt_ans['value']))

    if a_idx % 50000 == 0:
      print(f"{50000 * r} processed")
      r += 1

0 processed
50000 processed
100000 processed
150000 processed


In [7]:
data_instruct150_flatten[2]

('http://images.cocodataset.org/train2017/000000033471.jpg',
 'Is the bus driving down the street or pulled off to the side?',
 'The bus is driving down the street, which is crowded with people and other vehicles.')

In [10]:
clip_model_name = "wkcn/TinyCLIP-ViT-61M-32-Text-29M-LAION400M"
phi_model_name  = "microsoft/phi-2"
tokenizer  = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# gpt like training dataset
with open('train_tokens.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([['img_url','input','label']])

train_data_temp = []
r = 1
for df in data_instruct150_flatten:
  image_url = df[0]
  image_q   = df[1]
  image_a   = df[2]

  # tokenise
  ques_token = tokenizer(image_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0)
  ans_token  = tokenizer(image_a, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0)

  context_length = len(ques_token)
  combo_q_a = torch.cat( [ques_token,ans_token])

  for al in range(len(ans_token)):
    input = combo_q_a[al : al + context_length].numpy()
    label = combo_q_a[al + 1 : al + context_length + 1].numpy()
    train_data_temp.append([image_url,input,label])
    if len(train_data_temp) >= 1: # write to the file
       print(f"Writing to disk after {r} rows")
       r += 1
       with open('train_tokens.csv', 'a', newline='') as file:
          writer = csv.writer(file)
          writer.writerows(train_data_temp)
       train_data_temp = []

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Writing to disk after 186321000000 rows
Writing to disk after 186321100000 rows
Writing to disk after 186321200000 rows
Writing to disk after 186321300000 rows
Writing to disk after 186321400000 rows
Writing to disk after 186321500000 rows
Writing to disk after 186321600000 rows
Writing to disk after 186321700000 rows
Writing to disk after 186321800000 rows
Writing to disk after 186321900000 rows
Writing to disk after 186322000000 rows
Writing to disk after 186322100000 rows
Writing to disk after 186322200000 rows
Writing to disk after 186322300000 rows
Writing to disk after 186322400000 rows
Writing to disk after 186322500000 rows
Writing to disk after 186322600000 rows
Writing to disk after 186322700000 rows
Writing to disk after 186322800000 rows
Writing to disk after 186322900000 rows
Writing to disk after 186323000000 rows
Writing to disk after 186323100000 rows
Writing to disk after 186323200000 rows
Writing to disk

In [29]:
df_data = pd.read_csv('train_tokens.csv',on_bad_lines='warn')

df_data.shape

Skipping line 414344: expected 3 fields, saw 4



(1868184, 3)

In [26]:
df_data = df_data[:-1000]

In [30]:
df_data_correct = df_data.drop(414344,axis=0)

In [35]:
df_data_correct.iloc[414344]

img_url    http://images.cocodataset.org/train2017/000000...
input      [ 319  262 7480   30 1858  389  734 3024 6844 ...
label      [ 262 7480   30 1858  389  734 3024 6844  319 ...
Name: 414345, dtype: object

In [31]:
df_data_correct.to_csv('final_train_tokens.csv', index= False)

In [32]:
df_data = pd.read_csv('final_train_tokens.csv',on_bad_lines='warn')

In [33]:
df_data.shape

(1868183, 3)

In [None]:
df_data_1 = pd.read_csv('final_train_tokens.csv',on_bad_lines='skip')

df_data_1.shape

Preparing Validation data





In [20]:
data_instruct150_sample_val_flatten = data_instruct150_flatten[-1000:]

In [21]:
with open('sample_val_data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([['img_url','q','a']])

with open('sample_val_data.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(data_instruct150_sample_val_flatten)

In [22]:
df_val_data = pd.read_csv('sample_val_data.csv')

df_val_data.shape

(1000, 3)

In [23]:
df_val_data.head(2)

Unnamed: 0,img_url,q,a
0,http://images.cocodataset.org/train2017/000000...,What are the colors of the dog in the image?,The dog in the image has black and brown fur.
1,http://images.cocodataset.org/train2017/000000...,What animals are present in the image?\n,The image shows a group of black and brown she...
