### Connect Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Put your path below

In [2]:
!cd '/content/drive/MyDrive/AIISC-Internship/text-based-object-discovery'

In [3]:
PATH = '/content/drive/MyDrive/AIISC-Internship/text-based-object-discovery'

### Install Required Packages

`Stanza`, Stanford NLP Package benefits from `GPU` so enable it under `View Resources > Change runtime type`

In [4]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-332b2fbc-6107-17b8-6477-c27f15772f3c)


In [13]:
%%capture
!pip install transformers
!pip install stanza # for stanford pos tagger
!pip install ftfy regex tqdm
!pip install datasets

### Load Necessary Libraries

We will load the necessary libraries required for generating DAAM outputs for input prompts.

In [14]:
# General
import os
import gc
import json
import time
from tqdm import tqdm

# Plotting
from matplotlib import pyplot as plt

# Data-Handling
import numpy as np
import pandas as pd
from datasets import load_dataset
from pycocotools.coco import COCO

# Model Handling
import torch

# Caption-Processing
from transformers import CLIPTokenizer
from nltk.corpus import stopwords

Download the stopwords for removing stopwords

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
# POS-Tagging
import stanza
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


### Load Data

Below, we load the `LAION-2B` dataset with the URL and captions in streaming mode to prevent downloading over `350 GB` of data. There is only `train` split available.

In [17]:
dataset = load_dataset('laion/laion2B-en', split='train', streaming=True)



For faster processing, I group the data in batches. Choosing `batch_size=10000`.

In [18]:
BATCH_SIZE = 10000 # SAVE_AFTER = BATCH_SIZE i.e. after processing these many prompts we will save the results.

In [19]:
# For processing data in batches
def group_batch(batch):
  return {k: [v] for k, v in batch.items()}
data = dataset.map(group_batch, batch_size=BATCH_SIZE, batched=True, remove_columns=['SAMPLE_ID', 'URL', 'HEIGHT', 'WIDTH', 'LICENSE', 'NSFW', 'similarity'])

We will look at the captions in the Caption Processing part together with the cleaned captions.

### Caption Processing

Cleaning the prompts. I adopt few ways to clean the prompt:
- Lower Case Conversion (Using the tokenizer that comes with the Diffusion Model)
- Tokenization (Using the tokenizer that comes with the Diffusion Model)
- Remove sentences that exceed the Diffusion Model's Allowed Length (No. of Tokens)
- Remove stop words
- Remove non-alphabets
- Keep only nouns
- Lemmatization (to store the object name)
- Discard any lemma/word with non-alphabet characters. (As `LAION` has lots of noise)

NOTE: You can tweak the following parameters in the `stanza.Pipeline` parameter to use varying amount of Memory.
- `pos_batch_size=6500`
- Set the `DIFFUSION_MODEL_PATH` below to use the tokenizer from it to use for tokenization, allows better alignment and prevents error in alignment

In [20]:
DIFFUSION_MODEL_PATH = 'stabilityai/stable-diffusion-2-base' # Set the model path to load the diffusion model from

# loads the CLIPTokenizer with the configuration same as that used in the Diffusion Model
# Using Stanza Tokenizer might generate different tokens compared to the CLIP, leading to misalignment in DAAM - Causing Error
tokenizer = CLIPTokenizer.from_pretrained(DIFFUSION_MODEL_PATH, subfolder="tokenizer")

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

In [21]:
# loads the text processing pipeline
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True, tokenize_pretokenized=True, verbose=True, pos_batch_size=6500)

# treebank-specific POS (XPOS) tags to keep, other POS tagged tokens will not be retained
keep_pos_tags = ['NN', 'NNS', 'NNP', 'NNPS']

# Stopwords
stpwords = set(stopwords.words('english'))

# extract parts of speech
def extract_pos(doc):
  parsed_text = list()
  for sent in doc.sentences:
    parsed_sent = list()
    for wrd in sent.words:
      #extract text and pos
      parsed_sent.append((wrd.text, wrd.xpos))
    parsed_text.append(parsed_sent)
  return parsed_text

# extract lemma
def extract_lemma(doc):
  parsed_text = list()
  for sent in doc.sentences:
    parsed_sent = list()
    for wrd in sent.words:
      # extract text and lemma
      parsed_sent.append((wrd.text, wrd.lemma))
    parsed_text.append(parsed_sent)
  return parsed_text

def clean_prompt(sentences):
  # convert the sentences to lower case and tokenizes the sentences to be passed onto Stanza for POS Tagging
  sentences_lc_tokenized = tokenizer.batch_decode([[word for word in sent[1:-1]] for sent in tokenizer(sentences)['input_ids'] if len(sent) <= tokenizer.model_max_length])

  # stanza accepts only a single string instead of list of strings. So, we have set the tokenize_no_ssplit=True and have to join each sentence with double newline
  sentence_string = "\n\n".join(sentences_lc_tokenized)

  # tokenizes, lemmatizes and pos tags the prompt
  with torch.no_grad():
    processed_prompt = nlp(sentence_string)
  
  # extracts pos tags from the processed_prompt
  pos_tagged_prompt = extract_pos(processed_prompt)

  # lemmatized text
  lemmatized_prompt = extract_lemma(processed_prompt)

  del processed_prompt

  # keep only the noun words, removes stopwords
  fin_prompt = [[word for word, pos_tag in sent if word is not None and ((pos_tag in keep_pos_tags) and (word not in stpwords) and (word.isalpha()))] for sent in pos_tagged_prompt]
  obj_prompt = [[word_lemma[1] for word_pos, word_lemma in zip(sent_pos, sent_lemma) if (word_lemma[0] is not None and word_lemma[1] is not None) and ((word_pos[1] in keep_pos_tags) and ((word_lemma[0] not in stpwords) or (word_lemma[1] not in stpwords)) and word_lemma[0].isalpha() and word_lemma[1].isalpha())] for sent_pos, sent_lemma in zip(pos_tagged_prompt, lemmatized_prompt)]
  
  del pos_tagged_prompt, lemmatized_prompt
  
  return sentences_lc_tokenized, fin_prompt, obj_prompt

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

INFO:stanza:Use device: gpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


An example is shown below for the application of `clean_prompt`.

In [22]:
# Takes quite a bit of time with large batch_size.
t0 = time.time()
examples = next(iter(data))['TEXT']
examples, cleaned, objs = clean_prompt(examples)
t1 = time.time()
pd.DataFrame({
    'tokenized prompts': examples,
    'cleaned prompt': cleaned,
    'objects detected': objs
})

Token indices sequence length is longer than the specified maximum sequence length for this model (162 > 77). Running this sequence through the model will result in indexing errors


Unnamed: 0,original prompts,cleaned prompt,objects detected
0,"blue beach umbrellas, point of rocks, crescent...","[beach, point, crescent, siesta, key, spiral, ...","[beach, point, crescent, siesta, key, spiral, ..."
1,bmw - m 2 - m - performance - dekor - long - b...,"[bmw, performance, dekor, beach]","[bmw, performance, dekor, beach]"
2,becoming more than a good bible study girl : l...,"[bible, study, girl, faith, bible, class, lysa...","[bible, study, girl, faith, bible, class, lysa..."
3,""" dynabrade 5 2 6 3 2 4 - 1 / 2 """" dia. right ...","[dynabrade, angle, center, wheel, grinder]","[dynabrade, angle, center, wheel, grinder]"
4,manette xbox one,"[manette, xbox]","[manette, xbox]"
...,...,...,...
9909,sandbags at the flood closeup photo - stock photo,"[sandbags, flood, closeup, photo, stock, photo]","[sandbag, flood, closeup, photo, stock, photo]"
9910,"jeff bezos to step down as amazon ceo, andy ja...","[jeff, bezos, amazon, andy, jassy, charge]","[jeff, bezo, amazon, andy, jassy, charge]"
9911,custom birthday message cookies,"[birthday, message, cookies]","[birthday, message, cookie]"
9912,swedish alphabet with pictures - learn swedish...,"[alphabet, pictures, bilder, learning, kids, a...","[alphabet, picture, bilder, learning, kid, alp..."


Let's see how much time one batch takes

In [25]:
print(f'Time taken to process and clean 1 batch containing {BATCH_SIZE} prompts:{t1-t0} secs')

Time taken to process and clean 1 batch containing 10000 prompts:50.900819301605225 secs


Cleaning unused objects.

In [26]:
del examples
del cleaned
del objs

NameError: ignored

We start with the training split

The following code removes any `LAION` folder, if present and creates `LAION` folder to store further results.

In [28]:
import shutil # Removes directory if already present! CAREFUL!!!!!!!!!!!!!!!!!!
if os.path.exists(os.path.join(PATH, 'LAION')):
  shutil.rmtree(os.path.join(PATH, 'LAION'))
os.mkdir(os.path.join(PATH, 'LAION'))

In [29]:
print('Starting...')
print('Captions to be processed: ~2320000000')
print('Number of splits: ~232000')

i = 0
for batch in tqdm(data):
  try:
    # Stores the current processed batch
    caption_data_train_file = {'annotations':[]} # For storing results

    # Reset Already occupied Memory and Cache
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_max_memory_cached()
    torch.cuda.empty_cache()

    print()
    print(f'Subset No. {i+1}')
    curr_split_data = batch['TEXT'] # Current split of data
    print('Processing captions...')

    # start processing the train captions subset
    try: # The chances of an error here lies in this portion only
      processed_train = clean_prompt(curr_split_data) 
    except Exception as e:
      print()
      print(f'Encountered Error: {e} in Subset No. {i+1}')
      print('Skipping...')
      continue

    print()
    print(f'Updating captions...')
    # Processing each prompt and updating annotation file for train set
    curr_split_data, cleaned_prompts, object_prompts = processed_train
    update_data = [{'caption': prompt} for prompt in curr_split_data]

    # Garbage Collection
    del curr_split_data, processed_train
    gc.collect()

    for idx, prompt in enumerate(zip(cleaned_prompts, object_prompts)):
      cleaned, objects = prompt # Process prompt
      # update files and object list
      update_data[idx]['cleaned'] = cleaned
      update_data[idx]['objects'] = objects
    
    del cleaned,objects, cleaned_prompts, object_prompts 

    # Display Some Info
    print()
    print()
    print('***INFO***')
    print('Captions Processed:', BATCH_SIZE * (i+1))

    caption_data_train_file['annotations'] = update_data # updating the data for saving
    print('Saving...', end='')

    del update_data

    # Save the processed captions data so far
    with open(os.path.join(PATH, f'LAION/train-captions-processed-{i}.json'), 'w') as outfile: # Save Results in json
      outfile.write(json.dumps(caption_data_train_file, indent=4))

    del caption_data_train_file

    print('Saved.')
    i += 1
  except KeyboardInterrupt:
    print('Interrupted...')
    print(f'Saving... Current Subset No. {i+1}')
    # Save the processed captions data so far
    with open(os.path.join(PATH, f'LAION/train-captions-processed-{i}.json'), 'w') as outfile: # Save Results in json
      outfile.write(json.dumps(caption_data_train_file, indent=4))
    break
    
print('Done!')

Starting...
Captions to be processed: ~2320000000
Number of splits: ~232000


0it [00:00, ?it/s]


Subset No. 1
Processing captions...

Updating captions...


1it [00:45, 45.74s/it]



***INFO***
Captions Processed: 10000
Saving...Saved.

Subset No. 2
Processing captions...


1it [00:58, 58.18s/it]

Interrupted...
Saving... Current Subset No. 2
Done!



