### Connect Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Put your path below

In [2]:
!cd '/content/drive/MyDrive/AIISC-Internship/text-based-object-discovery'

In [3]:
PATH = '/content/drive/MyDrive/AIISC-Internship/text-based-object-discovery'

### Install Required Packages

`Stanza`, the Stanford NLP package, benefits from a `GPU`, so enable one under `View Resources > Change runtime type`.

In [4]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-e828474e-b398-61fc-5443-1c2fc67b6ddb)


In [5]:
%%capture
!pip install stanza # for stanford pos tagger
!pip install ftfy regex tqdm
!pip install datasets

### Load Necessary Libraries

We will load the necessary libraries required for generating DAAM outputs for input prompts.

In [6]:
# General
import os
import gc
import json
import time
from tqdm import tqdm

# Plotting
from matplotlib import pyplot as plt

# Data-Handling
import numpy as np
import pandas as pd
from datasets import load_dataset
from pycocotools.coco import COCO

# Model Handling
import torch

# Caption-Processing
from nltk.corpus import stopwords

Download the NLTK resources, including the stopword list used to remove stopwords from the captions.

In [7]:
import nltk
# Fetch the NLTK data packages. Of these, the code visible in this notebook
# only reads the 'stopwords' corpus (via `stopwords.words('english')`); the
# remaining packages are presumably kept for other processing steps — TODO confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
# POS-Tagging
import stanza
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


### Load Data

Below, we load the `LAION-2B` dataset with the URL and captions in streaming mode to prevent downloading over `350 GB` of data. There is only `train` split available.

In [9]:
# Open the only available split ('train') in streaming mode so the examples are
# read lazily instead of downloading the whole dataset up front.
dataset = load_dataset('laion/laion2B-en', split='train', streaming=True)

Downloading readme:   0%|          | 0.00/30.0 [00:00<?, ?B/s]



For faster processing, I group the data in batches. Choosing `batch_size=10000`.

In [10]:
BATCH_SIZE = 10000 # SAVE_AFTER = BATCH_SIZE i.e. after processing these many prompts we will save the results.

In [11]:
# For processing data in batches
def group_batch(batch):
  """Wrap every column of `batch` in a one-element list.

  Args:
    batch: mapping from column name to that column's values for one batch.

  Returns:
    A new dict with the same keys where each value is `[original_value]`, so
    that a batched `map` emits a single row holding the whole batch.
  """
  grouped = {}
  for column, values in batch.items():
    grouped[column] = [values]
  return grouped
# Collapse the stream into rows of BATCH_SIZE examples each and drop every
# listed metadata column that the caption processing does not need.
data = dataset.map(
  group_batch,
  batched=True,
  batch_size=BATCH_SIZE,
  remove_columns=['SAMPLE_ID', 'URL', 'HEIGHT', 'WIDTH', 'LICENSE', 'NSFW', 'similarity'],
)

We will look at the captions in the Caption Processing part together with the cleaned captions.

### Caption Processing

The prompts are cleaned with the following steps:
- Lower Case Conversion
- Tokenization
- Remove stop words
- Remove non-alphabets
- Keep only nouns
- Lemmatization (to store the object name)
- Discard any lemma/word with non-alphabet characters. (As `LAION` has lots of noise)

NOTE: You can tweak the following `stanza.Pipeline` parameter to control how much memory is used:
- `pos_batch_size=6500`

In [23]:
# Build the text-processing pipeline (tokenizer, multi-word-token expansion,
# POS tagger and lemmatizer). Sentence splitting is turned off because the
# captions are passed in already separated by blank lines.
nlp = stanza.Pipeline(
  lang='en',
  processors='tokenize,mwt,pos,lemma',
  tokenize_no_ssplit=True,
  pos_batch_size=6500,
  verbose=True,
)

# Treebank-specific (XPOS) noun tags; tokens carrying any other tag are discarded
keep_pos_tags = ['NN', 'NNS', 'NNP', 'NNPS']

# English stopwords, kept as a set for membership tests
stpwords = set(stopwords.words('english'))

# extract parts of speech
def extract_pos(doc):
  """Return the (text, XPOS tag) pairs of every word, grouped by sentence.

  Args:
    doc: processed document exposing `.sentences`, each with `.words` whose
      items have `.text` and `.xpos` attributes.

  Returns:
    A list with one inner list per sentence; each inner list holds
    `(word.text, word.xpos)` tuples in word order.
  """
  return [
    [(word.text, word.xpos) for word in sentence.words]
    for sentence in doc.sentences
  ]

# extract lemma
def extract_lemma(doc):
  """Return the (text, lemma) pairs of every word, grouped by sentence.

  Args:
    doc: processed document exposing `.sentences`, each with `.words` whose
      items have `.text` and `.lemma` attributes.

  Returns:
    A list with one inner list per sentence; each inner list holds
    `(word.text, word.lemma)` tuples in word order.
  """
  return [
    [(word.text, word.lemma) for word in sentence.words]
    for sentence in doc.sentences
  ]

def clean_prompt(sentences):
  """Clean a batch of captions and extract their noun words and noun lemmas.

  Every caption is lower-cased, then tokenized, POS-tagged and lemmatized by
  the module-level Stanza pipeline `nlp`. A word is kept only if its XPOS tag
  is in `keep_pos_tags`, it is purely alphabetic and it is not a stopword.

  Args:
    sentences: list of captions. `None` and empty captions are allowed and
      produce empty results.

  Returns:
    A tuple `(fin_prompt, obj_prompt)` of two lists, each with exactly one
    entry per input caption and in the same order:
      - fin_prompt[i]: the kept words of caption i
      - obj_prompt[i]: the lemmas of the kept words of caption i

  Raises:
    ValueError: if the pipeline does not return exactly one sentence per
      non-empty caption, in which case results could not be matched back to
      their captions.
  """
  # Lower-case and collapse every whitespace run (including newlines) into a
  # single space. Captions are handed to the pipeline separated by blank lines,
  # so a blank line *inside* a caption would be read as a caption boundary.
  normalized = [" ".join(sentence.lower().split()) if sentence else "" for sentence in sentences]

  # One result slot per input caption, so outputs always line up with inputs.
  fin_prompt = [[] for _ in normalized]
  obj_prompt = [[] for _ in normalized]

  # Empty captions are not sent to the pipeline: they would not produce a
  # sentence of their own and would shift every following result by one.
  kept_idx = [idx for idx, text in enumerate(normalized) if text]
  if not kept_idx:
    return fin_prompt, obj_prompt

  # stanza accepts only a single string instead of list of strings. So, we have set the tokenize_no_ssplit=True and have to join each sentence with double newline
  sentence_string = "\n\n".join(normalized[idx] for idx in kept_idx)

  # tokenizes, lemmatizes and pos tags the prompt
  with torch.no_grad():
    processed_prompt = nlp(sentence_string)

  # (text, xpos) and (text, lemma) pairs, one inner list per sentence
  pos_tagged_prompt = extract_pos(processed_prompt)
  lemmatized_prompt = extract_lemma(processed_prompt)

  del processed_prompt

  if len(pos_tagged_prompt) != len(kept_idx):
    raise ValueError(
      f'pipeline returned {len(pos_tagged_prompt)} sentences for {len(kept_idx)} non-empty captions'
    )

  for idx, sent_pos, sent_lemma in zip(kept_idx, pos_tagged_prompt, lemmatized_prompt):
    # keep only the noun words, removes stopwords
    fin_prompt[idx] = [
      word for word, pos_tag in sent_pos
      if word is not None
      and pos_tag in keep_pos_tags
      and word not in stpwords
      and word.isalpha()
    ]
    # same filter on the lemmas; a word is dropped as a stopword only when both
    # its surface form and its lemma are stopwords
    obj_prompt[idx] = [
      lemma for (_, pos_tag), (word, lemma) in zip(sent_pos, sent_lemma)
      if word is not None and lemma is not None
      and pos_tag in keep_pos_tags
      and (word not in stpwords or lemma not in stpwords)
      and word.isalpha() and lemma.isalpha()
    ]

  del pos_tagged_prompt, lemmatized_prompt

  return fin_prompt, obj_prompt

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

INFO:stanza:Use device: gpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


An example is shown below for the application of `clean_prompt`.

In [13]:
# Takes quite a bit of time with large batch_size.
# NOTE: the timed span (t0..t1) covers both pulling the first batch from the
# stream and cleaning it, not the cleaning step alone.
t0 = time.time()
examples = next(iter(data))['TEXT']
cleaned, objs = clean_prompt(examples)
t1 = time.time()
# Side-by-side view of the raw captions, the kept words and their lemmas
pd.DataFrame({
    'original prompts': examples,
    'cleaned prompt': cleaned,
    'objects detected': objs
})

Unnamed: 0,original prompts,cleaned prompt,objects detected
0,"Blue Beach Umbrellas, Point Of Rocks, Crescent...","[beach, umbrellas, point, rocks, crescent, bea...","[beach, umbrella, point, rock, crescent, beach..."
1,BMW-M2-M-Performance-Dekor-Long-Beach-Blue-05,[],[]
2,Becoming More Than a Good Bible Study Girl: Li...,"[bible, study, girl, faith, bible, class, lysa...","[bible, study, girl, faith, bible, class, lysa..."
3,"""Dynabrade 52632 4-1/2"""" Dia. Right Angle Depr...","[dia, angle, center, wheel, grinder]","[dia, angle, center, wheel, grinder]"
4,MANETTE XBOX ONE,"[manette, xbox]","[manette, xbox]"
...,...,...,...
9995,Sandbags at the flood closeup photo - stock photo,"[sandbags, flood, closeup, photo, stock, photo]","[sandbag, flood, closeup, photo, stock, photo]"
9996,"Jeff Bezos to step down as Amazon CEO, Andy Ja...","[jeff, bezos, amazon, ceo, andy, jassy, charge]","[jeff, bezo, amazon, ceo, andy, jassy, charge]"
9997,Custom Birthday Message Cookies,"[birthday, message, cookies]","[birthday, message, cookie]"
9998,Swedish alphabet with pictures - Learn swedish...,"[alphabet, pictures, bilder, learning, letters...","[alphabet, picture, bilder, learning, letter, ..."


Let's see how much time one batch takes

In [14]:
print(f'Time taken to process and clean 1 batch containing {BATCH_SIZE} prompts:{t1-t0} secs')

Time taken to process and clean 1 batch containing 10000 prompts:71.63415956497192


Cleaning unused objects.

In [15]:
# Drop the example batch and its results; they are no longer needed
del examples, cleaned, objs

We start with the training split

The following code removes the `LAION` folder if it already exists (deleting any previously saved results) and then creates a fresh `LAION` folder to store the results.

In [19]:
import shutil # Removes directory if already present! CAREFUL!!!!!!!!!!!!!!!!!!

# Start from an empty output folder: delete any existing one, then recreate it
laion_dir = os.path.join(PATH, 'LAION')
if os.path.exists(laion_dir):
  shutil.rmtree(laion_dir)
os.mkdir(laion_dir)

In [24]:
print('Starting...')
print('Captions to be processed: ~2320000000')
print('Number of splits: ~232000')


def _save_subset(annotations, subset_idx):
  """Write one processed subset to `LAION/train-captions-processed-<subset_idx>.json` under PATH."""
  with open(os.path.join(PATH, f'LAION/train-captions-processed-{subset_idx}.json'), 'w') as outfile: # Save Results in json
    json.dump({'annotations': annotations}, outfile, indent=4)


i = 0                   # index of the next output file (= number of subsets saved so far)
captions_processed = 0  # captions written to disk so far
pending = None          # annotations that are complete but not yet confirmed on disk

# The whole loop sits inside the try so that an interrupt raised while the next
# batch is being fetched from the stream is handled as well.
try:
  for batch in tqdm(data):
    pending = None

    # Reset peak-memory statistics and release cached GPU memory. Guarded so
    # the loop can also run on a runtime without CUDA.
    if torch.cuda.is_available():
      torch.cuda.reset_peak_memory_stats()
      torch.cuda.empty_cache()

    print()
    print(f'Subset No. {i+1}')
    curr_split_data = batch['TEXT'] # Current split of data
    print('Processing captions...')

    # start processing the train captions subset
    try: # The chances of an error here lies in this portion only
      cleaned_prompts, object_prompts = clean_prompt(curr_split_data)
      # Results that do not line up one-to-one with the captions would be
      # attached to the wrong captions, so such a subset is skipped as well.
      if not (len(cleaned_prompts) == len(object_prompts) == len(curr_split_data)):
        raise ValueError(
          f'got {len(cleaned_prompts)} cleaned / {len(object_prompts)} object results '
          f'for {len(curr_split_data)} captions'
        )
    except Exception as e:
      print()
      print(f'Encountered Error: {e} in Subset No. {i+1}')
      print('Skipping...')
      continue

    print()
    print(f'Updating captions...')
    # One annotation record per caption
    update_data = [
      {'caption': caption, 'cleaned': cleaned, 'objects': objects}
      for caption, cleaned, objects in zip(curr_split_data, cleaned_prompts, object_prompts)
    ]

    # Garbage Collection
    del curr_split_data, cleaned_prompts, object_prompts
    gc.collect()

    # Display Some Info
    captions_processed += len(update_data)
    print()
    print()
    print('***INFO***')
    print('Captions Processed:', captions_processed)

    print('Saving...', end='')

    # From here on the subset is complete; if the write below is interrupted,
    # the handler rewrites the file from `pending`.
    pending = update_data
    del update_data
    _save_subset(pending, i)
    pending = None

    print('Saved.')
    i += 1
except KeyboardInterrupt:
  print('Interrupted...')
  if pending is not None:
    print(f'Saving... Current Subset No. {i+1}')
    # Save the processed captions data so far
    _save_subset(pending, i)
  else:
    # Every finished subset is already on disk; the subset in progress has no
    # complete results, so writing it would only produce an empty file.
    print(f'Subset No. {i+1} was not finished; nothing new to save.')

print('Done!')

Starting...
Captions to be processed: ~2320000000
Number of splits: ~232000





Subset No. 1
Processing captions...


1it [00:51, 51.54s/it]

Updating captions...


***INFO***
Captions Processed: 10000
Saving...Saved.

Subset No. 2
Processing captions...


2it [01:41, 50.74s/it]

Updating captions...


***INFO***
Captions Processed: 20000
Saving...Saved.

Subset No. 3
Processing captions...


3it [02:30, 50.03s/it]

Updating captions...


***INFO***
Captions Processed: 30000
Saving...Saved.

Subset No. 4
Processing captions...


4it [03:18, 49.24s/it]

Updating captions...


***INFO***
Captions Processed: 40000
Saving...Saved.

Subset No. 5
Processing captions...


5it [04:09, 49.75s/it]

Updating captions...


***INFO***
Captions Processed: 50000
Saving...Saved.

Subset No. 6
Processing captions...


6it [04:57, 49.14s/it]

Updating captions...


***INFO***
Captions Processed: 60000
Saving...Saved.

Subset No. 7
Processing captions...


7it [05:45, 48.91s/it]

Updating captions...


***INFO***
Captions Processed: 70000
Saving...Saved.

Subset No. 8
Processing captions...


8it [06:35, 49.14s/it]

Updating captions...


***INFO***
Captions Processed: 80000
Saving...Saved.

Subset No. 9
Processing captions...


8it [06:51, 51.49s/it]


KeyboardInterrupt: ignored