In [1]:
import pandas as pd 

from googletrans import Translator
translator = Translator()
from langdetect import detect

import nltk
import unidecode

from tqdm.notebook import tqdm
import os
import requests

In [2]:
def clean_text(s):
    return s.str.lower().str.replace('[','').str.replace(']','').str.replace('\'','').apply(lambda x: unidecode.unidecode(x))

In [3]:
def rm_tokens(tokens_to_remove,s):
    words = s.replace(',','').split(sep=' ')
    result = ""
    for w in words:
        if not(w in tokens_to_remove):
            result+=str(w+" ")
    return result

In [26]:
def getContext(query):
    
    url = "https://api.duckduckgo.com/"
    params = {
        "q":query,
        "format":"json",
        "pretty":1
    }
    abstract = ""
    heading = ""
    topic = ""

    try:
        x = requests.get(url, params=params)
        if x.status_code == 200:
            data = x.json()
            abtract = data['AbstractText']
            topic = data['Heading']
            if len(data["RelatedTopics"]) > 0:
                topic = data["RelatedTopics"][0]['Text'] 

    except:
        return "", "", ""

    return abstract,heading,topic

In [5]:
vision = pd.read_csv('../data/external/google_vision.csv',index_col=0)
vision.fillna('',inplace=True)

In [6]:
print(vision.shape)
vision.head(3)

(12140, 7)


Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions
0,img/16395.png,['bethany hamilton'],"['', 'Bethany Hamilton']","['Person', 'Clothing']","['Chin', 'Long hair', 'Photo caption', 'Face',...","['handjobs', 'sold seperatelý', '']",['joy']
1,img/50162.png,['diatonic button accordion'],['Musician'],"['Person', 'Microphone']","['Free reed aerophone', 'Music artist', 'Music...","['roses are black,', 'violets are black', 'EXS...",[]
2,img/70691.png,['watermelon'],"['Fruit', 'Melon', 'Watermelon']",['Watermelon'],"['Cucumber, gourd, and melon family', 'Waterme...","['overdose', '']",[]


In [7]:
# CLEAN
colnames = vision.columns
for col in colnames:
    vision[[col]] = vision[[col]].apply(lambda x: clean_text(x))
vision.head(3)

Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions
0,img/16395.png,bethany hamilton,", bethany hamilton","person, clothing","chin, long hair, photo caption, face, nose, ey...","handjobs, sold seperately,",joy
1,img/50162.png,diatonic button accordion,musician,"person, microphone","free reed aerophone, music artist, musician, p...","roses are black,, violets are black, exs, ever...",
2,img/70691.png,watermelon,"fruit, melon, watermelon",watermelon,"cucumber, gourd, and melon family, watermelon,...","overdose,",


In [8]:
# TRANSLATE
vision.fillna("",inplace=True)
result = []

for i in tqdm(range(0,vision.shape[0]),total=vision.shape[0]):
    s = vision.best_guess_labels[i]
    try:
        code = detect(s)
    except:
        code = 'unk'
    if code != 'en':
        try:
            tr = translator.translate(s,dest='en').text
        except:
            tr = s
    else:
        tr = s
    result.append(tr)

vision['bgl_t'] = result
vision.head(3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12140.0), HTML(value='')))




Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions,bgl_t
0,img/16395.png,bethany hamilton,", bethany hamilton","person, clothing","chin, long hair, photo caption, face, nose, ey...","handjobs, sold seperately,",joy,bethany hamilton
1,img/50162.png,diatonic button accordion,musician,"person, microphone","free reed aerophone, music artist, musician, p...","roses are black,, violets are black, exs, ever...",,diatonic button accordion
2,img/70691.png,watermelon,"fruit, melon, watermelon",watermelon,"cucumber, gourd, and melon family, watermelon,...","overdose,",,watermelon


In [9]:
# COUNT AND CLEAN BGL_T
corpus = nltk.word_tokenize(vision.bgl_t.str.cat(sep=' '))
fd = nltk.FreqDist(corpus)
fd_df = pd.DataFrame(fd.items(), columns=['word', 'frequency']).sort_values('frequency',ascending=False)
print('Unique tokens: ',len(fd))

Unique tokens:  5280


In [10]:
fd_df.head()

Unnamed: 0,word,frequency
28,photo,1008
29,caption,1001
27,funny,628
85,cat,325
24,hitler,308


In [11]:
tokens_to_remove = set([
    'photo',
    'caption',
    'photography',
    'stock',
    'poster',
    'meme',
    '``',
    'photograph',
    'cartoon',
    ])
print('Tok to remove: ', len(tokens_to_remove))

Tok to remove:  9


In [12]:
vision.fillna('',inplace=True)
for i in tqdm(range(vision.shape[0]),total=vision.shape[0]):
    for j in range(vision.shape[1]):   
        vision.iloc[i,j] = rm_tokens(tokens_to_remove,vision.iloc[i,j])
vision.head(3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12140.0), HTML(value='')))




Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions,bgl_t
0,img/16395.png,bethany hamilton,bethany hamilton,person clothing,chin long hair face nose eyebrow hair hairstyl...,handjobs sold seperately,joy,bethany hamilton
1,img/50162.png,diatonic button accordion,musician,person microphone,free reed aerophone music artist musician perf...,roses are black violets are black exs everythi...,,diatonic button accordion
2,img/70691.png,watermelon,fruit melon watermelon,watermelon,cucumber gourd and melon family watermelon cit...,overdose,,watermelon


In [28]:
# GET CONTEXT
abstracts, headings, topics = [], [], []

for x in tqdm(vision.bgl_t.values,total=vision.shape[0]):
    abstract, heading, topic = getContext(x)
    abstracts.append(abstract)
    headings.append(heading)
    topics.append(topic)

vision['abstract'] = abstracts
vision['topic'] = topics
vision['heading'] = headings

vision.head(3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12140.0), HTML(value='')))




Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions,bgl_t,abstract,topic,heading
0,img/16395.png,bethany hamilton,bethany hamilton,person clothing,chin long hair face nose eyebrow hair hairstyl...,handjobs sold seperately,joy,bethany hamilton,,Shark attack victims,
1,img/50162.png,diatonic button accordion,musician,person microphone,free reed aerophone music artist musician perf...,roses are black violets are black exs everythi...,,diatonic button accordion,,Accordion - Accordions are a family of box-sha...,
2,img/70691.png,watermelon,fruit melon watermelon,watermelon,cucumber gourd and melon family watermelon cit...,overdose,,watermelon,,Watermelon A scrambling and trailing vine in t...,


In [29]:
#FASTER RCNN PRETRAINED ON VISUAL GENOME
"""
from lxmert.processing_image import Preprocess
from lxmert.visualizing_image import SingleImageViz
from lxmert.modeling_frcnn import GeneralizedRCNN
from lxmert.utils import Config
"""
import lxmert.utils as utils
import numpy as np 

In [30]:
URL = "https://pbs.twimg.com/media/Ei67zHZWoAAoOBQ?format=jpg&name=small"
OBJ_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/objects_vocab.txt"
ATTR_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/attributes_vocab.txt"

objids = utils.get_data(OBJ_URL)
attrids = utils.get_data(ATTR_URL)

In [31]:
img = []
sentences = []

threshold = 0.5

for npz_file in tqdm(os.listdir('../data/external/frcnn_features')):

    img_id = int(npz_file[:-4])

    features = np.load('../data/external/frcnn_features/'+npz_file,allow_pickle=True)

    sentence = ""

    features = {key:features[key].item() for key in features}['arr_0']

    dup_obj = dict()

    for i, obj in enumerate(features['obj_ids'][0]):
        if features['obj_probs'][0][i] > threshold:
            word = objids[obj]
            att = attrids[features['attr_ids'][0][i]]
            if word not in dup_obj:
                dup_obj[word] = set(att)
            elif att not in dup_obj[word]:
                dup_obj[word].add(att)
                sentence+= att + ' ' + word + " , "
        else:
            continue
    
    img.append(img_id)
    sentences.append(sentence)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12140.0), HTML(value='')))




In [32]:
result = pd.DataFrame({
    'id':img,
    'frcnn':sentences
})

In [39]:
print(result.shape)
result.frcnn[0]

(12140, 2)


'smiling man , white man , brown shirt , blue man , blue eye , long sleeve , blue sleeve , '

In [41]:
vision['id'] = vision.img.apply(lambda x: int(x[4:-5]))

In [42]:
vision = pd.merge(vision,result,on='id',how="left")
print(vision.shape)
vision.head(3)

(12140, 13)


Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions,bgl_t,abstract,topic,heading,id,frcnn
0,img/16395.png,bethany hamilton,bethany hamilton,person clothing,chin long hair face nose eyebrow hair hairstyl...,handjobs sold seperately,joy,bethany hamilton,,Shark attack victims,,16395,"brown eye , blue ball ,"
1,img/50162.png,diatonic button accordion,musician,person microphone,free reed aerophone music artist musician perf...,roses are black violets are black exs everythi...,,diatonic button accordion,,Accordion - Accordions are a family of box-sha...,,50162,
2,img/70691.png,watermelon,fruit melon watermelon,watermelon,cucumber gourd and melon family watermelon cit...,overdose,,watermelon,,Watermelon A scrambling and trailing vine in t...,,70691,"large ear ,"


In [44]:
#SLURS, not sure by now

In [45]:
# DIF TEXT WITH OCR TEXT, not sure by now

"\ntest_unseen = pd.read_json('../data/raw/test_unseen.jsonl', lines=True)\ntrain = pd.read_json('../data/raw/train.jsonl', lines=True)\ndev_unseen = pd.read_json('../data/raw/dev_unseen.jsonl', lines=True)\n\ndata = pd.concat([train,dev_unseen,test_unseen],axis=0)[['id','img','text']]\ndata.shape"

In [48]:
maxl = 0
minl = 0
for x in vision.topic.values:
    maxl = max(maxl,len(x))
    minl = min(minl,len(x))
print(maxl)
print(minl)

357
0


In [51]:
print(vision.topic.values[0:30])

['Shark attack victims'
 'Accordion - Accordions are a family of box-shaped musical instruments of the bellows-driven free-reed aerophone type, colloquially referred to as a squeezebox. A person who plays the accordion is called an accordionist. The concertina and bandoneón are related.'
 'Watermelon A scrambling and trailing vine in the flowering plant family Cucurbitaceae. The species...'
 ''
 'War in Afghanistan (2001–present) following United States invasion of Afghanistan Involvement against the Taliban and al-Qaeda by...'
 '' '' '' ''
 'Beard The hair that grows on the chin, upper lip, cheeks and neck of humans and some non-human animals.'
 '' '' ''
 'Shoulder The human shoulder is made up of three bones: the clavicle, the scapula, and the humerus as well...'
 '' 'Outer space' '' '' '' '' ''
 'Blond A hair color characterized by low levels of the dark pigment eumelanin. The resultant visible...'
 '' ''
 'Neck The part of the body, on many vertebrates, that separates the head from

In [55]:
vision.drop(['abstract','heading'],axis=1,inplace=True)
vision.head()

Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions,bgl_t,topic,id,frcnn
0,img/16395.png,bethany hamilton,bethany hamilton,person clothing,chin long hair face nose eyebrow hair hairstyl...,handjobs sold seperately,joy,bethany hamilton,Shark attack victims,16395,"brown eye , blue ball ,"
1,img/50162.png,diatonic button accordion,musician,person microphone,free reed aerophone music artist musician perf...,roses are black violets are black exs everythi...,,diatonic button accordion,Accordion - Accordions are a family of box-sha...,50162,
2,img/70691.png,watermelon,fruit melon watermelon,watermelon,cucumber gourd and melon family watermelon cit...,overdose,,watermelon,Watermelon A scrambling and trailing vine in t...,70691,"large ear ,"
3,img/37405.png,lavadora para edredones king size,duvet clothes dryer lavanderia self service ma...,washing machine home appliance person,major appliance clothes dryer home appliance l...,introducing fidget spinner for women.,,washing machine for king size duvets,,37405,"white bus , black bus ,"
4,img/73506.png,guerra de afganistan,war in afghanistan afghanistan united states,footwear clothing pants luggage & bags person,organism font human text adaptation,look your dad fucked goats. i capped him.orry.,,war in afghanistan,War in Afghanistan (2001–present) following Un...,73506,"blue wall , standing woman , long woman ,"


In [59]:
vision.topic = vision.topic.apply(lambda x : " ".join(x.split()[:25]))
print(vision.topic[1])
vision.head()

Accordion - Accordions are a family of box-shaped musical instruments of the bellows-driven free-reed aerophone type, colloquially referred to as a squeezebox. A person who


Unnamed: 0,img,best_guess_labels,web_entities,object_labels,image_labels,texts,expressions,bgl_t,topic,id,frcnn
0,img/16395.png,bethany hamilton,bethany hamilton,person clothing,chin long hair face nose eyebrow hair hairstyl...,handjobs sold seperately,joy,bethany hamilton,Shark attack victims,16395,"brown eye , blue ball ,"
1,img/50162.png,diatonic button accordion,musician,person microphone,free reed aerophone music artist musician perf...,roses are black violets are black exs everythi...,,diatonic button accordion,Accordion - Accordions are a family of box-sha...,50162,
2,img/70691.png,watermelon,fruit melon watermelon,watermelon,cucumber gourd and melon family watermelon cit...,overdose,,watermelon,Watermelon A scrambling and trailing vine in t...,70691,"large ear ,"
3,img/37405.png,lavadora para edredones king size,duvet clothes dryer lavanderia self service ma...,washing machine home appliance person,major appliance clothes dryer home appliance l...,introducing fidget spinner for women.,,washing machine for king size duvets,,37405,"white bus , black bus ,"
4,img/73506.png,guerra de afganistan,war in afghanistan afghanistan united states,footwear clothing pants luggage & bags person,organism font human text adaptation,look your dad fucked goats. i capped him.orry.,,war in afghanistan,War in Afghanistan (2001–present) following Un...,73506,"blue wall , standing woman , long woman ,"


In [60]:
vision.fillna('')
vision.to_csv('../data/interim/vision.csv')