In [None]:
import numpy as np
import pandas as pd
import glob
import urllib
from urllib import request
from matplotlib import pyplot as plt
import requests
import os
import cv2
from google.colab import drive
from collections import Counter
from IPython.display import display, Image, SVG, Math, YouTubeVideo
from tqdm.notebook import tqdm
from google.colab.patches import cv2_imshow

Connect to Google Drive and change the working directory to our shared project directory.

In [None]:
# Mount Google Drive inside the Colab runtime, then make the shared project
# folder the working directory so relative paths resolve against it.
DRIVE_MOUNT_POINT = '/content/drive/'
PROJECT_DIR = "/content/drive/My Drive/ITC/project2/"

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Load the dataset (the images themselves are not downloaded yet; only their URLs are loaded).

In [None]:
# Read every file matching '<document>.tsv*' in `path` and stack the pieces
# into a single DataFrame per document, stored in `datasets[document]`.
path = './'
documents = ['photos', 'keywords']
datasets = {}

for doc in documents:
    # glob returns matches in arbitrary order; sorting keeps the row order of
    # the concatenated frame reproducible between runs and machines.
    files = sorted(glob.glob(path + doc + ".tsv*"))
    if not files:
        # Without this guard pd.concat fails with an opaque
        # "No objects to concatenate" that does not say which file is missing.
        raise FileNotFoundError(
            f"No files matching '{path}{doc}.tsv*' found in {os.getcwd()}"
        )

    # Tab-separated files whose first row holds the column names.
    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]

    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [None]:
# Keep only the first three columns of the photos table, report its
# (rows, columns) shape and preview the first rows.
photos_df = datasets['photos'].iloc[:,:3]
print(photos_df.shape)
photos_df.head()

In [None]:
# Keep only the first three columns of the keywords table, report its
# (rows, columns) shape and display the frame.
key_words_df = datasets['keywords'].iloc[:,:3]
print(key_words_df.shape)
key_words_df

In [None]:
# Keep only the keyword rows whose ai_service_1_confidence is above 99,
# then show how many rows survive the filter.
confident_keywords = key_words_df.loc[key_words_df['ai_service_1_confidence'].gt(99)]
confident_keywords.shape

In [None]:
# Display the keyword rows that passed the confidence filter (> 99).
confident_keywords

In [None]:
# Join each confident keyword row with the full photos table on photo_id
# (pandas' default inner join), so every keyword row carries its photo's columns.
merged_photos_links_and_confident_keywords = pd.merge(
    confident_keywords,
    datasets['photos'],
    on='photo_id',
)
merged_photos_links_and_confident_keywords

In [None]:
# Reduce the merged table to the two columns needed from here on:
# the keyword and the URL of the photo it tags.
tagged_links = merged_photos_links_and_confident_keywords.loc[
    :, ['keyword', 'photo_image_url']
]
tagged_links

In [None]:
# Count the distinct values per column: how many different keywords and how
# many different image URLs appear in tagged_links.
tagged_links.nunique()

In [None]:
# Count how often each keyword occurs and keep the ten most frequent ones
# as (keyword, count) pairs; key_words is a view of just the keywords.
keyword_counts = Counter(tagged_links['keyword'])
most_tagged_words = keyword_counts.most_common(10)
key_words = {word: count for word, count in most_tagged_words}.keys()

In [None]:
# Materialise the keywords as a plain list.
key_words = [*key_words]

In [None]:
# Keep only the (keyword, url) rows whose keyword is one of the selected
# most common keywords.
is_common_keyword = tagged_links['keyword'].isin(key_words)
commons_df = tagged_links.loc[is_common_keyword]
commons_df

In [None]:
# One-hot encode the keyword column (one 'keyword_<name>' column per keyword),
# then add the indicator rows up per image URL so that each URL ends up as a
# single row describing which of the common keywords tag it.
keyword_dummies = pd.get_dummies(commons_df, prefix=['keyword'], columns=['keyword'])
dummies_per_url_df = keyword_dummies.groupby(['photo_image_url']).sum()
dummies_per_url_df

In [None]:
# The groupby left photo_image_url as the index; turn it back into a regular
# column. The guard makes the cell safe to re-run: an unconditional second
# reset_index() would insert a spurious 'index' column at position 0 and
# shift every positional column slice taken from this frame afterwards.
if dummies_per_url_df.index.name == 'photo_image_url':
    dummies_per_url_df = dummies_per_url_df.reset_index()

In [None]:
# Preview the first rows of the per-URL keyword indicator table.
dummies_per_url_df.head()

In [None]:
def image_url_to_numpy_array_urllib(url, format=None):
    """Download an image and return it as a 224x224 colour array.

    Parameters
    ----------
    url : str
        Address of the image; handed to ``urllib.request.urlopen``.
    format : str, optional
        Pass ``'BGR'`` to get the array in OpenCV's BGR channel order.
        Any other value (including the default ``None``) yields RGB.

    Returns
    -------
    numpy.ndarray or str
        The image resized to 224x224, or the string
        ``'Image ratio is too rectangular'`` when one side of the decoded
        image is more than twice as long as the other.

    Raises
    ------
    ValueError
        If the response body is empty or cannot be decoded as an image.

    Errors raised by ``urllib.request.urlopen`` itself are not caught here.
    """
    # Use the response as a context manager so the connection is always
    # closed, even when reading fails.
    with urllib.request.urlopen(url) as resp:
        payload = resp.read()
    if not payload:
        raise ValueError(f'Empty response body for {url}')

    # Expose the raw bytes as a 1-D uint8 array, the input cv2.imdecode expects.
    encoded = np.asarray(bytearray(payload), dtype="uint8")
    # Decode the compressed bytes into a colour image in BGR channel order.
    image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    # imdecode reports undecodable data by returning None instead of raising;
    # fail with a clear message rather than an AttributeError on `.shape`.
    if image is None:
        raise ValueError(f'Could not decode an image from {url}')

    # Filter out images whose aspect ratio is more extreme than 2:1.
    height, width = image.shape[:2]
    if height > 2 * width or width > 2 * height:
      return 'Image ratio is too rectangular'

    image = cv2.resize(image, (224, 224))
    if format == 'BGR':
        # Caller asked for OpenCV's native channel order.
        return image
    # Default: convert from BGR to RGB.
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [None]:
# Download the images in fixed-size chunks and pickle each chunk's keyword
# labels (y<i>.pkl) and image arrays (X<i>.pkl) separately, so that no single
# file has to hold every image.
CHUNK_SIZE = 1000
N_CHUNKS = 13  # NOTE(review): presumably chosen to cover all rows of dummies_per_url_df - confirm


def _download_or_none(url):
  """Fetch one image; report and return None if it cannot be downloaded or decoded."""
  try:
    return image_url_to_numpy_array_urllib(url)
  except (OSError, ValueError, AttributeError, cv2.error) as exc:
    # OSError covers urllib's URLError/HTTPError; the others cover payloads
    # that OpenCV cannot decode. One bad URL must not abort a whole chunk.
    print(f'Skipping {url}: {exc!r}')
    return None


# Registering progress_apply once is enough; it does not belong in the loop.
tqdm.pandas()

for i in range(N_CHUNKS):
  test = dummies_per_url_df.iloc[(CHUNK_SIZE*i):(CHUNK_SIZE*(i+1))].copy()
  test['numpy_array_images'] = test['photo_image_url'].progress_apply(_download_or_none)
  # Keep only rows that hold a real image. This drops the
  # 'Image ratio is too rectangular' sentinel and failed downloads without
  # comparing ndarray cells against a string, which is ambiguous (and can
  # raise) on recent NumPy versions.
  has_image = test['numpy_array_images'].map(lambda value: isinstance(value, np.ndarray)).astype(bool)
  test = test[has_image]
  # Labels: every column except the URL and the image array.
  test.drop(columns=['photo_image_url', 'numpy_array_images']).to_pickle(f'./y{i+1}.pkl')
  test['numpy_array_images'].to_pickle(f'./X{i+1}.pkl')