In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from os import path
import time 
from datetime import datetime 
import math

import sys
import os
import subprocess
import re
import cv2
import mimetypes
import requests
import re

In [None]:
# Folder layout: everything lives under ../data, with an 'img' sub-folder and
# a 'temp' sub-folder (the latter is where download_image writes its files).
data_dir = path.join('..', 'data')
img_dir, temp_dir = (path.join(data_dir, sub_folder) for sub_folder in ('img', 'temp'))

dataset_file = 'reddit_wsb_art.csv'

# Load the posts and order them by their 'timestamp' column in one chained step.
data = pd.read_csv(path.join(data_dir, dataset_file)).sort_values(by=['timestamp'])

In [None]:
# Matches "http://" or "https://" followed by a run of letters, digits and
# the characters / . _ -
# The match stops at the first character outside that set (for example '?',
# ')', ']' or whitespace), so query strings and Markdown link delimiters are
# not part of the extracted URL.
RE_HTTP = re.compile(r"https?://[/.A-Za-z0-9_-]+")


def detect_urls(text):
    """Return every http(s) URL found in ``text``, in order of appearance.

    ``text`` is coerced with ``str()`` first, so non-string values (such as a
    NaN body) are searched as their string form and normally yield ``[]``.
    """
    return [match.group(0) for match in RE_HTTP.finditer(str(text))]

# Extract the URLs of every post body. Working on the 'body' column directly
# avoids materialising a row object per record (which DataFrame.apply with
# axis=1 does) and still produces a Series when the frame has no rows.
data['body_url'] = data['body'].apply(detect_urls)
data['body_urls_count'] = data['body_url'].apply(len)

# Show only the posts whose body contains at least one URL.
data.loc[data['body_urls_count'] > 0, ['body', 'body_url', 'body_urls_count']]

In [None]:
def is_url_image(url):
    """Tell whether ``url`` looks like an image, judging by its text only.

    The MIME type is guessed with ``mimetypes.guess_type``; nothing is
    downloaded. The result is always a real ``bool``: ``False`` is returned
    when no type can be guessed, rather than ``None``.
    """
    mimetype, _encoding = mimetypes.guess_type(url)
    return mimetype is not None and mimetype.startswith('image/')

In [None]:
def download_image(url, name, timeout=10):
    """Download ``url`` and store the response body as ``name`` inside ``temp_dir``.

    Parameters
    ----------
    url : str
        Address to fetch.
    name : str
        File name to create inside ``temp_dir``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a single
        unresponsive host cannot stall the whole run.

    Returns
    -------
    bool
        ``True`` when the server answered with status 200 and the file was
        written; ``False`` on a request error or any other status code.
    """
    try:
        # The whole body is written in one go, so streaming buys nothing; a
        # plain request also lets `requests` release the connection itself.
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError):
        # Best effort: unreachable hosts and malformed URLs count as a failed
        # download instead of interrupting the caller.
        return False

    if response.status_code != 200:
        return False

    # Make sure the target folder exists before writing into it.
    os.makedirs(temp_dir, exist_ok=True)
    with open(path.join(temp_dir, name), 'wb') as file:
        file.write(response.content)
    return True

In [None]:
def get_name(url):
    """Return the text after the last '/' in ``url`` (all of ``url`` if it has none)."""
    _head, _sep, tail = url.rpartition('/')
    return tail

In [None]:
def correct_reddit_preview_url(url):
    """Return ``url`` with every 'preview.redd.it' replaced by 'i.redd.it'."""
    preview_host = 'preview.redd.it'
    image_host = 'i.redd.it'
    return image_host.join(url.split(preview_host))

In [None]:
from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array, load_img

import numpy as np

from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array

from skimage.color import rgb2hsv
import numpy as np

import math

def get_img_avg_colors(image_path, step=128):
    """Estimate the average hue, saturation and value of an image.

    The image is sampled on a sparse grid (every ``step``-th pixel along both
    axes, starting at the top-left pixel) to keep the computation cheap.

    Hue is an angle, so it is averaged as a circular quantity: each sampled
    hue is placed on the unit circle, the resulting vectors are averaged and
    the direction of the mean vector is reported. Saturation and value are
    plain arithmetic means.

    Parameters
    ----------
    image_path : str
        Path of the image file to analyse.
    step : int, optional
        Distance in pixels between two sampled rows / columns.

    Returns
    -------
    tuple of float
        ``(avg_hue, avg_sat, avg_val)`` with the hue in degrees, normalised
        to the 0-360 range, and saturation / value as fractions between 0 and 1.
    """
    pixels = img_to_array(load_img(image_path))

    # The loaded array holds channel intensities on a 0-255 scale, while
    # rgb2hsv is documented for floating-point RGB in [0, 1]. Without this
    # rescaling the value channel comes out on a different scale than
    # saturation.
    hsv = rgb2hsv(pixels / 255.0)

    # Sparse sampling grid, flattened to one (h, s, v) row per sampled pixel.
    samples = hsv[::step, ::step].reshape(-1, 3)

    # rgb2hsv reports hue as a fraction of a full turn (0..1), not in degrees,
    # so one full turn corresponds to 2*pi radians.
    angles = samples[:, 0] * (2.0 * math.pi)
    mean_x = float(np.cos(angles).mean())
    mean_y = float(np.sin(angles).mean())

    # atan2 yields (-180, 180] degrees; fold negative angles onto 0-360.
    avg_hue = math.degrees(math.atan2(mean_y, mean_x)) % 360.0
    avg_sat = float(samples[:, 1].mean())
    avg_val = float(samples[:, 2].mean())

    return avg_hue, avg_sat, avg_val

In [None]:
from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array

# Build the VGG19 network once, with its default constructor arguments, at
# cell level; get_img_label reads this module-level `model` for every image.
model = VGG19()

def get_img_label(image_path):
    """Classify the image at ``image_path`` with the module-level ``model``.

    The image is loaded at 224x224, wrapped into a one-item batch, passed
    through ``preprocess_input`` and the model, and the predictions are
    decoded. Element 1 of the first decoded candidate is returned
    (presumably the human-readable class name).
    """
    pixels = img_to_array(load_img(image_path, target_size=(224, 224)))
    # Prepend a batch axis of length 1: (rows, cols, channels) -> (1, rows, cols, channels).
    batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
    decoded = decode_predictions(model.predict(batch), top=5)
    # decoded[0] holds the candidates for the only image in the batch.
    best_candidate = decoded[0][0]
    return best_candidate[1]

In [None]:
def process_urls(url_list):
    """Label and colour-profile every image linked in ``url_list``.

    For each URL accepted by ``is_url_image`` the file is fetched into
    ``temp_dir`` (unless a file with the same name is already there), then
    classified with ``get_img_label`` and summarised with
    ``get_img_avg_colors``.

    URLs whose download fails, or whose file cannot be read as an image, are
    skipped. A result is only recorded once both analyses succeeded, so the
    two returned lists always have the same length and stay index-aligned.

    Returns
    -------
    tuple of (list, list)
        ``(labels, colors)``: one label and one colour tuple per processed image.
    """
    labels = []
    colors = []

    for url in url_list:
        if not is_url_image(url):
            continue

        name = get_name(url)
        path_to_file = path.join(temp_dir, name)

        # Reuse a previously downloaded file; only hit the network when the
        # file is missing. Preview links are rewritten before downloading.
        if not (path.exists(path_to_file)
                or download_image(correct_reddit_preview_url(url), name)):
            continue

        try:
            label = get_img_label(path_to_file)
            avg_col = get_img_avg_colors(path_to_file)
        except (OSError, ValueError):
            # A "successful" download is not necessarily a decodable image
            # (e.g. an error page served with status 200). Skip that file
            # rather than abort the processing of every remaining post.
            continue

        labels.append(label)
        colors.append(avg_col)

    return labels, colors

In [None]:
# Run the download + classification pipeline on each post's URL list; every
# element of the result is the (labels, colors) pair returned by process_urls.
temp = data['body_url'].map(process_urls)

In [None]:
# Split the (labels, colors) pairs held in `temp` into two separate columns.
data['imgs_labels'] = temp.map(lambda pair: pair[0])
data['imgs_colors'] = temp.map(lambda pair: pair[1])

In [None]:
# Inspect the per-post lists of (avg_hue, avg_sat, avg_val) tuples.
data['imgs_colors']

In [None]:
# Number of processed images per post: one label is stored per image, so the
# length of the label list is the count (0 for posts without images).
data['imgs_count'] = data['imgs_labels'].apply(len)


In [None]:
# Posts with at least one processed image, with their colour summaries.
data.loc[data['imgs_count'] > 0, ['body_url', 'body_urls_count', 'imgs_colors', 'imgs_count']]

In [None]:
# Posts with at least one processed image: URL and image counts only.
data.loc[data['imgs_count'] > 0, ['body_url', 'body_urls_count', 'imgs_count']]

In [None]:
# Posts with at least one processed image, with the predicted labels.
data.loc[data['imgs_count'] > 0, ['body_url', 'body_urls_count', 'imgs_labels', 'imgs_count']]

In [None]:
# Distinct labels over all posts, in alphabetical order. Flattening with a
# set comprehension works for posts without images and for an empty frame,
# and yields plain Python strings in a reproducible order.
sorted({label for labels in data['imgs_labels'] for label in labels})

In [None]:
# Persist the enriched frame next to the notebook.
data.to_csv(path_or_buf="processed.csv")

In [None]:
# Read the export back, restoring the saved index from the first CSV column
# so that rows carry the same labels as in `data` (otherwise it shows up as
# an extra 'Unnamed: 0' column and the rows are renumbered).
# NOTE: list-valued columns come back from CSV as plain strings.
data2 = pd.read_csv("processed.csv", index_col=0)

In [None]:
# Filter the reloaded frame with its own 'imgs_count' column. A mask built
# from `data` is aligned by index label, and `data2`'s labels are not
# guaranteed to refer to the same rows.
data2.loc[data2['imgs_count'].gt(0), ['body_url', 'body_urls_count', 'imgs_labels', 'imgs_count']]

In [None]:
# Final (rows, columns) size of the enriched frame.
data.shape