In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from os import path
import time 
from datetime import datetime 
import math

import sys
import os
import subprocess
import re
import cv2
import mimetypes
import requests
import re

In [2]:
data_dir = path.join('..', 'data')
img_dir = path.join(data_dir, 'img')
temp_dir = path.join(data_dir, 'temp')

dataset_file = 'reddit_wsb_art.csv'

data = pd.read_csv(path.join(data_dir, dataset_file))
data = data.sort_values(by=['timestamp'])

In [3]:
RE_HTTP = re.compile("http(s)?://[/\.A-z0-9]+")

def detect_urls(text):
    text = str(text)

    return [str(x[1].group(0)) for x in enumerate(re.finditer(RE_HTTP, text))]

data['body_url'] = data.apply(lambda x: detect_urls(x['body']), axis=1) 
data['body_urls_count'] = data['body_url'].apply(len)

data[['body', 'body_url', 'body_urls_count']].loc[data['body_urls_count'] > 0]

Unnamed: 0,body,body_url,body_urls_count
11340,"When I saw the video I was in shock, I could n...","[https://www.marketwatch.com/story/gamestop, h...",7
21116,Intel posted earnings today of $1.52 EPS beati...,"[https://preview.redd.it/2lqwp6j4urc61.png, ht...",3
10187,I know the first thing that pops into all of o...,"[https://www.suncor.com/en, https://www.suncor...",5
26804,"Well guys, I lost 90k today on SPY calls which...",[https://imgur.com/8zoihg7\n\n],1
7883,Stocks been up 46% in the past 5 days. They ma...,[https://nypost.com/2020/12/27/danimer],1
...,...,...,...
30079,I don't use the platform to trade but a fellow...,[https://preview.redd.it/5k3fq0h6gyd61.png],1
35388,[https://www.marketwatch.com/story/gamestop-an...,"[https://www.marketwatch.com/story/gamestop, h...",4
2985,"Hello, you rich beautiful degenerates. \n\nLet...","[https://heroicheartsproject.org, https://hunt...",9
17756,https://discord.gg/yc5772JExT,[https://discord.gg/yc5772JExT],1


In [99]:
def is_url_image(url):    
    mimetype,encoding = mimetypes.guess_type(url)
    return (mimetype and mimetype.startswith('image'))

In [100]:
def download_image(url, name):
    try:
        request = requests.get(url, stream = True)
        status = request.status_code
    except:
#         print('Connection error: ', url)
        status = -1
    
    if status == 200:
        with open(path.join(temp_dir, name), 'wb') as file:
            file.write(request.content)
#             print('Image sucessfully Downloaded: ', name, ' From: ', url)
            return True
    else:
#         print("Image Couldn't be retreived")
        return False

In [101]:
def get_name(url):
    return url.split('/')[-1]

In [102]:
def correct_reddit_preview_url(url):
    return url.replace('preview.redd.it', 'i.redd.it')

In [110]:
from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array

import numpy as np

def get_img_avg_colors(image_path):
    image = load_img(image_path)
    image = img_to_array(image)
    
    R = np.average(image[:,:,0])
    G = np.average(image[:,:,1])    
    B = np.average(image[:,:,2])   
    
    return (R, G, B)

def get_img_stdev_colors(image_path):
    image = load_img(image_path)
    image = img_to_array(image)

    R = np.std(image[:,:,0])
    G = np.std(image[:,:,1])    
    B = np.std(image[:,:,2]) 
    
    return (R, G, B)

In [111]:
from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array

model = VGG19()

def get_img_label(image_path):
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    yhat  = model.predict(image)
    label = decode_predictions(yhat, top=5)
    label = label[0][0]
    print('%s (%.2f%%)' % (label[1], label[2]*100))
    return label[1]

In [None]:
def process_urls(url_list):

    labels = []
    colors = []
    colors_stdevs = []
    
    for url in url_list:
        if is_url_image(url):
            name = get_name(url)
            url = correct_reddit_preview_url(url)
            
            file_exists = False
            path_to_file = os.path.join(temp_dir, name)
            if os.path.exists(path_to_file):
                file_exists = True
            else:
                download_successful = download_image(url, name)
                file_exists = download_successful

            if file_exists:
                label = get_img_label(path_to_file)
                labels.append(label)
                
                avg_col = get_img_avg_colors(path_to_file)
                colors.append(avg_col)
            
                stdev_col = get_img_stdev_colors(path_to_file)
                colors_stdevs.append(stdev_col)
                
#     print(colors, colors_stdevs)
    return labels, colors, colors_stdevs

In [117]:
data['imgs_labels', 'imgs_colors', 'imgs_colors_stdev'] = data['body_url'].apply(lambda x: process_urls(x))

In [118]:
data['imgs_count'] = data['imgs_labels'].apply(len)

In [119]:
data[['body_url', 'body_urls_count', 'imgs_colors','imgs_count']].loc[data['imgs_count'].gt(0)]

Unnamed: 0,body_url,body_urls_count,imgs_colors,imgs_count
11340,"[https://www.marketwatch.com/story/gamestop, h...",7,"[(226.76402, 241.08191, 248.2532)]",1
21116,"[https://preview.redd.it/2lqwp6j4urc61.png, ht...",3,"[(225.13005, 223.83922, 224.50206), (241.23965...",2
37093,"[https://www.prnewswire.com/news, https://www....",11,"[(238.994, 237.82446, 233.23161), (17.699621, ...",3
11956,[https://www.businesswire.com/news/home/202012...,13,"[(248.52263, 249.83534, 250.27138)]",1
4818,"[https://preview.redd.it/gg7lm3zj1uc61.png, ht...",3,"[(19.777166, 19.959885, 19.987309), (20.383234...",3
...,...,...,...,...
596,[https://www.opensecrets.org/industries/indus....,3,"[(226.66095, 227.12422, 231.42264)]",1
32720,[https://preview.redd.it/8s5i6d5vdyd61.png],1,"[(249.72418, 250.73422, 250.41019)]",1
9453,[https://i.imgur.com/rdDxsH1.jpg],1,"[(138.96242, 224.57838, 140.5303)]",1
16660,[https://preview.redd.it/m9s7p07egyd61.jpg],1,"[(25.943886, 26.521742, 27.73743)]",1


In [120]:
data[['body_url', 'body_urls_count', 'imgs_colors_stdev','imgs_count']].loc[data['imgs_count'].gt(0)]

Unnamed: 0,body_url,body_urls_count,imgs_colors_stdev,imgs_count
11340,"[https://www.marketwatch.com/story/gamestop, h...",7,"[(41.796696, 36.680386, 36.45837)]",1
21116,"[https://preview.redd.it/2lqwp6j4urc61.png, ht...",3,"[(77.9051, 78.75794, 77.80692), (39.313423, 38...",2
37093,"[https://www.prnewswire.com/news, https://www....",11,"[(45.129658, 47.326572, 57.326927), (32.20176,...",3
11956,[https://www.businesswire.com/news/home/202012...,13,"[(31.974718, 24.030582, 24.292667)]",1
4818,"[https://preview.redd.it/gg7lm3zj1uc61.png, ht...",3,"[(31.63162, 31.7551, 31.800894), (33.59549, 33...",3
...,...,...,...,...
596,[https://www.opensecrets.org/industries/indus....,3,"[(40.476543, 43.47888, 39.951298)]",1
32720,[https://preview.redd.it/8s5i6d5vdyd61.png],1,"[(27.400305, 21.583363, 23.587692)]",1
9453,[https://i.imgur.com/rdDxsH1.jpg],1,"[(124.75366, 41.178947, 122.59485)]",1
16660,[https://preview.redd.it/m9s7p07egyd61.jpg],1,"[(58.87528, 57.98175, 62.35952)]",1


In [121]:
data[['body_url', 'body_urls_count', 'imgs_labels','imgs_count']].loc[data['imgs_count'].gt(0)]

Unnamed: 0,body_url,body_urls_count,imgs_labels,imgs_count
11340,"[https://www.marketwatch.com/story/gamestop, h...",7,[menu],1
21116,"[https://preview.redd.it/2lqwp6j4urc61.png, ht...",3,"[web_site, menu]",2
37093,"[https://www.prnewswire.com/news, https://www....",11,"[menu, oscilloscope, oscilloscope]",3
11956,[https://www.businesswire.com/news/home/202012...,13,[web_site],1
4818,"[https://preview.redd.it/gg7lm3zj1uc61.png, ht...",3,"[oscilloscope, oscilloscope, oscilloscope]",3
...,...,...,...,...
596,[https://www.opensecrets.org/industries/indus....,3,[web_site],1
32720,[https://preview.redd.it/8s5i6d5vdyd61.png],1,[menu],1
9453,[https://i.imgur.com/rdDxsH1.jpg],1,[web_site],1
16660,[https://preview.redd.it/m9s7p07egyd61.jpg],1,[web_site],1
