In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from os import path
import time 
from datetime import datetime 
import math

import pickle

data_dir = path.join('..', 'data')
img_dir = path.join(data_dir, 'img')
temp_dir = path.join(data_dir, 'temp')

dataset_file = 'reddit_wsb_art.csv'

In [34]:
data = pd.read_csv(path.join(data_dir, dataset_file))
data = data.sort_values(by=['timestamp'])
data.body = data.body.astype(str)
data.title = data.title.astype(str)
data.body = data.body.apply(lambda x: x.encode('utf-8').decode('unicode-escape'))

In [35]:
data

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,upvote_ratio,is_oc,permalink,name,is_self
28052,MTCH IS A ONE WAY TICKET TO ORION'S BELT,9,l2a333,https://www.reddit.com/r/wallstreetbets/commen...,19,1.611270e+09,Unless mass castration becomes a popular trend...,Fri Jan 22 00:00:17 2021,0.61,False,/r/wallstreetbets/comments/l2a333/mtch_is_a_on...,t3_l2a333,True
28937,Its no GME yolo but this is a pretty big short...,33,l2a5hd,https://i.redd.it/5brnguo1orc61.jpg,23,1.611270e+09,,Fri Jan 22 00:03:22 2021,0.79,False,/r/wallstreetbets/comments/l2a5hd/its_no_gme_y...,t3_l2a5hd,False
8571,"Gaybear Appreciation Post, I’m a naked Put sel...",32,l2a615,https://i.redd.it/empum556orc61.jpg,13,1.611270e+09,,Fri Jan 22 00:04:05 2021,0.87,False,/r/wallstreetbets/comments/l2a615/gaybear_appr...,t3_l2a615,False
16672,Game-stop technical analysis. Know when to gam...,126,l2a742,https://i.redd.it/mbmpi88gorc61.jpg,42,1.611270e+09,,Fri Jan 22 00:05:39 2021,0.91,False,/r/wallstreetbets/comments/l2a742/gamestop_tec...,t3_l2a742,False
7839,$SPCE is going to break out soon,132,l2a87f,https://www.reddit.com/r/wallstreetbets/commen...,52,1.611270e+09,Here are my ideas: cup and handle ready to go\...,Fri Jan 22 00:07:08 2021,0.96,False,/r/wallstreetbets/comments/l2a87f/spce_is_goin...,t3_l2a87f,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35697,CBOE Short Interest Report 1/27,2,l6gxwq,https://www.cboe.com/us/equities/market_statis...,5,1.611788e+09,,Wed Jan 27 23:59:34 2021,0.67,False,/r/wallstreetbets/comments/l6gxwq/cboe_short_i...,t3_l6gxwq,False
35944,am i late on this?,8,l6gxx2,https://www.reddit.com/r/wallstreetbets/commen...,19,1.611788e+09,"are there plans to hold more stocks, i got 300...",Wed Jan 27 23:59:35 2021,0.83,False,/r/wallstreetbets/comments/l6gxx2/am_i_late_on...,t3_l6gxx2,True
584,HOLD THE LINE!!!,21,l6gxzl,https://i.redd.it/pbm0tkntgyd61.png,7,1.611788e+09,,Wed Jan 27 23:59:40 2021,0.96,False,/r/wallstreetbets/comments/l6gxzl/hold_the_line/,t3_l6gxzl,False
33509,"All in on NOK, BB, and my bb AMC. See you in t...",99,l6gy1h,https://i.redd.it/poggax5vgyd61.jpg,15,1.611788e+09,,Wed Jan 27 23:59:44 2021,0.80,False,/r/wallstreetbets/comments/l6gy1h/all_in_on_no...,t3_l6gy1h,False


# Time

In [36]:
def encode_time(timestamps):
    str_time = [x.split()[3] for x in timestamps]

    splited_time = [x.split(':') for x in str_time]
    float_time = [float(x[0] + '.' + x[1]) for x in splited_time]

    sin_time = [np.sin(2 * np.pi * x / 23.59) for x in float_time]
    cos_time = [np.cos(2 * np.pi * x / 23.59) for x in float_time]
    
    return sin_time, cos_time

sin_time, cos_time = encode_time(data.timestamp.values)
data['sin_time'] = sin_time
data['cos_time'] = cos_time

In [37]:
def encode_date(unix_timestamps):
    days_of_year = [time.localtime(x).tm_yday for x in unix_timestamps]
    
    sin_date = [np.sin(2 * np.pi * x / 365.0) for x in days_of_year]
    cos_date = [np.cos(2 * np.pi * x / 365.0) for x in days_of_year]
    
    return sin_date, cos_date

sin_date, cos_date = encode_date(data.created.values)

data['sin_date'] = sin_date
data['cos_date'] = cos_date

# Words count

In [38]:
def words_in_titles(df):
    titles = df.title.values
    len_words = [len(title.split()) for title in titles]
    
    df['words_in_titles'] = len_words
    return df

data = words_in_titles(data)

In [39]:
def words_in_bodies(df, skip_empty=False):
    bodies = df.body.values
    len_words = [len(str(body).split()) if str(body) != 'nan' else 0 for body in bodies]
    
    df['words_in_body'] = len_words
    return df

data = words_in_bodies(data)

In [40]:
data[['words_in_titles', 'words_in_body']]

Unnamed: 0,words_in_titles,words_in_body
28052,9,225
28937,19,0
8571,24,0
16672,20,0
7839,7,84
...,...,...
35697,5,0
35944,5,15
584,3,0
33509,18,0


# Text Normalization

In [41]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [42]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [43]:
sentance = data.iloc[1, 0]
# print(sentance)
# out = ' '.join([lancaster.stem(word) for word in sentance.split()])
# out

In [44]:
def tokenize(text):
    text = str(text)
    stopwords_list = stopwords.words('english')
    ret = [word for word in word_tokenize(text.lower()) if word.isalpha() and word not in stopwords_list]
    return list(set(ret))

def tokenize_title_body(df):
    df['title_tokens'] = df.apply(lambda x: tokenize(x['title']), axis=1)
    df['body_tokens']  = df.apply(lambda x: tokenize(x['body']), axis=1)
    
tokenize_title_body(data)

In [45]:
def stem_(tokens):
    return [porter.stem(word) for word in tokens]

def stem_title_body(df):
    df['title_stem_tokens'] = df.apply(lambda x: stem_(x['title_tokens']), axis=1)
    df['body_stem_tokens']  = df.apply(lambda x: stem_(x['body_tokens']), axis=1)
    
stem_title_body(data)

In [14]:
backup = data.copy()

In [46]:
data.to_csv(path.join(data_dir, "data_temp.csv"), sep='`', index=False)

# Text statistics

In [16]:
def count_capital_letters(text):
    text = str(text)
    return sum([1 for char in text if 91 > ord(char) > 64])

def capital_letters_ratio(text):
    text = str(text)
    alphacount = sum([1 for char in text if str(char).isalpha()])
    if alphacount == 0:
        return 0
    return count_capital_letters(text) / alphacount 

data['title_capital_letters_count'] = data.apply(lambda x: count_capital_letters(x['title']), axis=1) 
data['title_capital_letters_ratio'] = data.apply(lambda x: capital_letters_ratio(x['title']), axis=1).apply(lambda x: np.around(x, 3))

data['body_capital_letters_count'] = data.apply(lambda x: count_capital_letters(x['body']), axis=1) 
data['body_capital_letters_ratio'] = data.apply(lambda x: capital_letters_ratio(x['body']), axis=1).apply(lambda x: np.around(x, 3))

## URLs

In [17]:
import re

RE_HTTP = re.compile("http(s)?://[/\.A-z0-9]+")

def detect_urls(text):
    text = str(text)

    return [str(x[1].group(0)) for x in enumerate(re.finditer(RE_HTTP, text))]

data['body_url'] = data.apply(lambda x: detect_urls(x['body']), axis=1) 
data['body_urls_count'] = data['body_url'].apply(len)

## Emojis

In [18]:
from emoji import UNICODE_EMOJI

EMOJIS = set(UNICODE_EMOJI['en'].keys())

def filter_emojis(text):
    text = str(text)
    res = []
    for word in text.split(' '):
        for char in word:
            if char in EMOJIS:
                res.append(char)
                
    return res

In [19]:
def emojis_ratio(df_origin, df_emojis):
    x = df_origin.apply(len)
    x = np.where(x == 0, 1, x)
    return (df_emojis.apply(len) / x).apply(lambda x: np.around(x, 3))

In [20]:
data['title_emojis'] = data.apply(lambda x: filter_emojis(x['title']), axis=1)
data['title_emoji_count'] = data['title_emojis'].apply(len)
data['title_emojis_ratio'] = emojis_ratio(data['title'], data['title_emojis'])

In [21]:
data['body'] = data['body'].astype(str)
data['body_emojis'] = data.apply(lambda x: filter_emojis(x['body']), axis=1)
data['body_emoji_count'] = data['body_emojis'].apply(len)
data['body_emojis_ratio'] = emojis_ratio(data['body'], data['body_emojis'])

In [22]:
data[['title', 'title_emojis', 'title_emoji_count', 'title_emojis_ratio']].loc[data['title_emoji_count'] > 0]

Unnamed: 0,title,title_emojis,title_emoji_count,title_emojis_ratio
20773,Got too cocky after initial gme gains 😢,[😢],1,0.026
889,$RIDE I don’t need a therapist... I just need ...,"[💎, 🖕, 🏽]",3,0.040
6344,To all the 🌈🐻 trying to keep us down today,"[🌈, 🐻]",2,0.048
11108,AMD baby will grab more on any dip 😈 SU BAE sh...,[😈],1,0.016
8914,Coverage of the GME action today 🚀 🌕,"[🚀, 🌕]",2,0.056
...,...,...,...,...
26862,NOK 🚀🚀🚀🚀🚀,"[🚀, 🚀, 🚀, 🚀, 🚀]",5,0.556
30006,BRAGG⬆️🚀,"[⬆, 🚀]",2,0.250
17756,Until Discord server is back join to the new s...,"[🚀, 🚀]",2,0.037
34490,Let's sent BB into Space. 🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀,"[🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀]",15,0.366


In [23]:
data[['body', 'body_emojis', 'body_emoji_count', 'body_emojis_ratio']].loc[data['body_emoji_count'] > 0]

Unnamed: 0,body,body_emojis,body_emoji_count,body_emojis_ratio
10187,I know the first thing that pops into all of o...,[🚀],1,0.000
11161,First of all I'm in. I get the vision and I'm ...,"[🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🥜, 🥜, 🥜, 🥜, 🥜, 🙌, ...",22,0.009
27415,(Discretion: First post here. Go easy on me fe...,"[🚀, 🚀, 🚀, 💩, 🚀, 🚀, 📈, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, ...",22,0.005
14278,I’m currently holding 250 shares and hoping to...,"[🚀, 🚀, 🚀, 🚀]",4,0.004
11956,"Newer here, first time poster. I think I did t...","[🌈, 🐻]",2,0.001
...,...,...,...,...
2985,"Hello, you rich beautiful degenerates. \n\nLet...","[🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🔥, 🔥, 🔥]",9,0.005
26862,I don’t have a lot of many but I bought 7 shar...,"[🚀, 🚀, 🚀, 💎, 🚀, 🚀, 🚀, 🚀]",8,0.129
1827,Tl;dr: Fear was pushed hard today. But we push...,"[🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀, 🚀]",10,0.007
30006,BRAGG - Under the Radar! Next level gaming st...,"[📡, 🔺, ⬆, 🚀, 🚀]",5,0.086


## OCR

In [24]:
with open('meme_ocr.pkl', 'rb') as file:
    test = pickle.load(file)

In [25]:
data['image_text'] = test

data.image_text = data.apply(lambda x: tokenize(x.image_text), axis=1)
data.image_text = data.apply(lambda x: stem_(x.image_text), axis=1)

data['image_text_words'] = data.apply(lambda x: len(x.image_text), axis=1)
data['image_text_capital_letters_count'] = data.apply(lambda x: count_capital_letters(x.image_text), axis=1) 
data['image_text_capital_letters_ratio'] = data.apply(lambda x: capital_letters_ratio(x.image_text), axis=1).apply(lambda x: np.around(x, 3))

In [26]:
data[['image_text','image_text_words','image_text_capital_letters_count','image_text_capital_letters_ratio']].loc[data.image_text.apply(len).gt(0)]

Unnamed: 0,image_text,image_text_words,image_text_capital_letters_count,image_text_capital_letters_ratio
11340,"[per, portion, expens, tax, class, par, invent...",67,0,0.0
21116,"[help, track, pm, manufactur, last, thursday, ...",67,0,0.0
37093,"[pe, eer, sf, um, cover, ff, noth, lq, ef, aca...",96,0,0.0
11956,"[inseego, corp, eae, oe, pice, ba, c]",7,0,0.0
4818,"[dunn, smalicap, rebalanc, cover, estim, fund,...",202,0,0.0
...,...,...,...,...
596,"[corp, stephen, chart, intern, blackston, tp, ...",35,0,0.0
32720,"[open, day, bbo, l, portfolio, avg, price, lim...",17,0,0.0
9453,"[on, amount, invest, price, complet, order, gm...",17,0,0.0
16660,"[open, music, live, zoom, get, happi, x, ke, r...",25,0,0.0


## HSV & Labels

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from os import path
import time 
from datetime import datetime 
import math

import sys
import os
import subprocess
import re
import cv2
import mimetypes
import requests
import re

from PIL import Image

RE_HTTP = re.compile("http(s)?://[/\.A-z0-9]+")

def detect_urls(text):
    text = str(text)
    return [str(x[1].group(0)) for x in enumerate(re.finditer(RE_HTTP, text))]

data['body_url'] = data.apply(lambda x: detect_urls(x['body']), axis=1) 
data['body_urls_count'] = data['body_url'].apply(len)

def is_url_image(url):    
    mimetype,encoding = mimetypes.guess_type(url)
    return (mimetype and mimetype.startswith('image'))

def download_image(url, name):
    try:
        request = requests.get(url, stream = True)
        status = request.status_code
    except:
        status = -1
    
    if status == 200:
        with open(path.join(temp_dir, name), 'wb') as file:
            file.write(request.content)
            return True
    else:
        return False

def get_name(url):
    return url.split('/')[-1]

def correct_reddit_preview_url(url):
    return url.replace('preview.redd.it', 'i.redd.it')

from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array
from skimage.color import rgb2hsv
import numpy as np
import math

def get_img_avg_colors(image):

    X = 0.0
    Y = 0.0
    count = 0
    sat = 0
    val = 0
    
    hsv = rgb2hsv(image)
    for i in range(0,image.shape[0], 44):
        for j in range(0,image.shape[1], 44):
            X += math.cos(hsv[i,j,0] / 180.0 * math.pi)
            Y += math.sin(hsv[i,j,0] / 180.0 * math.pi)
            sat += hsv[i,j,1]
            val += hsv[i,j,2]
            count += 1

    X /= count
    Y /= count
    avg_hue = math.atan2(Y, X) * 180.0 / math.pi;
    avg_sat = sat / count
    avg_val = val / count
    return avg_hue, avg_sat, avg_val

from keras.applications.vgg19  import VGG19, preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array
model = VGG19()

def get_img_label(image):
    
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    yhat  = model.predict(image)
    label = decode_predictions(yhat, top=5)
    label = label[0][0]
    return label[1]

def process_urls(url_list):
    labels = []
    colors = []
    colors_stdevs = []
    
    for url in url_list:
        if is_url_image(url):
            name = get_name(url)
            url = correct_reddit_preview_url(url)
            
            file_exists = False
            path_to_file = os.path.join(temp_dir, name)
            if os.path.exists(path_to_file):
                file_exists = True
            else:
                download_successful = download_image(url, name)
                file_exists = download_successful
                
            if file_exists:
  
                if path_to_file[-3:] == "gif":
                    label = ""
                    avg_col = (-1, 0, 0)
                else:
                    image = load_img(path_to_file, target_size=(224, 224))
                    image = img_to_array(image)

                    label = get_img_label(image)
                    avg_col = get_img_avg_colors(image)

                labels.append(label)
                colors.append(avg_col)
                
    return labels, colors

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5


In [32]:
temp = data['body_url'].apply(lambda x: process_urls(x))
data['imgs_labels'] = temp.apply(lambda x: x[0])
data['imgs_colors'] = temp.apply(lambda x: x[1])

UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node vgg19/block1_conv1/Relu (defined at <ipython-input-27-4aabc24358c7>:96) ]] [Op:__inference_predict_function_704]

Function call stack:
predict_function


In [None]:
data['imgs_count'] = data['imgs_labels'].apply(lambda x: len(x))

In [None]:
data[['body_url','imgs_labels','imgs_colors','imgs_count']].loc[data.imgs_count > 0]

In [None]:
from collections import Counter
c = Counter(np.concatenate(data.imgs_labels.loc[data.imgs_labels.apply(len).gt(0)].tolist()))

In [None]:
c.most_common()

In [None]:
data2 = data.copy()
data2.body = data2.body.apply(lambda x: " ".join(str.splitlines(x)))
data2.to_csv(path.join(data_dir, "merged_data5.csv"), sep='`', index=False)

In [31]:
import tensorflow