In [1]:
import pandas as pd
import ast
from collections import Counter
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# Load the merged dataset; the file uses a backtick as its field separator.
data = pd.read_csv("../data/merged_data4.csv", sep='`')

# Keep the regression target aside as an independent copy.
Y = data['score'].copy()

# Remove the columns listed below so they are not carried into the later cells.
data = data.drop(columns=[
    'title', 'id', 'url', 'body', 'permalink', 'name',
    'timestamp', 'body_url', 'title_tokens', 'body_tokens', 'image_text',
])

def take_most_common(str_list):
    """Return the most frequent element of ``str_list``.

    An empty string is returned when ``str_list`` contains no elements.
    """
    ranked = Counter(str_list).most_common()
    return ranked[0][0] if ranked else ""
    
# Columns whose cells are parsed with ast.literal_eval before being reduced.
K = ['title_emojis', 'body_emojis', 'imgs_labels']

# Parse each cell, then keep only its most frequent element
# ("" when the parsed value is empty).
for k in K:
    data[k] = data[k].apply(ast.literal_eval).apply(take_most_common)

# imgs_colors is parsed the same way; keep the first item of the first entry,
# or -1 when the parsed value is empty.
data['imgs_colors'] = (
    data['imgs_colors']
    .apply(ast.literal_eval)
    .apply(lambda colors: colors[0][0] if len(colors) > 0 else -1)
)

In [3]:
# Collect the columns whose value at index label 0 is a string.
K = [x for x in data.keys() if isinstance(data[x][0], str)]

K

['title_emojis', 'body_emojis', 'imgs_labels']

In [4]:
# Replace each string column in K with integer class codes.
# fit_transform encodes the whole column in one vectorised call; the previous
# version called le.transform once per row, which yields the same codes but
# is far slower on large frames.
for k in K:
    le = LabelEncoder()
    data[k] = le.fit_transform(data[k].tolist())

In [5]:
# Cast every column to float, printing each column name as it is converted.
for column in data.columns:
    print(column)
    data[column] = data[column].astype(float)

score
comms_num
created
upvote_ratio
is_oc
is_self
sin_time
cos_time
sin_date
cos_date
words_in_titles
words_in_body
title_capital_letters_count
title_capital_letters_ratio
body_capital_letters_count
body_capital_letters_ratio
body_urls_count
title_emojis
title_emoji_count
title_emojis_ratio
body_emojis
body_emoji_count
body_emojis_ratio
image_text_words
image_text_capital_letters_count
image_text_capital_letters_ratio
imgs_labels
imgs_colors
imgs_count


## Feature importance

In [6]:
# Feature matrix: every column except the target ('score') and 'created'.
# drop() returns a new frame, so `data` itself is left untouched.
X = data.drop(columns=['created', 'score'])

In [7]:
# Fit a random forest on all features.
# random_state is fixed so the forest, and the importances read from it, are
# reproducible across kernel restarts; without it every run gives different numbers.
clf = RandomForestRegressor(n_estimators=150, verbose=1, n_jobs=-1,
                            min_samples_leaf=5, random_state=42)
_ = clf.fit(X, Y)  # assign to `_` to suppress the estimator repr

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    6.2s finished


In [8]:
# Round the importances to 4 decimals and rank the features, highest first.
imp = np.round(clf.feature_importances_, 4)
zipped = sorted(zip(X.columns, imp), key=lambda pair: pair[1], reverse=True)

print("Imp.\t Column\n******\t ******************************")
for column, importance in zipped:
    print(importance, "\t", column)

Imp.	 Column
******	 ******************************
0.7489 	 comms_num
0.077 	 is_self
0.0361 	 words_in_titles
0.0281 	 words_in_body
0.0192 	 upvote_ratio
0.0151 	 cos_time
0.0123 	 title_capital_letters_ratio
0.012 	 sin_time
0.0108 	 cos_date
0.0105 	 sin_date
0.0093 	 title_capital_letters_count
0.0066 	 body_capital_letters_ratio
0.0059 	 body_urls_count
0.005 	 body_capital_letters_count
0.0009 	 title_emojis_ratio
0.0008 	 title_emojis
0.0004 	 title_emoji_count
0.0003 	 is_oc
0.0003 	 body_emojis_ratio
0.0001 	 body_emojis
0.0001 	 body_emoji_count
0.0001 	 imgs_labels
0.0 	 image_text_words
0.0 	 image_text_capital_letters_count
0.0 	 image_text_capital_letters_ratio
0.0 	 imgs_colors
0.0 	 imgs_count


## Feature importance (without comms_num and upvote_ratio)

In [9]:
# Rebuild X from `data` instead of dropping from the previous X, so this cell
# can be re-run without raising KeyError on columns that were already removed.
# The resulting columns are the same: everything except the target, 'created',
# 'comms_num' and 'upvote_ratio'.
X = data.drop(columns=['created', 'score', 'comms_num', 'upvote_ratio'])

In [10]:
# Refit the forest on the current X (reduced feature set).
# random_state is fixed so the forest, and the importances read from it, are
# reproducible across kernel restarts; without it every run gives different numbers.
clf = RandomForestRegressor(n_estimators=150, verbose=1, n_jobs=-1,
                            min_samples_leaf=5, random_state=42)
_ = clf.fit(X, Y)  # assign to `_` to suppress the estimator repr

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    6.4s finished


In [11]:
# Importances rounded to 4 decimals, paired with their column names and
# ordered from most to least important.
imp = np.round(clf.feature_importances_, 4)
zipped = sorted(zip(X.columns, imp), key=lambda entry: entry[1], reverse=True)

print("Imp.\t Column\n******\t ******************************")
for feature, score in zipped:
    print(score, "\t", feature)

Imp.	 Column
******	 ******************************
0.1933 	 cos_time
0.1786 	 sin_time
0.1566 	 title_capital_letters_ratio
0.1241 	 words_in_titles
0.0878 	 title_capital_letters_count
0.0685 	 cos_date
0.0677 	 sin_date
0.0256 	 is_self
0.0213 	 words_in_body
0.0192 	 title_emojis_ratio
0.0107 	 body_capital_letters_ratio
0.0098 	 title_emojis
0.0094 	 body_capital_letters_count
0.0094 	 title_emoji_count
0.0079 	 is_oc
0.0034 	 body_urls_count
0.0025 	 body_emojis_ratio
0.0016 	 body_emoji_count
0.0009 	 body_emojis
0.0007 	 image_text_words
0.0005 	 imgs_labels
0.0004 	 imgs_colors
0.0001 	 imgs_count
0.0 	 image_text_capital_letters_count
0.0 	 image_text_capital_letters_ratio


## Feature importance (without the 2 most popular days)

In [12]:
# Window covering the two most popular days, given as the first and last second
# to exclude (1611702000 = 2021-01-26 23:00:00 UTC, 1611874799 = 2021-01-28
# 22:59:59 UTC). Assumes `created` holds Unix timestamps in seconds — TODO confirm.
PEAK_START = 1611702000
PEAK_END = 1611874799

# between() is inclusive on both ends, so rows created exactly on either
# boundary second are removed too; the previous strict > / < comparison left
# those rows in the data.
X = data[~data['created'].between(PEAK_START, PEAK_END)]

# The target must be taken from the filtered frame so it stays aligned with X.
Y = X['score']
X = X.drop(columns=['created', 'score', 'comms_num', 'upvote_ratio'])

In [13]:
# Refit the forest on the rows that remain after the date filter.
# random_state is fixed so the forest, and the importances read from it, are
# reproducible across kernel restarts; without it every run gives different numbers.
clf = RandomForestRegressor(n_estimators=150, verbose=1, n_jobs=-1,
                            min_samples_leaf=5, random_state=42)
_ = clf.fit(X, Y)  # assign to `_` to suppress the estimator repr

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    2.4s finished


In [14]:
# Rank features by their importance (rounded to 4 decimals), largest first,
# and print one "importance <tab> column" line per feature.
imp = np.round(clf.feature_importances_, 4)
zipped = sorted(zip(X.columns, imp), key=lambda item: item[1], reverse=True)

print("Imp.\t Column\n******\t ******************************")
for name, weight in zipped:
    print(weight, "\t", name)

Imp.	 Column
******	 ******************************
0.2032 	 cos_time
0.1635 	 sin_time
0.1542 	 title_capital_letters_ratio
0.1208 	 words_in_titles
0.101 	 title_capital_letters_count
0.0608 	 cos_date
0.0598 	 sin_date
0.0523 	 words_in_body
0.0219 	 title_emojis_ratio
0.0143 	 body_capital_letters_ratio
0.0102 	 title_emoji_count
0.01 	 body_capital_letters_count
0.0097 	 title_emojis
0.007 	 is_oc
0.0027 	 body_urls_count
0.0027 	 body_emojis_ratio
0.0022 	 is_self
0.0015 	 body_emoji_count
0.0007 	 body_emojis
0.0005 	 imgs_labels
0.0005 	 imgs_colors
0.0003 	 image_text_words
0.0002 	 imgs_count
0.0 	 image_text_capital_letters_count
0.0 	 image_text_capital_letters_ratio
