In [4]:
import datetime as dt
from steemdata import SteemData

import pandas as pd
import numpy as np

try:
    import plotly.plotly as py
    import plotly.graph_objs as go
    import cufflinks as cf
except:
    !pip install plotly
    !pip install cufflinks
    
# helpers
from toolz import keyfilter

def keep(d, whitelist):
    """Return a copy of mapping `d` restricted to the keys found in `whitelist`."""
    def is_allowed(key):
        return key in whitelist

    return keyfilter(is_allowed, d)

def omit(d, blacklist):
    """Return a copy of mapping `d` without the keys found in `blacklist`."""
    def is_permitted(key):
        return key not in blacklist

    return keyfilter(is_permitted, d)

In [5]:
# Database handle exposed by SteemData; the queries below read its 'Posts' collection.
db = SteemData().db

## Methods

In [36]:
import json

import requests as rs
from steem.post import Post


def get_csrf(url):
    """Fetch a page and extract the CSRF token embedded in its body.

    The token is read from the first ``"csrf":"<token>"`` occurrence in the
    response text, up to the closing double quote.

    Args:
        url: Address of the page to request.

    Returns:
        A ``(csrf, cookies)`` tuple, where ``cookies`` are the cookies of the
        response, or ``None`` when the status code is not 200 or no token can
        be located in the body.
    """
    marker = '"csrf":"'
    resp = rs.get(url)

    if resp.status_code != 200:
        return None

    body = resp.text
    start_index = body.find(marker)
    if start_index == -1:
        # str.find reports "not found" as -1; slicing relative to it would
        # silently return an unrelated chunk of the page as the token.
        return None

    token_start = start_index + len(marker)
    token_end = body.find('"', token_start)
    if token_end <= token_start:
        # Unterminated or empty token: treat it like a missing one.
        return None

    return body[token_start:token_end], resp.cookies


def get_view_count(url_partial, csrf, cookies):
    """POST a page-view request for `url_partial` and return the decoded reply.

    Args:
        url_partial: Value sent as the ``page`` field of the payload.
        csrf: Value sent as the ``csrf`` field of the payload.
        cookies: Cookies attached to the request.

    Returns:
        The JSON-decoded response body, or an empty dict when the status code
        is not 200 or the body is empty.
    """
    endpoint = 'https://steemit.com/api/v1/page_view'
    resp = rs.post(
        endpoint,
        json={"csrf": csrf, "page": url_partial, "ref": ""},
        cookies=cookies,
    )

    # Anything but a successful, non-empty reply yields the empty fallback.
    if resp.status_code != 200 or not resp.text:
        return {}

    return json.loads(resp.text)


def get_views(identifier):
    """Look up the view count of a post.

    Args:
        identifier: Post identifier passed to ``Post``.

    Returns:
        The ``views`` value reported for the post's URL, or -1 when the CSRF
        token could not be obtained or the reply carries no ``views`` entry.
    """
    p = Post(identifier)
    session = get_csrf('https://steemit.com' + p.url)
    if not session:
        # get_csrf returns None on failure; unpacking that would raise
        # TypeError, so fall back to the same sentinel used for a missing count.
        return -1

    csrf, cookies = session
    return get_view_count(p.url, csrf, cookies).get('views', -1)

## Top Viewed Posts

In [33]:
# Time window: posts created during the last 7 days.
# The cutoff is built as an aware UTC datetime. Assuming `db` is a PyMongo
# database, a naive local `datetime.now()` would be read as UTC by the driver
# and shift the window by the local UTC offset.
time_constraints = {
    '$gte': dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=7),
}
# Only posts with more than 100 net votes are fetched. `net_rshares` and
# `author_reputation` are not constrained in the query; negative values are
# dropped afterwards by `filter_spam`.
conditions = {
    'created': time_constraints,
    'net_votes': {'$gt': 100},
}
# Fields returned for each post; `_id` is excluded.
projection = {
    '_id': 0,
    'identifier': 1,
    'net_votes': 1,
    'net_rshares': 1,
    'author_reputation': 1,
    'title': 1,
    'author': 1,
    'pending_payout_value': 1,
    'total_payout_value': 1,
}
posts = list(db['Posts'].find(conditions, projection=projection))

In [34]:
def filter_spam(post):
    """Return True when a post should be kept, False when it looks like spam.

    A post is rejected as soon as its `net_rshares` or its `author_reputation`
    converts to a negative integer.
    """
    return int(post['net_rshares']) >= 0 and int(post['author_reputation']) >= 0

In [37]:
# Drop every post that filter_spam rejects.
posts = [post for post in posts if filter_spam(post)]

In [38]:
# Number of posts left after spam filtering.
len(posts)

1940

In [45]:
from funcy import silent
# Attach a 'views' entry to a copy of every post. `silent` wraps get_views so
# that a failed lookup does not abort the whole run.
posts2 = [
    dict(post, views=silent(get_views)(post['identifier']))
    for post in posts
]

In [50]:
# Replace the two payout entries of each post with their 'amount' field.
posts3 = [
    dict(
        post,
        pending_payout_value=post['pending_payout_value']['amount'],
        total_payout_value=post['total_payout_value']['amount'],
    )
    for post in posts2
]

In [57]:
# One row per post, one column per key of the flattened post dicts.
df = pd.DataFrame(posts3)

In [58]:
# Keep the reporting columns, drop rows with any missing value, then convert
# the view counts to int.
df2 = (
    df[['author', 'identifier', 'net_votes', 'pending_payout_value', 'views']]
    .dropna()
    .assign(views=lambda frame: frame['views'].apply(int))
)

In [65]:
# The 25 most viewed posts, highest view count first.
df3 = df2.sort_values(by='views', ascending=False).iloc[:25]
df3

Unnamed: 0,author,identifier,net_votes,pending_payout_value,views
31,mys,@mys/remember-this-210-ltc-worth-puzzle-posted...,295,403.935,7284
1939,teamsteem,@teamsteem/some-steem-stats,1301,2378.384,3990
1090,thecryptofiend,@thecryptofiend/what-is-bitcoin-a-simple-guide...,711,602.051,2599
1840,maa,@maa/2017-5-22,142,166.567,2507
726,the-alien,@the-alien/it-s-not-too-late-to-get-into-crypt...,649,649.467,2384
1045,jerrybanfield,@jerrybanfield/10-reasons-for-usd10-steem-pric...,890,819.239,2327
1318,charlieshrem,@charlieshrem/steem-token-will-be-added-to-jax...,1162,1813.234,2301
1851,sweetsssj,@sweetsssj/travel-with-me-57-koh-samui-the-isl...,744,954.08,2288
1330,maa,@maa/46mpgf,115,164.862,2128
924,maa,@maa/4n5pp8,106,105.243,1973


In [124]:
# df3.values

In [None]:
from prettytable import PrettyTable

# Placeholder: a table created from an empty list; nothing is added to it in
# the visible part of the notebook.
t = PrettyTable([])

## Top Posts by Languages

In [92]:
# Time window: posts created during the last 7 days.
# The cutoff is built as an aware UTC datetime. Assuming `db` is a PyMongo
# database, a naive local `datetime.now()` would be read as UTC by the driver
# and shift the window by the local UTC offset.
time_constraints = {
    '$gte': dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=7),
}
# Only posts with more than 3 net votes and more than 1 child are fetched.
conditions = {
    'created': time_constraints,
    'net_votes': {'$gt': 3},
    'children': {'$gt': 1},
}
# Fields returned for each post; `_id` is excluded and `body` is the text
# used for language detection.
projection = {
    '_id': 0,
    'identifier': 1,
    'title': 1,
    'author': 1,
    'body': 1,
}
lang_posts = list(db['Posts'].find(conditions, projection=projection))

In [109]:
# Number of posts fetched for language detection.
len(lang_posts)

7443

In [123]:
from langdetect import detect_langs
from funcy.colls import pluck
from funcy.seqs import first, last
from toolz.functoolz import compose, thread_last
from contextlib import suppress
from collections import Counter

In [111]:
def detect(body):
    """Return the first result of `detect_langs(body)`.

    Returns an empty list when detection raises or yields nothing, so callers
    can discard failures with a truthiness test.
    """
    try:
        candidates = detect_langs(body)
        if candidates:
            return first(candidates)
    except Exception:
        # Detection is best-effort: any failure is treated as "no language".
        pass

    return []

In [117]:
# Lazy pipeline: bodies longer than 100 characters -> detect() -> drop the
# falsy results that detect() returns when it finds nothing.
languages = filter(
    bool,
    map(
        detect,
        pluck('body', (post for post in lang_posts if len(post['body']) > 100)),
    ),
)

In [118]:
# Keep the `.lang` value of every detection whose probability exceeds 0.8.
# NOTE(review): this rebinds `languages` from the detection pipeline to a list
# of `.lang` values, so re-running this cell on its own would look up `.prob`
# on those values and presumably fail; re-run the previous cell first.
languages = [x.lang for x in languages if x and x.prob > 0.8]

In [121]:
# Number of confidently detected posts per language.
c = Counter(languages)

In [122]:
# The ten most frequent languages with their post counts.
c.most_common(10)

[('en', 6219),
 ('ko', 369),
 ('es', 167),
 ('de', 137),
 ('id', 52),
 ('hr', 20),
 ('pl', 13),
 ('tr', 12),
 ('af', 11),
 ('tl', 10)]

In [145]:
# Share of each of the ten most common languages, as a percentage of all
# confidently detected posts (not only of the top ten), rounded to 3 decimals.
normalized = [
    {'language': language, 'pct_share': round(count / len(languages) * 100, 3)}
    for language, count in c.most_common(10)
]

In [146]:
# Show the percentage share computed for each language.
normalized

[{'language': 'en', 'pct_share': 88.15},
 {'language': 'ko', 'pct_share': 5.23},
 {'language': 'es', 'pct_share': 2.367},
 {'language': 'de', 'pct_share': 1.942},
 {'language': 'id', 'pct_share': 0.737},
 {'language': 'hr', 'pct_share': 0.283},
 {'language': 'pl', 'pct_share': 0.184},
 {'language': 'tr', 'pct_share': 0.17},
 {'language': 'af', 'pct_share': 0.156},
 {'language': 'tl', 'pct_share': 0.142}]

In [149]:
# Tabulate the language shares with a 1-based index so the index reads as a rank.
# Note: this rebinds `df`, which earlier held the per-post frame.
df = pd.DataFrame(
    normalized,
    index=pd.RangeIndex(start=1, stop=len(normalized) + 1),
)

In [150]:
# Show the first ten rows of the language-share table.
df.head(10)

Unnamed: 0,language,pct_share
1,en,88.15
2,ko,5.23
3,es,2.367
4,de,1.942
5,id,0.737
6,hr,0.283
7,pl,0.184
8,tr,0.17
9,af,0.156
10,tl,0.142


In [134]:
import plotly.plotly as py
import plotly.graph_objs as go

# Labels and counts of the seven most common languages feed the pie slices.
top_languages = c.most_common(7)
labels = [language for language, _count in top_languages]
values = [count for _language, count in top_languages]
# Only four colours are listed for up to seven slices.
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='label',
    textfont={'size': 20},
    marker={
        'colors': colors,
        'line': {'color': '#000000', 'width': 2},
    },
)

py.iplot([trace], filename='styled_pie_chart')

In [151]:
## TODO: deduplicate posts by author so the percentages count unique authors rather than number of posts