### Get a list of powerful authors in 'The_Donald' and 'politics' subreddits

pol users:  251153
don users:  145776
mutual users:  48168

In [138]:
import pandas as pd

def getAuthorProportion(file_path, subreddit):
    df = pd.read_csv(file_path, sep='\t', names=['author', 'score'])[1:]
    reddit_bots = ['[deleted]', 'AutoModerator'] # Ignore the reddit bots
    df = df.loc[df['author'].isin(reddit_bots) == False]
    df_score = df['score'].astype(float)[1:] # Change column type as float
    total_score = df_score.sum()
    # Output some statistics
    print('In {0}, total number of users: {1}'.format(subreddit, len(df)))
    print('In {0}, sum of upvote: {1}'.format(subreddit, total_score))
    print('In {0}, mean of upvote: {1}'.format(subreddit, df_score.mean()))
    print('In {0}, standard deviation of upvote: {1}'.format(subreddit, df_score.std()))
    
    score_sum = 0
    count = 1
    while score_sum < 0.8 * total_score:
        try:
            score_sum += int(df.iat[count, 1])
        except:
            pass
        count += 1
    # Calculate the proportion of authors who contribute 80% of total upvote
    print('{0}% of author in {1} contribute the 80% of total upvote'.format(round(count / len(df)*100, 2), subreddit)) 

In [139]:
pol_file_path = './power_author_analysis/politics_author_score.csv'
getAuthorProportion(pol_file_path, 'politics')

In politics, total number of users: 251152
In politics, sum of upvote: 29206680.0
In politics, mean of upvote: 116.29131478672193
In politics, standard deviation of upvote: 696.0300547006325
6.26% of author in politics contribute the 80% of total upvote


In [140]:
don_file_path = './power_author_analysis/The_Donald_author_score.csv'
getAuthorProportion(don_file_path, 'The_Donald')


In The_Donald, total number of users: 145775
In The_Donald, sum of upvote: 23966048.0
In The_Donald, mean of upvote: 164.40550441093748
In The_Donald, standard deviation of upvote: 776.8428421207984
7.97% of author in The_Donald contribute the 80% of total upvote


In [2]:
import os
import time
import pandas as pd

def getHighScoreAuth(file_path):
    
    files = os.listdir(file_path)
    author_scores = {}
    for file in files:
        filename = file_path + file
        # Read file and rename the columns
        df_highscore_au = pd.read_csv(filename, sep='\t')
        df_highscore_au = df_highscore_au.rename(columns={'Unnamed: 0':'author'})
        # Collect author scores from each separated file and group them together
        for author in df_highscore_au['author']:
            score = df_highscore_au.loc[df_highscore_au['author'] == author, ['scores']].values
            # Aggregate scores by authors
            if author not in author_scores.keys():
                author_scores[author] = int(score)
            else:
                author_scores[author] += int(score)

    # Convert the dict to DataFrame and sort by scores         
    df_score = pd.DataFrame(author_scores, index=['scores'])
    df_score = df_score.transpose()
    df_score = df_score.sort_values('scores', ascending=False)
    return df_score
            
    

In [3]:
filepath = './The_Donald_highScore/'
don_highScore_auth = getHighScoreAuth(filepath)
filepath = './politics_highScore/'
pol_highScore_auth = getHighScoreAuth(filepath)

In [8]:
def getHighCommAuth(file_path):
    files = os.listdir(file_path)
    author_comm = {}
    for file in files:
        # Get filename and read files
        filename = file_path + file
        df_comm_au = pd.read_csv(filename, sep='\t')
        df_comm_au = df_comm_au.rename(columns={'Unnamed: 0':'author'})
        for author in df_comm_au['author']:
            # Collect author quotations from each separated file and group them together
            comm_count = df_comm_au.loc[df_comm_au['author'] == author, ['comm_below']].values
            if author not in author_comm.keys():
                author_comm[author] = int(comm_count)
            else:
                author_comm[author] += int(comm_count)
    # Convert the quotations dict to DataFrame and sort by quotations
    df_comm = pd.DataFrame(author_comm, index=['comm_below'])
    df_comm = df_comm.transpose()
    df_comm = df_comm.sort_values('comm_below', ascending=False)
    return df_comm

In [9]:
filepath = './The_Donald_highlinked/'
don_highComm_auth = getHighCommAuth(filepath)
filepath = './politics_highlinked/'
pol_highComm_auth = getHighCommAuth(filepath)

### Find powerful author in both 'The_Donald' and 'politics' subreddits, considering the high socre comment and high linked comment behavior

In [10]:
# Extract the top 1000 high scored authors and high linked authors and do co-referencing by join two datframes
don_highComm_auth = don_highComm_auth[:1000]
don_highScore_auth = don_highScore_auth[:1000]
don_powerAuth = don_highComm_auth.join(don_highScore_auth, how='inner')
don_powerAuth.to_csv('./don_power_author.csv', sep='\t') # Save the powerful author lists in "The_Donald"

In [11]:
# Extract the top 1000 high scored authors and high linked authors and do co-referencing by join two datframes
pol_highComm_auth = pol_highComm_auth[:1000]
pol_highScore_auth = pol_highScore_auth[:1000]
pol_powerAuth = pol_highComm_auth.join(pol_highScore_auth, how='inner')
pol_powerAuth.to_csv('./pol_power_author.csv', sep='\t') # Save the powerful author lists in "politics"

### Top 100 highly linked/scored authors in 'The_Donald'

In [27]:
import plotly.plotly as py
import plotly.graph_objs as go

# Plot the top 100 highly linked and scored authors in 'The_Donald'
trace1 = go.Bar(
            x=don_powerAuth.index[:100],
            y=don_powerAuth['comm_below'].values[:100],
            name='total linked comments per author'
            )
trace2 = go.Bar(
            x=don_powerAuth.index[:100],
            y=-don_powerAuth['scores'].values[:100],
            name='total scores per author'
            )

data = [trace1, trace2]
layout = go.Layout(
    title='Powerful Author in The_Donald', 
    xaxis={"mirror" : "allticks", 'side': 'top'}, 
    yaxis={"mirror" : "allticks", 'side': 'right'}  
)

py.iplot(data, layout=layout, filename='basic-bar')

### Top 100 highly linked authors in 'politics'

In [26]:
import plotly.plotly as py
import plotly.graph_objs as go

# Plot the top 100 highly linked and scored authors in 'politics'
trace1 = go.Bar(
            x=pol_powerAuth.index[1:101],
            y=pol_powerAuth['comm_below'].values[1:101],
            name='total linked comments per author'
            )
trace2 = go.Bar(
            x=pol_powerAuth.index[1:101],
            y=-pol_powerAuth['scores'].values[1:101],
            name='total scores per author'
            )

data = [trace1, trace2]
layout = go.Layout(
    title="Powerful Author in politics",
    xaxis={"mirror" : "allticks", 'side': 'top'}, 
    yaxis={"mirror" : "allticks", 'side': 'right'}  
)

py.iplot(data, layout=layout, filename='basic-bar', title='Powerful Authors in politics')