### Get authors in 'The_Donald' and 'politics' subreddits

In [1]:
import os
import pandas as pd
import time

start_time = time.process_time()

# Load the previously extracted author files. Each line holds one author;
# only the text before the newline is kept.
with open('./authorList/donList.txt') as file:
    don_author_list = [author.split('\n')[0] for author in file]
with open('./authorList/polList.txt') as file:
    pol_author_list = [author.split('\n')[0] for author in file]
# Same format, for the authors listed as active in both subreddits
with open('./authorList/mutualAuthorList.txt') as file:
    mutual_author_list = [author.split('\n')[0] for author in file]

end_time = time.process_time()

# Report the elapsed process time and the size of each list
print('running time: ', end_time - start_time)
print('authors in The_Donald: ', len(don_author_list))
print('authors in politics: ', len(pol_author_list))
print('authors in both The_Donald and politics: ', len(mutual_author_list))

running time:  0.26112500000000005
authors in The_Donald:  145776
authors in politics:  251153
authors in both The_Donald and politics:  48168


### Sentiment Score Distribution for users in subreddits


In [2]:
import os
import pandas as pd


def getAuthorSentiscores(file_path, folder_path, count_limit=50):
    """Compute the average ``senti_score`` of each listed author.

    The author names are read from ``file_path`` (one per line). Every file in
    ``folder_path`` is then read as a tab-separated table with ``author`` and
    ``senti_score`` columns, and per-author comment counts and score sums are
    accumulated across all files. Rows whose author is not in the list are
    ignored. Files that cannot be read, or that lack the expected columns, are
    reported and skipped.

    Parameters
    ----------
    file_path : str
        Path of the text file holding one author name per line.
    folder_path : str
        Folder holding the tab-separated comment files.
    count_limit : int, default 50
        Only authors with strictly more than this many comments are returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by author with a single ``senti_score`` column (the average
        score), sorted from highest to lowest. The frame is empty, but still
        has the ``senti_score`` column, when no author qualifies.
    """
    with open(file_path) as author_file:
        author_list = [line.split('\n')[0] for line in author_file]
    known_authors = set(author_list)

    # Running totals, keyed by author
    author_comm_count = {key: 0 for key in author_list}
    author_scores = {key: 0.0 for key in author_list}

    files = os.listdir(folder_path)
    file_num = len(files)

    for process, file in enumerate(files, start=1):
        filename = '{0}/{1}'.format(folder_path, file)
        try:
            df = pd.read_csv(filename, sep='\t')
            # Keep only rows written by listed authors, so an unknown author
            # can no longer abort the processing of the whole file.
            known = df[df['author'].isin(known_authors)]
            scores = pd.to_numeric(known['senti_score'], errors='coerce')
            counts = known['author'].value_counts()
            sums = scores.groupby(known['author']).sum()
        except (OSError, ValueError, KeyError, TypeError) as err:
            # Best effort: report the unreadable/malformed file and move on.
            print('Skipping {0}: {1}'.format(filename, err))
            continue

        # Totals are only updated once the whole file was parsed successfully,
        # so a bad file never leaves counts and sums out of step.
        for author, n_comments in counts.items():
            author_comm_count[author] += int(n_comments)
            author_scores[author] += float(sums[author])
        print('Processing {0}/{1}'.format(process, file_num))

    # Average score for authors above the comment threshold (computed once,
    # after every file has been read).
    author_avgScore = {
        author: author_scores[author] / n_comments
        for author, n_comments in author_comm_count.items()
        if n_comments > count_limit
    }

    df_avgScore = pd.DataFrame(
        {'senti_score': pd.Series(author_avgScore, dtype='float64')}
    )
    return df_avgScore.sort_values('senti_score', ascending=False)



In [1]:
# Per-author average senti_score for the 'The_Donald' subreddit, saved as TSV.
# NOTE(review): the first cell reads this list from './authorList/donList.txt';
# confirm that './donList.txt' is the intended location.
don_file_path, don_folder_path = './donList.txt', './The_Donald'
don_authors_senti = getAuthorSentiscores(
    file_path=don_file_path,
    folder_path=don_folder_path,
)
don_authors_senti.to_csv('./sentiscore_analysis/don_author.csv', sep='\t')

In [2]:
# Per-author average senti_score for the 'politics' subreddit, saved as TSV.
# NOTE(review): the first cell reads this list from './authorList/polList.txt';
# confirm that './polList.txt' is the intended location.
pol_file_path, pol_folder_path = './polList.txt', './politics'
pol_authors_senti = getAuthorSentiscores(
    file_path=pol_file_path,
    folder_path=pol_folder_path,
)
pol_authors_senti.to_csv('./sentiscore_analysis/pol_author.csv', sep='\t')

In [1]:
import pandas as pd
import plotly.plotly as py
import plotly.figure_factory as ff

# Read file from previously extracted
don_file_path = './sentiscore_analysis/don_author.csv'
don_senti_score = pd.read_csv(don_file_path, sep='\t')
don_senti_scores = don_senti_score['senti_score'].values

pol_file_path = './sentiscore_analysis/pol_author.csv'
pol_senti_score = pd.read_csv(pol_file_path, sep='\t')
pol_senti_scores = pol_senti_score['senti_score'].values


In [3]:
# Report how many per-author scores were loaded for each subreddit
for _label, _scores in (
    ('length of don senti scores', don_senti_scores),
    ('length of pol senti scores', pol_senti_scores),
):
    print(_label, len(_scores))

length of don senti scores 10107
length of pol senti scores 15655


In [5]:
# Legend labels, listed in the same order as the score arrays
group_labels = ['don_senti_scores', 'pol_senti_scores']
hist_data = [don_senti_scores, pol_senti_scores]

# Build the distribution figure with the histogram bars switched off
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=.2,
    show_hist=False,
)

# Hand the figure to py.iplot under this file name
py.iplot(fig, filename='Average Sentimental Score Distribution in two subreddits')

### Mutual author behavior in 'The_Donald' and 'politics'

In [2]:
def getMuAuthSentiScore(mu_auth_path, file_path):
    """Look up the stored sentiment score of every mutual author.

    Parameters
    ----------
    mu_auth_path : str
        Text file with one mutual-author name per line.
    file_path : str
        Tab-separated file read as two columns, ``author`` and
        ``senti_score``. No header row is declared to the parser, so a header
        line in the file is parsed as an ordinary row.

    Returns
    -------
    list of float
        Scores rounded to 4 decimals, one per distinct mutual author that
        appears in ``file_path``, in the order the authors first appear in
        ``mu_auth_path``. Authors missing from ``file_path`` are left out.
    """
    with open(mu_auth_path) as author_file:
        mutual_author_list = [line.split('\n')[0] for line in author_file]

    author_score = pd.read_csv(file_path, sep='\t', names=['author', 'senti_score'])

    # One dictionary lookup per author instead of scanning the whole table for
    # each of them. When an author occurs several times, the first row wins.
    first_rows = author_score.drop_duplicates(subset='author', keep='first')
    score_by_author = dict(zip(first_rows['author'], first_rows['senti_score']))

    # dict.fromkeys removes repeated names while keeping their original order.
    # Absence is tested by membership rather than with a sentinel value, so a
    # genuine score of exactly 1 is no longer discarded.
    return [
        round(float(score_by_author[author]), 4)
        for author in dict.fromkeys(mutual_author_list)
        if author in score_by_author
    ]

In [3]:
# Inputs: the mutual-author list and the two per-author score tables
mu_auth_path = './authorList/mutualAuthorList.txt'
don_file_path = './sentiscore_analysis/don_author.csv'
pol_file_path = './sentiscore_analysis/pol_author.csv'

# Scores of the mutual authors, looked up in each table in turn
mu_author_don, mu_author_pol = (
    getMuAuthSentiScore(mu_auth_path, score_file)
    for score_file in (don_file_path, pol_file_path)
)

In [9]:
import os
import pandas as pd

import plotly.plotly as py
import plotly.figure_factory as ff


# Legend labels, listed in the same order as the score lists
group_labels = ['don_senti_score', 'pol_senti_score']
hist_data = [mu_author_don, mu_author_pol]

# Build the distribution figure with the histogram bars switched off
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=.2,
    show_hist=False,
)

# The plotting call is currently disabled:
# py.iplot(fig, filename='Mutual_Author_Sentimental_Score_Distribution')

### Mutual author behavior comparison with other subreddits, like 'AskReddit'

In [8]:
import json
from textblob import TextBlob

def muAuthSentiSubreddit(file_path, mu_auth_path, subreddit, min_subjectivity=0.3):
    """Collect polarity scores of mutual authors' comments in one subreddit.

    ``file_path`` is read line by line, each line being parsed as one JSON
    object with ``subreddit``, ``author`` and ``body`` fields. For comments in
    ``subreddit`` written by an author listed in ``mu_auth_path``, the body is
    scored with TextBlob, and the polarity is kept when the subjectivity is
    strictly greater than ``min_subjectivity``.

    Parameters
    ----------
    file_path : str
        File with one JSON comment per line.
    mu_auth_path : str
        Text file with one mutual-author name per line.
    subreddit : str
        Subreddit name that a comment's ``subreddit`` field must equal.
    min_subjectivity : float, default 0.3
        Comments at or below this subjectivity are ignored.

    Returns
    -------
    list of float
        Polarity scores in file order.
    """
    # A set gives constant-time membership tests; the author list is checked
    # once for every comment in the subreddit.
    with open(mu_auth_path, 'r') as author_file:
        mutual_authors = {line.split('\n')[0] for line in author_file}

    scores_list = []
    # The file is only read, so it is opened read-only
    with open(file_path, 'r') as reddit_file:
        for line in reddit_file:
            if not line.strip():
                continue
            try:
                jsn = json.loads(line)
                if jsn['subreddit'] != subreddit or jsn['author'] not in mutual_authors:
                    continue
                sentiment = TextBlob(jsn['body']).sentiment
            except (ValueError, KeyError, TypeError):
                # Malformed JSON, a missing field or a non-text value:
                # skip this line and keep going.
                continue
            if sentiment.subjectivity > min_subjectivity:
                scores_list.append(sentiment.polarity)
    return scores_list



In [50]:
def sentiScoreComparison(score_path):
    """Load sentiment scores from a text file holding one score per line.

    The function only loads the scores; building and plotting the comparison
    figure is left to the caller.

    Parameters
    ----------
    score_path : str
        Path of the score file.

    Returns
    -------
    list of float
        The scores in file order, each rounded to 4 decimals. Blank lines
        (for example a trailing empty line) are skipped.

    Raises
    ------
    ValueError
        If a non-blank line cannot be parsed as a number.
    """
    score_list = []
    with open(score_path, 'r') as file:
        for line in file:
            text = line.strip()
            if not text:
                # float('') would raise; an empty line carries no score
                continue
            score_list.append(round(float(text), 4))
    return score_list

In [60]:
# Load the score list of each subreddit, keeping at most the first 6000 scores
don_path = './sentiscore_analysis/the_donald_score_list.csv'
don_list = sentiScoreComparison(don_path)[:6000]
pol_path = './sentiscore_analysis/politics_score_list.csv'
pol_list = sentiScoreComparison(pol_path)[:6000]
news_path = './sentiscore_analysis/news_score_list.csv'
news_list = sentiScoreComparison(news_path)[:6000]
# NOTE(review): this path used to repeat news_score_list.csv, so the
# "world_news" curve was a copy of the "news" one. The file name below follows
# the naming pattern of the other score lists -- confirm it matches the file
# on disk.
world_news_path = './sentiscore_analysis/worldnews_score_list.csv'
world_news_list = sentiScoreComparison(world_news_path)[:6000]
ask_reddit_path = './sentiscore_analysis/askreddit_score_list.csv'
askreddit_list = sentiScoreComparison(ask_reddit_path)[:6000]

In [62]:
# Legend labels, listed in the same order as the score lists
group_labels = [
    'senti_score in "The_Donald"',
    'senti_score in "politics"',
    'senti_score in "AskReddit"',
    'senti_score in "news"',
    'senti_score in "world_news"',
]
hist_data = [don_list, pol_list, askreddit_list, news_list, world_news_list]

# Build the distribution figure with the histogram bars switched off
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=.2,
    show_hist=False,
)

# The plotting call is currently disabled:
# py.iplot(fig, filename='Mutual_Author_Sentimental_Score_Distribution_Comparison')

### General Sentiment Score Distribution in 'The_Donald' and 'politics'

In [None]:
import os
import pandas as pd

import plotly.plotly as py
import plotly.figure_factory as ff

import numpy as np

def getSentiScore(file_path):
    """Gather the ``senti_score`` values of every file in a folder.

    Each file in ``file_path`` is read as a tab-separated table and its
    ``senti_score`` column is appended to the result. Files are visited in
    sorted name order so the result (and any slice taken from it) is the same
    on every run. Files that cannot be read or have no ``senti_score`` column
    are reported and skipped.

    Parameters
    ----------
    file_path : str
        Folder holding the comment files, with or without a trailing slash.

    Returns
    -------
    list
        All scores, file after file.
    """
    senti_scores = []

    for file in sorted(os.listdir(file_path)):
        # os.path.join works whether or not file_path ends with a separator
        filename = os.path.join(file_path, file)
        try:
            df_high_comm = pd.read_csv(filename, sep='\t')
            senti_scores += df_high_comm['senti_score'].tolist()
        except (OSError, ValueError, KeyError) as err:
            # Best effort: report the problem instead of hiding it
            print('Skipping {0}: {1}'.format(filename, err))
    return senti_scores

# Folders holding the comment files of each subreddit
don_file_path = './The_Donald/'
pol_file_path = './politics/'

# Every senti_score found under each folder
don_senti_score = getSentiScore(don_file_path)
pol_senti_score = getSentiScore(pol_file_path)

# Plot a sample only: at most the first 10K scores of each subreddit
group_labels = ['don_senti_score', 'pol_senti_score']
hist_data = [
    don_senti_score[:10000],
    pol_senti_score[:10000],
]

# Build the distribution figure with the histogram bars switched off
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=.2,
    show_hist=False,
)

# Hand the figure to py.iplot under this file name
py.iplot(fig, filename='Sentimental Score Distribution in two subreddits')

### Sentiment Score Distribution according to some keywords

In [1]:
import os
import pandas as pd

import plotly.plotly as py
import plotly.figure_factory as ff

import numpy as np

def getSentiScore(file_path, keywords):
    """Gather the ``senti_score`` of comments whose body contains a keyword.

    Each file in ``file_path`` is read as a tab-separated table with ``body``
    and ``senti_score`` columns. A row is kept when ``keywords`` occurs in its
    body, compared case-insensitively as a plain substring. Rows without a
    body are ignored. Files are visited in sorted name order; files that
    cannot be read or lack the expected columns are reported and skipped.

    Parameters
    ----------
    file_path : str
        Folder holding the comment files, with or without a trailing slash.
    keywords : str
        Text to look for in the comment bodies.

    Returns
    -------
    list
        Scores of the matching comments, file after file, in row order.
    """
    needle = keywords.lower()
    senti_scores = []

    for file in sorted(os.listdir(file_path)):
        # os.path.join works whether or not file_path ends with a separator
        filename = os.path.join(file_path, file)
        try:
            df_high_comm = pd.read_csv(filename, sep='\t')
            bodies = df_high_comm['body']
            scores = df_high_comm['senti_score']
        except (OSError, ValueError, KeyError) as err:
            # Best effort: report the problem instead of hiding it
            print('Skipping {0}: {1}'.format(filename, err))
            continue

        # A missing body is simply a non-match; it must not abort the rest of
        # the file, which is what calling .lower() on a NaN used to cause.
        has_keyword = bodies.notna() & (
            bodies.astype(str).str.lower().str.contains(needle, regex=False)
        )
        senti_scores.extend(scores[has_keyword].tolist())
    return senti_scores



def keyWordsSentiDistribution(keywords):
    """Plot the score distribution of comments that mention ``keywords``.

    Scores are gathered with getSentiScore from the './The_Donald/' and
    './politics/' folders, at most the first 10000 per folder are kept, and
    the resulting figure is passed to py.iplot, whose result is returned.
    """
    # One sample of keyword-matching scores per subreddit folder
    hist_data = [
        getSentiScore(folder, keywords)[:10000]
        for folder in ('./The_Donald/', './politics/')
    ]
    group_labels = ['don_senti_score', 'pol_senti_score']

    # Distribution figure with a bin size of 0.2
    fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)

    return py.iplot(fig, filename='Sentimental Score Distribution')
    

In [None]:
def keyWordsSentiDistribution(file_path, subreddit, keywords):
    """Score the comments of one subreddit whose body contains ``keywords``.

    ``file_path`` is read line by line, each line being parsed as one JSON
    object with ``subreddit`` and ``body`` fields. For comments in
    ``subreddit`` whose body contains ``keywords`` (case-sensitive substring
    match), the TextBlob polarity of the body is collected.

    NOTE(review): this notebook also defines a one-argument
    ``keyWordsSentiDistribution(keywords)``. Running this cell replaces it, and
    the one-argument calls further down then fail -- consider giving the two
    functions different names.

    Parameters
    ----------
    file_path : str
        File with one JSON comment per line.
    subreddit : str
        Subreddit name that a comment's ``subreddit`` field must equal.
    keywords : str
        Text that must occur in the comment body.

    Returns
    -------
    list of float
        Polarity scores rounded to 4 decimals, in file order.
    """
    scores_list = []
    # The file is only read, so it is opened read-only
    with open(file_path, 'r') as reddit_file:
        for line in reddit_file:
            if not line.strip():
                continue
            try:
                jsn = json.loads(line)
                if jsn['subreddit'] != subreddit or keywords not in jsn['body']:
                    continue
                score = TextBlob(jsn['body']).sentiment.polarity
            except (ValueError, KeyError, TypeError):
                # Malformed JSON, a missing field or a non-text value:
                # skip this line and keep going.
                continue
            scores_list.append(round(float(score), 4))
    return scores_list
    

### Keywords like 'Trump', 'Hillary', 'Election' might be interesting 

In [3]:
# NOTE(review): one-argument call -- it matches keyWordsSentiDistribution(keywords).
# If the three-argument definition of the same name has been run last, this raises TypeError.
keyWordsSentiDistribution('trump')

In [43]:
# NOTE(review): one-argument call -- it matches keyWordsSentiDistribution(keywords).
# If the three-argument definition of the same name has been run last, this raises TypeError.
keyWordsSentiDistribution('Hillary')

In [41]:
# NOTE(review): one-argument call -- it matches keyWordsSentiDistribution(keywords).
# If the three-argument definition of the same name has been run last, this raises TypeError.
keyWordsSentiDistribution('election')