### Get authors in 'The_Donald' and 'politics' subreddits

In [1]:
import os
import time

# Start the CPU-time clock before any author file is read
start_time = time.process_time()

# Each author file holds one name per line; keep the text before the newline
with open('./authorList/donList.txt') as file:
    don_author_list = [author.split('\n')[0] for author in file]

with open('./authorList/polList.txt') as file:
    pol_author_list = [author.split('\n')[0] for author in file]

# Authors that were previously extracted as active in both subreddits
with open('./authorList/mutualAuthorList.txt') as file:
    mutual_author_list = [author.split('\n')[0] for author in file]

end_time = time.process_time()

# Report the elapsed CPU time followed by the size of every list
print('running time: ', (end_time-start_time))
for caption, names in (
    ('authors in The_Donald: ', don_author_list),
    ('authors in politics: ', pol_author_list),
    ('authors in both The_Donald and politics: ', mutual_author_list),
):
    print(caption, len(names))

running time:  0.2668570000000001
authors in The_Donald:  145776
authors in politics:  251153
authors in both The_Donald and politics:  48168


### Sentiment Score Distribution for each user in 'The_Donald' and 'politics'

In [2]:
import os
import pandas as pd
import plotly.plotly as py
import plotly.figure_factory as ff

def getAuthorSentiscores(file_path, folder_path, count_limit=50):
    """Return the average sentiment score of every sufficiently active author.

    Parameters
    ----------
    file_path : str
        Text file with one author name per line. Only these authors are
        tracked; rows written by any other author are ignored.
    folder_path : str
        Folder of tab-separated files, each expected to provide an 'author'
        and a 'senti_score' column.
    count_limit : int, optional
        An author is kept only when their total number of rows across all
        files is strictly greater than this value (default 50).

    Returns
    -------
    numpy.ndarray
        Per-author mean of 'senti_score', sorted in descending order. Empty
        when no author exceeds ``count_limit``.

    Notes
    -----
    Files that cannot be read or lack the expected columns are skipped with a
    printed message and contribute nothing to counts or sums.
    """
    with open(file_path) as file:
        author_list = [line.rstrip('\n') for line in file]
    tracked_authors = set(author_list)

    # Running totals per tracked author, accumulated across every file
    author_comm_count = dict.fromkeys(author_list, 0)
    author_scores = dict.fromkeys(author_list, 0.0)

    files = os.listdir(folder_path)
    file_num = len(files)

    for process, file in enumerate(files, start=1):
        filename = os.path.join(folder_path, file)
        try:
            df = pd.read_csv(filename, sep='\t')
            known = df[df['author'].isin(tracked_authors)]
            per_author = known.groupby('author')['senti_score']
            # size() counts every row; sum() skips missing scores
            counts = per_author.size()
            sums = per_author.sum().astype(float)
        except (OSError, KeyError, ValueError, TypeError) as err:
            # Unreadable file or unexpected layout: skip it entirely so the
            # counts and sums never get out of step with each other
            print('Skipping {0}: {1}'.format(filename, err))
            continue

        for author, n_comments in counts.items():
            author_comm_count[author] += int(n_comments)
        for author, total in sums.items():
            author_scores[author] += float(total)
        print('Processing {0}/{1}'.format(process, file_num))

    # Average once, after all files are accumulated
    averages = [
        author_scores[author] / n_comments
        for author, n_comments in author_comm_count.items()
        if n_comments > count_limit
    ]
    return pd.Series(averages, dtype=float).sort_values(ascending=False).values


# Keep the original (misspelled) name available for any existing caller
gextAuthorSentiscores = getAuthorSentiscores



In [None]:
# Calculate average senti_score for each author in 'The_Donald' subreddit
# NOTE(review): the first cell reads the author lists from './authorList/';
# confirm that './donList.txt' and './polList.txt' are the intended locations.
don_file_path = './donList.txt' # Path to the author list in 'The_Donald'
don_folder_path = './The_Donald' # Previous extracted data folder
don_authors_senti = getAuthorSentiscores(don_file_path, don_folder_path)
# (disabled) export of the per-author scores; the value above is a plain array
# of scores, so it would need wrapping in a DataFrame/Series before `to_csv`:
# don_authors_senti.to_csv('./sentiscore_analysis/don_author.csv', sep='\t')

# Calculate average senti_score for each author in 'politics' subreddit
pol_file_path = './polList.txt' # Path to the author list in 'politics'
pol_folder_path = './politics'
pol_authors_senti = getAuthorSentiscores(pol_file_path, pol_folder_path)
# (disabled) same export as above, for 'politics':
# pol_authors_senti.to_csv('./sentiscore_analysis/pol_author.csv', sep='\t')


# Group the two score arrays; the order must match `group_labels` below
hist_data = [don_authors_senti, pol_authors_senti]

group_labels = ['don_senti_scores', 'pol_senti_scores']

# Create distplot with custom bin_size; show_hist=False draws no histogram bars
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, bin_size=.2)

# Render the figure through plotly under the given filename
py.iplot(fig, filename='Average Sentimental Score Distribution in two subreddits')

### Mutual author behavior in 'The_Donald' and 'politics'

In [2]:
import os
import pandas as pd
import json
from textblob import TextBlob
import plotly.plotly as py
import plotly.figure_factory as ff

def muAuthSentiSubreddit(file_path, mu_auth_path, subreddit, min_subjectivity=0.3):
    """Collect polarity scores of mutual authors' comments in one subreddit.

    Parameters
    ----------
    file_path : str
        Dataset file with one JSON object per line; each object is expected to
        carry 'subreddit', 'author' and 'body' keys.
    mu_auth_path : str
        Text file with one mutual-author name per line.
    subreddit : str
        Only comments whose 'subreddit' equals this value are scored.
    min_subjectivity : float, optional
        A comment is kept only when its TextBlob subjectivity is strictly
        greater than this value (default 0.3).

    Returns
    -------
    list of float
        TextBlob polarity of every kept comment, in file order.

    Notes
    -----
    Blank lines, lines that are not valid JSON, records missing one of the
    expected keys, and records whose body is not a string are skipped.
    """
    # A set makes the per-comment membership test constant time
    with open(mu_auth_path, 'r') as file:
        mutual_authors = {line.rstrip('\n') for line in file}

    scores_list = []
    # The dataset is only read, so open it read-only
    with open(file_path, 'r') as reddit_file:
        for line in reddit_file:
            if not line.strip():
                continue
            try:
                jsn = json.loads(line)
                if jsn['subreddit'] != subreddit or jsn['author'] not in mutual_authors:
                    continue
                statement = jsn['body']
            except (ValueError, KeyError, TypeError):
                # Malformed JSON, non-object record, or missing field
                continue
            if not isinstance(statement, str):
                continue

            sentiment = TextBlob(statement).sentiment
            # Keep only sufficiently subjective comments
            if sentiment.subjectivity > min_subjectivity:
                scores_list.append(sentiment.polarity)
    return scores_list

In [None]:
# Inputs for the mutual-author analysis; `file_path` and `mu_auth_path` are
# reused by later cells in this notebook.
file_path = './cleands2' # path to the dataset
mu_auth_path = './authorList/mutualAuthorList.txt' # path to the mutual author list
# Each call below makes its own full pass over the dataset file.
don_score_list = muAuthSentiSubreddit(file_path, mu_auth_path, 'The_Donald') 
pol_score_list = muAuthSentiSubreddit(file_path, mu_auth_path, 'politics') 

# Group the two score lists; the order must match `group_labels` below
hist_data = [don_score_list, pol_score_list]

group_labels = ['senti_score in "The_Donald"', 'senti_score in "politics"']

# Create distplot with custom bin_size; show_hist=False draws no histogram bars
fig = ff.create_distplot(hist_data, group_labels,show_hist=False, bin_size=.2)

# Render the figure through plotly under the given filename
py.iplot(fig, filename='Mutual_Author_Sentimental_Score_Distribution')

### Mutual author behavior comparison with other subreddits, like 'AskReddit', 'news'

In [50]:
def sentiScoreComparison(score_list, don_score_list, pol_score_list, subreddit='news'):
    """Plot the score distribution of a third subreddit against the two baselines.

    Parameters
    ----------
    score_list : list of float
        Scores of the subreddit being compared.
    don_score_list : list of float
        Scores from 'The_Donald'.
    pol_score_list : list of float
        Scores from 'politics'.
    subreddit : str, optional
        Name used in the legend label of ``score_list``. Defaults to 'news',
        which reproduces the previously hard-coded label.

    Returns
    -------
    The value returned by ``py.iplot`` for the generated figure.
    """
    # Group data together; the order must match group_labels
    hist_data = [don_score_list, pol_score_list, score_list]

    group_labels = [
        'senti_score in "The_Donald"',
        'senti_score in "politics"',
        'senti_score in "{0}"'.format(subreddit),
    ]

    # Create distplot with custom bin_size; show_hist=False draws no histogram bars
    fig = ff.create_distplot(hist_data, group_labels, show_hist=False, bin_size=.2)

    # Plot!
    return py.iplot(fig, filename='Mutual_Author_Sentimental_Score_Distribution Comparison')

In [None]:
# Relies on `file_path`, `mu_auth_path`, `don_score_list` and `pol_score_list`
# defined by the mutual-author cell earlier in the notebook.
# NOTE(review): as called here, every plot labels its third trace
# 'senti_score in "news"' and uses the same plot filename, including the
# AskReddit and world_news comparisons -- confirm that is intended.

# Get sentiment scores for mutual authors in the 'news' subreddit
news_score_list = muAuthSentiSubreddit(file_path, mu_auth_path, 'news') 
# Plot comparison figure
sentiScoreComparison(news_score_list, don_score_list, pol_score_list)

# Get sentiment scores for mutual authors in the 'AskReddit' subreddit
askreddit_score_list = muAuthSentiSubreddit(file_path, mu_auth_path, 'AskReddit')
# Plot comparison figure
sentiScoreComparison(askreddit_score_list, don_score_list, pol_score_list)

# Get sentiment scores for mutual authors in the 'world_news' subreddit
# NOTE(review): the name is matched exactly against each record's 'subreddit'
# field; verify the dataset spells it 'world_news' (not e.g. 'worldnews').
worldnews_score_list = muAuthSentiSubreddit(file_path, mu_auth_path, 'world_news')
# Plot comparison figure
sentiScoreComparison(worldnews_score_list, don_score_list, pol_score_list)

### Sentiment Score Distribution per post in 'The_Donald' and 'politics'

In [None]:
import os
import pandas as pd

import plotly.plotly as py
import plotly.figure_factory as ff

import numpy as np

def getSentiScore(file_path):
    """Gather every 'senti_score' value from the tab-separated files in a folder.

    Parameters
    ----------
    file_path : str
        Folder containing tab-separated files with a 'senti_score' column.
        A trailing path separator is accepted but not required.

    Returns
    -------
    list
        All 'senti_score' values, concatenated file by file. Files are visited
        in sorted name order so the result is reproducible across runs.

    Notes
    -----
    Entries that cannot be read as a table or lack a 'senti_score' column are
    skipped with a printed message.
    """
    senti_scores = []

    # os.listdir order is arbitrary; sort so slices of the result are stable
    for file in sorted(os.listdir(file_path)):
        filename = os.path.join(file_path, file)
        try:
            df_high_comm = pd.read_csv(filename, sep='\t')
            scores = df_high_comm['senti_score'].tolist()
        except (OSError, KeyError, ValueError) as err:
            # Unreadable entry, empty/malformed table, or missing column
            print('Skipping {0}: {1}'.format(filename, err))
            continue
        senti_scores += scores
    return senti_scores

# Collect every senti_score stored under each subreddit's folder.
# These assignments rebind `don_file_path` / `pol_file_path` to folder paths.
don_file_path = './The_Donald/'
don_senti_score = getSentiScore(don_file_path)
pol_file_path = './politics/'
pol_senti_score = getSentiScore(pol_file_path)


# Group data together
# Only the first 10,000 scores of each list are plotted (a leading slice, not
# a random sample), to keep the distribution plot manageable.
hist_data = [don_senti_score[:10000], pol_senti_score[:10000]]

group_labels = ['don_senti_score', 'pol_senti_score']

# Create distplot with custom bin_size; show_hist=False draws no histogram bars
fig = ff.create_distplot(hist_data, group_labels,show_hist=False, bin_size=.2)

# Render the figure through plotly under the given filename
py.iplot(fig, filename='Sentimental Score Distribution in two subreddits')

### Sentiment Score Distribution according to some keywords

In [3]:
import os
import pandas as pd

import plotly.plotly as py
import plotly.figure_factory as ff

import numpy as np

def keyWordsSentiDistribution(file_path, subreddit, keywords):
    """Collect polarity scores of comments in a subreddit that mention a keyword.

    Parameters
    ----------
    file_path : str
        Dataset file with one JSON object per line; each object is expected to
        carry 'subreddit' and 'body' keys.
    subreddit : str
        Only comments whose 'subreddit' equals this value are considered.
    keywords : str
        Text searched for, case-insensitively, as a substring of the comment
        body.

    Returns
    -------
    list of float
        TextBlob polarity of every matching comment, rounded to 4 decimals,
        in file order.

    Notes
    -----
    Blank lines, lines that are not valid JSON, records missing one of the
    expected keys, and records whose body is not a string are skipped.
    """
    # Lower-case the search text once instead of once per line
    needle = keywords.lower()

    scores_list = []
    # The dataset is only read, so open it read-only
    with open(file_path, 'r') as reddit_file:
        for line in reddit_file:
            if not line.strip():
                continue
            try:
                jsn = json.loads(line)
                if jsn['subreddit'] != subreddit:
                    continue
                statement = jsn['body']
            except (ValueError, KeyError, TypeError):
                # Malformed JSON, non-object record, or missing field
                continue
            if not isinstance(statement, str) or needle not in statement.lower():
                continue

            score = TextBlob(statement).sentiment.polarity  # Calculate polarity score
            scores_list.append(round(float(score), 4))
    return scores_list

def KeyWordsComparison(subreddit, keywords, file_path='./cleands2'):
    """Plot keyword-related score distributions for several subreddits.

    Parameters
    ----------
    subreddit : sequence of str
        Subreddit names to compare; one trace is drawn per name, and each
        legend label is built from the name itself.
    keywords : str
        Text that a comment body must contain (case-insensitive) to be scored.
    file_path : str, optional
        Path to the dataset file. Defaults to './cleands2', the previously
        hard-coded location.

    Returns
    -------
    The value returned by ``py.iplot`` for the generated figure.
    """
    # One score list per requested subreddit, in the order given
    hist_data = [
        keyWordsSentiDistribution(file_path, name, keywords)
        for name in subreddit
    ]

    # Labels follow the same order, so they always describe the right trace
    group_labels = ['senti_score in "{0}"'.format(name) for name in subreddit]

    # Create distplot with custom bin_size; show_hist=False draws no histogram bars
    fig = ff.create_distplot(hist_data, group_labels, show_hist=False, bin_size=.2)

    # Plot!
    return py.iplot(fig, filename='Sentimental Score Distribution about certian keywords')


### Keywords like 'Trump', 'Hillary' might be interesting

In [None]:
# Sentiment score distribution for comments mentioning 'trump'
KeyWordsComparison(['The_Donald', 'politics', 'news'], 'trump')

# Sentiment score distribution for comments mentioning 'hillary'
KeyWordsComparison(['The_Donald', 'politics', 'news'], 'hillary')