### Extract related information about users in 'The_Donald' and 'politics' subreddits

In [2]:
import pandas as pd
import numpy as np
import json
import os
import time
import math
from textblob import TextBlob

def cleanComm(comment):
    """Return *comment* with awkward characters swapped for placeholder tokens.

    Line feed, carriage return, tab, backslash, apostrophe and double quote
    are each replaced by a bracketed marker so the text can be stored in a
    tab-separated file without breaking rows or quoting.
    """
    # None of the placeholder tokens contains a character that is itself
    # replaced, so a single translation pass gives the same result as
    # applying the substitutions one after another.
    placeholders = str.maketrans({
        '\n': '[lf}',
        '\r': '[cr}',
        '\t': '[ts}',
        '\\': '[2s}',
        '\'': '[ap}',
        '\"': '[dap}',
    })
    return comment.translate(placeholders)

def _writeChunk(records, subreddit, chunk_no):
    """Filter one batch of parsed comments to *subreddit*, score it and save it.

    Keeps comments whose ``subreddit`` field equals *subreddit* and whose
    author is not ``'[deleted]'``, adds a ``senti_score`` column holding the
    TextBlob polarity of each comment body, sanitizes the body with
    ``cleanComm`` and writes the result to ``./<subreddit>/data_<chunk_no>.csv``
    (tab-separated, UTF-8).
    """
    # Create the output directory on first use.
    os.makedirs(subreddit, exist_ok=True)
    filename = './{0}/data_{1}.csv'.format(subreddit, chunk_no)

    # Convert the parsed JSON objects to a DataFrame and narrow it down.
    df = pd.DataFrame(records)
    df = df[df['subreddit'] == subreddit]
    df_comm = df.loc[
        df['author'] != '[deleted]',
        ['author', 'score', 'body', 'link_id', 'created_utc'],
    ].copy()

    bodies = df_comm['body'].tolist()
    # Polarity is computed on the raw text; the body is sanitized only
    # afterwards so the placeholder tokens do not influence the score.
    df_comm['senti_score'] = [TextBlob(text).sentiment.polarity for text in bodies]
    df_comm['body'] = [cleanComm(text) for text in bodies]

    # Write into csv format file for secondary processing
    df_comm.to_csv(filename, sep='\t', encoding='utf-8')


def extractInformation(file_path, subreddit, chunk_size=2000000):
    """Split a line-delimited JSON comment dump into per-subreddit CSV chunks.

    The file at *file_path* is read line by line; every line that parses as
    JSON is collected, and each time *chunk_size* records have accumulated
    they are filtered, scored and written by ``_writeChunk``. Records left
    over at the end of the file are written as a final, shorter chunk.
    Lines that are blank or not valid JSON are skipped.

    Args:
        file_path: Path of the input file, one JSON object per line.
        subreddit: Subreddit name to keep; also used as the output folder.
        chunk_size: Number of parsed records per output file
            (default 2 million).

    Progress and elapsed CPU time are printed after every chunk.
    """
    start_time = time.process_time()

    # Count the lines by streaming so the whole dump is never held in memory;
    # the total is only used for the progress report.
    with open(file_path, 'r') as reddit_file:
        total_lines = sum(1 for _ in reddit_file)

    def flush(records, parsed_so_far):
        # Chunk files are numbered 1, 2, 3, ... in the order they are written.
        _writeChunk(records, subreddit, math.ceil(parsed_so_far / chunk_size))
        print('{0} of {1} lines read ({2}%)'.format(
            parsed_so_far, total_lines, int(parsed_so_far / total_lines * 100)))
        print('running time: ', (time.process_time() - start_time))

    curr_line = 0  # number of successfully parsed records so far
    records = []
    with open(file_path, 'r') as reddit_file:
        for line in reddit_file:
            try:
                records.append(json.loads(line))
            except ValueError:
                # Blank or malformed line: ignore it and keep going.
                continue
            curr_line += 1
            if len(records) == chunk_size:
                flush(records, curr_line)
                records = []

    # Write whatever is left; counting parsed records (not raw lines) means
    # skipped lines can no longer cause the tail of the data to be lost.
    if records:
        flush(records, curr_line)


In [16]:
# file_path = './cleands2'
# extractInformation(file_path, 'The_Donald')
# extractInformation(file_path, 'politics')

### Extract the highest-scoring authors

In [6]:
def extractHighScoreAuth(file_path):
    """Write the top-200 authors by total score for every CSV chunk in a folder.

    Each file in *file_path* is read as a tab-separated table with ``author``
    and ``score`` columns. Scores are summed per author, the 200 highest
    totals are kept, and the result is written to
    ``./<folder>_highScore/highScore_<n>.csv`` where ``<folder>`` is taken
    from *file_path* and ``<n>`` counts the files processed successfully.
    Rows with a missing author are not counted.

    Args:
        file_path: Folder holding the preprocessed chunk files,
            e.g. ``'./The_Donald/'``.

    Files that cannot be read or lack the expected columns are reported and
    skipped instead of aborting the run.
    """
    # Use preprocessing data; list all files in specific folder
    files = os.listdir(file_path)
    file_count = len(files)
    count = 1

    # Name the output folder after the first real component of the input
    # path ('./The_Donald/' -> './The_Donald_highScore/'). This comes from the
    # parameter, not from a notebook-level global.
    source_name = next(
        (part for part in file_path.split('/') if part not in ('', '.', '..')),
        '',
    )
    folderName = './' + source_name + '_highScore/'

    for file in files:
        filename = os.path.join(file_path, file)
        try:
            # Read csv file and keep only the designated columns
            df_author_score = pd.read_csv(filename, sep='\t')
            df_author_score = df_author_score.loc[:, ['author', 'score']]
            df_author_score = df_author_score.sort_values('score', ascending=False)

            # Sum of scores per author in a single pass
            totals = df_author_score.groupby('author', sort=False)['score'].sum()
            df_scores = totals.astype('int64').to_frame('scores')
        except (OSError, ValueError, KeyError) as err:
            # Unreadable file, parse failure or missing column: skip this
            # file but say so, rather than failing silently.
            print('Skipping {0}: {1}'.format(filename, err))
            continue

        # Drop the index label so the CSV header matches the established
        # format (unnamed author column followed by 'scores').
        df_scores = df_scores.rename_axis(None)
        # Keep the authors whose total scores rank in the top 200 of this file
        df_scores = df_scores.sort_values('scores', ascending=False).head(200)

        # Generate filename and write into csv format file
        os.makedirs(folderName, exist_ok=True)
        to_file_name = '{0}highScore_{1}.csv'.format(folderName, count)
        df_scores.to_csv(to_file_name, sep='\t')
        print('Processing {0}/{1}'.format(count, file_count))
        count += 1
        
        

In [15]:
# filepath = './The_Donald/'
# extractHighScoreAuth(filepath)

In [14]:
# filepath = './politics/'
# extractHighScoreAuth(filepath)

### Extract the most frequently linked authors

In [9]:
import pandas as pd
import numpy as np
import json
import os
import time
import math

def extractHighComm(file_path):
    """Write the top-200 most-linked authors for every CSV chunk in a folder.

    Each file in *file_path* is read as a tab-separated table with ``author``,
    ``link_id`` and ``created_utc`` columns. For every ``link_id`` the author
    of the earliest comment (smallest ``created_utc``) is credited with the
    number of comments carrying that ``link_id``; credits are summed per
    author and the 200 largest totals are written to
    ``./<folder>_highlinked/highLinked_<n>.csv``, where ``<folder>`` is taken
    from *file_path* and ``<n>`` counts the files processed successfully.
    Threads whose earliest comment has a missing author are not counted.

    Args:
        file_path: Folder holding the preprocessed chunk files,
            e.g. ``'./politics/'``.

    Files that cannot be read or lack the expected columns are reported and
    skipped instead of aborting the run.
    """
    # Use preprocessing data; list all files in specific folder
    files = os.listdir(file_path)
    file_count = len(files)
    count = 1

    # Name the output folder after the first real component of the input
    # path ('./politics/' -> './politics_highlinked/'). This comes from the
    # parameter, not from a notebook-level global.
    source_name = next(
        (part for part in file_path.split('/') if part not in ('', '.', '..')),
        '',
    )
    folderName = './' + source_name + '_highlinked/'

    for file in files:
        filename = os.path.join(file_path, file)
        try:
            # Read all rows with designated columns: author, link_id, created_utc
            df_author_comm = pd.read_csv(filename, sep='\t')
            df_author_comm = df_author_comm.loc[:, ['author', 'link_id', 'created_utc']]
            link_id_count = df_author_comm['link_id'].value_counts()

            # One stable sort by time, then the first row per link_id is the
            # earliest comment; its author is assumed to be the top-level
            # commenter. Ties on created_utc resolve to file order.
            first_comments = (
                df_author_comm.dropna(subset=['link_id'])
                .sort_values('created_utc', kind='stable')
                .drop_duplicates('link_id')
            )
            first_author = first_comments.set_index('link_id')['author']
            # Line the authors up with the per-thread comment counts.
            first_author = first_author.reindex(link_id_count.index)

            # Total comments per crediting author.
            totals = link_id_count.groupby(first_author.to_numpy(), sort=False).sum()
            df_comm = totals.to_frame('comm_below')
        except (OSError, ValueError, KeyError) as err:
            # Unreadable file, parse failure or missing column: skip this
            # file but say so, rather than failing silently.
            print('Skipping {0}: {1}'.format(filename, err))
            continue

        # Drop the index label so the CSV header matches the established
        # format (unnamed author column followed by 'comm_below').
        df_comm = df_comm.rename_axis(None)
        # Keep the top 200 most frequently linked authors
        df_comm = df_comm.sort_values('comm_below', ascending=False).head(200)

        # Generate filename and write into csv format file
        os.makedirs(folderName, exist_ok=True)
        to_file_name = '{0}highLinked_{1}.csv'.format(folderName, count)
        df_comm.to_csv(to_file_name, sep='\t')
        # Monitor the processing
        print('Processing {0}/{1}'.format(count, file_count))
        count += 1

In [13]:
# filepath = './The_Donald/'
# extractHighComm(filepath)

In [12]:
# filepath = './politics/'
# extractHighComm(filepath)