# This notebook was run for r/SGExams and r/teenagers
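Change the `sub` variable below to the name of the subreddit you want to analyse; it is used to build the dataset file names throughout the notebook.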

In [None]:
# Subreddit name; the cells below concatenate it into the input and output CSV paths.
sub = "sgexams"

# Identifying Active Users

## Import necessary libraries and initialize Reddit object with Reddit developer credentials

In [None]:
import os

import pandas as pd
import praw

# Supply your Reddit developer credentials through the REDDIT_* environment
# variables so that real secrets never have to be saved inside the notebook.
# The placeholder strings are only fallbacks for when a variable is unset.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "my-client-id"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "my-client-secret"),
    password=os.environ.get("REDDIT_PASSWORD", "my-password"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "my-user-agent"),
    username=os.environ.get("REDDIT_USERNAME", "my-username"),
)

## Load datasets saved from get_reddit_data.ipynb

In [None]:
# Both dumps live under Datasets/ and share the subreddit name as a file prefix.
dataset_prefix = "Datasets/" + sub
post_df = pd.read_csv(dataset_prefix + "_posts.csv")
comment_df = pd.read_csv(dataset_prefix + "_comments.csv")

## Get attributes of each user:
- Username
- Number of posts/comments made in the chosen subreddit
- UNIX time of first post/comment
- UNIX time of last post/comment
- Number of months between first and last post/comment
- Mean number of posts/comments per month

In [None]:
# Summarise each author's posting history: one row per author.
SECONDS_PER_MONTH = 2628000  # 365 days / 12, expressed in seconds

grouped = post_df.groupby('author')
# Build the frame in one aggregation so it exists on a fresh kernel and only the
# columns that are actually needed get aggregated.
poster_activity = (
    grouped.agg(
        num_posts=('body', 'count'),   # non-null bodies per author
        first=('created_utc', 'min'),  # earliest post timestamp
        last=('created_utc', 'max'),   # latest post timestamp
    )
    .rename_axis('name')
    .reset_index()
)
# Parenthesise the subtraction: the time span is computed first and only then
# converted from seconds to months.
poster_activity['range_in_months'] = (
    poster_activity['last'] - poster_activity['first']
) / SECONDS_PER_MONTH
# NOTE: when all of an author's posts share one timestamp the range is 0, so this
# rate is inf (NaN if num_posts is also 0); inf passes the >= 5 filter below.
poster_activity['posts_per_month'] = poster_activity['num_posts'] / poster_activity['range_in_months']

# Active posters: at least 5 posts per month on average.
active_posters = poster_activity[poster_activity['posts_per_month'] >= 5]

In [None]:
# Summarise each author's commenting history: one row per author.
SECONDS_PER_MONTH = 2628000  # 365 days / 12, expressed in seconds

grouped = comment_df.groupby('author')
# Build the frame in one aggregation so it exists on a fresh kernel and only the
# columns that are actually needed get aggregated.
commenter_activity = (
    grouped.agg(
        num_comments=('body', 'count'),  # non-null bodies per author
        first=('created_utc', 'min'),    # earliest comment timestamp
        last=('created_utc', 'max'),     # latest comment timestamp
    )
    .rename_axis('name')
    .reset_index()
)
# Parenthesise the subtraction: the time span is computed first and only then
# converted from seconds to months.
commenter_activity['range_in_months'] = (
    commenter_activity['last'] - commenter_activity['first']
) / SECONDS_PER_MONTH
commenter_activity['comments_per_month'] = commenter_activity['num_comments'] / commenter_activity['range_in_months']

# Active commenters: at least 10 comments per month on average, sustained over
# a span of at least 12 months.
active_commenters = commenter_activity.loc[
    (commenter_activity['comments_per_month'] >= 10)
    & (commenter_activity['range_in_months'] >= 12)
]

# Collecting Activity Datasets
The cells below collect activity for `active_posters`. To build the commenters dataset, replace `active_posters` with `active_commenters` in the loop and repeat the whole process.

In [None]:
# Fetch up to 1000 top submissions and up to 1000 top comments for every user in
# active_posters, accumulating plain records that become DataFrames afterwards.
post_data = []
comment_data = []
failed = []  # usernames for which a fetch raised

for name in active_posters['name']:
    user = reddit.redditor(name)

    try:
        for post in user.submissions.top(limit=1000):
            post_data.append({
                'id': post.name, 'title': post.title,
                'body': post.selftext, 'created_utc': post.created_utc,
                'author': name,
                'score': post.score, 'subreddit': post.subreddit
            })
    except Exception:
        # Catch Exception rather than using a bare except so that a keyboard /
        # kernel interrupt still stops the loop. Records appended before the
        # failure are kept, and the user's comments are skipped.
        failed.append(name)
        continue

    try:
        for comment in user.comments.top(limit=1000):
            comment_data.append({
                'id': comment.name, 'parent_id': comment.parent_id,
                'body': comment.body, 'link_id': comment.link_id, 'created_utc': comment.created_utc,
                'author': name,
                'score': comment.score, 'subreddit': comment.subreddit
            })
    except Exception:
        # The user's posts collected above are kept even if the comments fail.
        failed.append(name)
        continue

# Explicit column lists keep the expected columns present even when nothing was
# collected, so the cells that select these columns do not fail on an empty frame.
post_df = pd.DataFrame(
    post_data,
    columns=['id', 'title', 'body', 'created_utc', 'author', 'score', 'subreddit'],
)
comment_df = pd.DataFrame(
    comment_data,
    columns=['id', 'parent_id', 'body', 'link_id', 'created_utc', 'author', 'score', 'subreddit'],
)

# "failed" holds the usernames whose activity could not be fetched
# (presumably deleted or otherwise inaccessible accounts).
print(failed)

### Concatenate the titles and body text of posts and combine post and comment datasets into one activity dataset

In [None]:
# Columns shared by posts and comments in the combined activity table.
activity_columns = ['id', 'body', 'created_utc', 'author', 'subreddit']

# Build "title ### body" on a new frame instead of overwriting post_df['body'];
# this keeps the cell idempotent (re-running it no longer prepends the title again).
posts = post_df.assign(body=post_df['title'] + ' ### ' + post_df['body'])[activity_columns]
comments = comment_df[activity_columns].copy()

# Stack posts on top of comments; the original row labels of each frame are kept.
activity = pd.concat([posts, comments], axis=0)

In [None]:
# Write the combined activity table to the posters file, without the row index.
posters_activity_csv = "Datasets/Cleaned/" + sub + "_posters_activity.csv"
activity.to_csv(posters_activity_csv, index=False)

In [None]:
# Write the combined activity table to the commenters file, without the row index.
# NOTE(review): `activity` holds whatever the collection loop last produced; run this
# only after rebuilding it from active_commenters, otherwise poster data is saved here.
commenters_activity_csv = "Datasets/Cleaned/" + sub + "_commenters_activity.csv"
activity.to_csv(commenters_activity_csv, index=False)