# Identifying Active Users

## Import necessary libraries and initialize Reddit object with Reddit developer credentials

In [None]:
import pandas as pd
import praw

# Enter your credentials here
# The values are gathered in one mapping and handed to praw.Reddit as
# keyword arguments.
_reddit_credentials = {
    "client_id": "my-client-id",
    "client_secret": "my-client-secret",
    "password": "my-password",
    "user_agent": "my-user-agent",
    "username": "my-username",
}
reddit = praw.Reddit(**_reddit_credentials)

In [None]:
# Load the posts and comments datasets from their CSV files.
posts_csv_path = "Datasets/sgexams_posts.csv"
comments_csv_path = "Datasets/sgexams_comments.csv"

post_df = pd.read_csv(posts_csv_path)
comment_df = pd.read_csv(comments_csv_path)

In [None]:
# Build one row per author summarising posting activity: number of posts,
# first and last post timestamps, the span between them in months, and the
# average posting rate over that span. Authors posting at least 5 times per
# month end up in `active_posters`.
SECONDS_PER_MONTH = 2628000  # number of seconds in a month

grouped = post_df.groupby('author')

# Named aggregation creates the summary frame in one step and only touches
# the two columns that are actually needed.
poster_activity = (
    grouped.agg(
        posts=('body', 'count'),  # counts rows with a non-null body
        first=('created_utc', 'min'),
        last=('created_utc', 'max'),
    )
    .reset_index()
    .rename(columns={'author': 'name'})
)

# The subtraction must be parenthesised: `/` binds tighter than `-`, so
# without the brackets only `first` would be divided by the month length.
# NOTE(review): assumes created_utc is a numeric timestamp in seconds.
poster_activity['range_in_months'] = (
    (poster_activity['last'] - poster_activity['first']) / SECONDS_PER_MONTH
)

# A zero-length span (e.g. an author with a single post) has no meaningful
# rate. Mask it to NaN so the division yields NaN instead of infinity and the
# author is not selected by the threshold below.
positive_range = poster_activity['range_in_months'].where(
    poster_activity['range_in_months'] > 0
)
poster_activity['posts_per_month'] = poster_activity['posts'] / positive_range

active_posters = poster_activity[poster_activity['posts_per_month'] >= 5]

In [None]:
# Build one row per author summarising commenting activity: number of
# comments, first and last comment timestamps, the span between them in
# months, and the average commenting rate over that span. Authors with at
# least 10 comments per month over a span of at least 12 months end up in
# `active_commenters`.
SECONDS_PER_MONTH = 2628000  # number of seconds in a month

grouped = comment_df.groupby('author')

# Named aggregation creates the summary frame in one step and only touches
# the two columns that are actually needed.
commenter_activity = (
    grouped.agg(
        comments=('body', 'count'),  # counts rows with a non-null body
        first=('created_utc', 'min'),
        last=('created_utc', 'max'),
    )
    .reset_index()
    .rename(columns={'author': 'name'})
)

# The subtraction must be parenthesised: `/` binds tighter than `-`, so
# without the brackets only `first` would be divided by the month length.
# NOTE(review): assumes created_utc is a numeric timestamp in seconds.
commenter_activity['range_in_months'] = (
    (commenter_activity['last'] - commenter_activity['first']) / SECONDS_PER_MONTH
)

# A zero-length span has no meaningful rate; mask it to NaN so the division
# yields NaN rather than infinity.
positive_range = commenter_activity['range_in_months'].where(
    commenter_activity['range_in_months'] > 0
)
commenter_activity['comments_per_month'] = commenter_activity['comments'] / positive_range

active_commenters = commenter_activity.loc[
    (commenter_activity['comments_per_month'] >= 10)
    & (commenter_activity['range_in_months'] >= 12)
]

# Collecting Activity Datasets

In [None]:
# For every active poster, request up to 1000 top submissions and up to 1000
# top comments, then write both kinds of activity to one combined CSV.
# NOTE: this rebinds `post_df` and `comment_df` to the newly fetched data.
post_data = []
comment_data = []
failed = []  # usernames for which fetching posts or comments raised an error

for name in active_posters['name']:
    user = reddit.redditor(name)

    try:
        for post in user.submissions.top(limit=1000):
            post_data.append({
                'id': post.name, 'title': post.title,
                'body': post.selftext, 'created_utc': post.created_utc,
                'author': name,
                # Keep the subreddit as plain text rather than a PRAW object.
                'score': post.score, 'subreddit': str(post.subreddit)
            })

        for comment in user.comments.top(limit=1000):
            comment_data.append({
                'id': comment.name, 'parent_id': comment.parent_id,
                'body': comment.body, 'link_id': comment.link_id, 'created_utc': comment.created_utc,
                'author': name,
                'score': comment.score, 'subreddit': str(comment.subreddit)
            })
    except Exception:
        # Best-effort scrape: record the user and move on to the next one.
        # A failure while fetching posts also skips that user's comments;
        # rows appended before the error are kept. Catching Exception (not a
        # bare except) lets KeyboardInterrupt stop a long-running scrape.
        failed.append(name)

# Explicit column lists keep the frames well-formed even when nothing was
# collected, so the column selections below cannot raise KeyError.
post_df = pd.DataFrame(
    post_data,
    columns=['id', 'title', 'body', 'created_utc', 'author', 'score', 'subreddit'],
)
comment_df = pd.DataFrame(
    comment_data,
    columns=['id', 'parent_id', 'body', 'link_id', 'created_utc', 'author', 'score', 'subreddit'],
)

# Fold the title into the body so posts and comments share one text column.
post_df['body'] = post_df['title'] + ' ### ' + post_df['body']
posts = post_df[['id', 'body', 'created_utc', 'author', 'subreddit']]
comments = comment_df[['id', 'body', 'created_utc', 'author', 'subreddit']]
alls = pd.concat([posts, comments], axis=0)
alls.to_csv('Datasets/Cleaned/SGposters_all.csv')

### The list `failed` contains the usernames whose activity could not be fetched, e.g. users who have deleted their accounts or all their activity

In [None]:
# Usernames for which fetching posts or comments raised an error.
print(failed)

In [None]:
# For every active commenter, request up to 1000 top submissions and up to
# 1000 top comments, then write both kinds of activity to one combined CSV.
# NOTE: this rebinds `post_df` and `comment_df` to the newly fetched data.
post_data = []
comment_data = []
failed = []  # usernames for which fetching posts or comments raised an error

for name in active_commenters['name']:
    user = reddit.redditor(name)

    try:
        for post in user.submissions.top(limit=1000):
            post_data.append({
                'id': post.name, 'title': post.title,
                'body': post.selftext, 'created_utc': post.created_utc,
                'author': name,
                # Keep the subreddit as plain text rather than a PRAW object.
                'score': post.score, 'subreddit': str(post.subreddit)
            })

        for comment in user.comments.top(limit=1000):
            comment_data.append({
                'id': comment.name, 'parent_id': comment.parent_id,
                'body': comment.body, 'link_id': comment.link_id, 'created_utc': comment.created_utc,
                'author': name,
                'score': comment.score, 'subreddit': str(comment.subreddit)
            })
    except Exception:
        # Best-effort scrape: record the user and move on to the next one.
        # A failure while fetching posts also skips that user's comments;
        # rows appended before the error are kept. Catching Exception (not a
        # bare except) lets KeyboardInterrupt stop a long-running scrape.
        failed.append(name)

# Explicit column lists keep the frames well-formed even when nothing was
# collected, so the column selections below cannot raise KeyError.
post_df = pd.DataFrame(
    post_data,
    columns=['id', 'title', 'body', 'created_utc', 'author', 'score', 'subreddit'],
)
comment_df = pd.DataFrame(
    comment_data,
    columns=['id', 'parent_id', 'body', 'link_id', 'created_utc', 'author', 'score', 'subreddit'],
)

# Fold the title into the body so posts and comments share one text column.
post_df['body'] = post_df['title'] + ' ### ' + post_df['body']
posts = post_df[['id', 'body', 'created_utc', 'author', 'subreddit']]
comments = comment_df[['id', 'body', 'created_utc', 'author', 'subreddit']]
alls = pd.concat([posts, comments], axis=0)
alls.to_csv('Datasets/Cleaned/SGcommenters_all.csv')

In [None]:
# Usernames for which fetching posts or comments raised an error.
print(failed)