Created by [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [/r/dataisbeautiful](https://www.reddit.com/r/dataisbeautiful).
<hr>

# /r/DataIsBeautiful Monthly Battles

The subreddit /r/dataisbeautiful holds monthly competitions where participants are challenged to create a visualization using a specific dataset chosen for the month for a chance at [Reddit gold](https://www.reddit.com/coins). This notebook contains the code I used to collect and clean information on entries for every competition using [praw](https://praw.readthedocs.io/en/latest/), a Python API wrapper for reddit.com.

I was mostly interested in seeing which competitions were the most popular and which attracted new users, so I limited multiple entries to only the first one submitted. This is by no means an "official" count, as deleted posts won't be counted towards any totals. Lastly, all times were converted from UTC to EST, the time used for the subreddit.

In [1]:
import praw
import json
import pandas as pd
import datetime as dt
from bs4 import BeautifulSoup

In [2]:
def get_monthly_battles(battles, limit=50):
    """Search the subreddit for monthly battle posts and tabulate them.

    Runs ``sub.search`` (``sub`` is the module-level subreddit object) and
    records one row per returned submission.

    Args:
        battles: Search query string handed to ``sub.search``.
        limit: Maximum number of search results to request (default 50).

    Returns:
        A DataFrame with the columns ``id``, ``date_id``
        (``"<month>-<year>"``, month not zero-padded), ``date_posted`` and
        ``title``. When the search yields nothing the frame is empty but
        still carries those columns.
    """
    columns = ['id', 'date_id', 'date_posted', 'title']
    rows = []
    for post in sub.search(battles, limit=limit):
        # NOTE(review): fromtimestamp() without a tz argument returns naive
        # *local* time of the machine running the notebook - confirm that is
        # the intended clock for date_id / date_posted.
        posted = dt.datetime.fromtimestamp(post.created_utc)

        rows.append({
            'id': post.id,
            'date_id': f'{posted.month}-{posted.year}',
            'date_posted': posted,
            # keep only the text after the last ': ' in the post title
            'title': post.title.split(': ')[-1],
        })

    # Build the frame once, after the loop. Fixing the columns up front also
    # means an empty search returns an empty frame instead of failing.
    return pd.DataFrame(rows, columns=columns)

In [3]:
def get_winners(html, date):
    """Extract the users linked in a winners announcement.

    Every anchor whose ``href`` starts with ``/u/`` is treated as a user
    mention. The first mention is labelled ``'winner'`` and every later one
    ``'honorable momention'``.

    Args:
        html: HTML of the comment announcing the winners.
        date: ``date_id`` (``'%m-%Y'``) of the competition thread the comment
            was found in. The results are filed under the month before it.

    Returns:
        A DataFrame with the columns ``user``, ``award`` and ``date_id``.
        It is empty (columns still present) when no ``/u/`` link is found.

    Raises:
        ValueError: If ``date`` does not match ``'%m-%Y'``.
    """
    soup = BeautifulSoup(html, 'lxml')

    # href=True skips anchors that carry no href attribute at all
    links = [anchor['href'] for anchor in soup.find_all('a', href=True)]

    # drop the leading '/u/' to leave just the user name
    users = [link[3:] for link in links if link.startswith('/u/')]

    # add winner: first mention wins, the rest are honorable mentions.
    # The label's spelling is kept as-is because it is a data value that
    # ends up in the returned frame.
    awards = ['honorable momention'] * len(users)
    if awards:
        awards[0] = 'winner'

    # add date: one day before the 1st of `date` always lands in the prior month
    first_of_month = dt.datetime.strptime(date, '%m-%Y')
    end_of_prior = first_of_month - dt.timedelta(days=1)
    prior_month = f'{end_of_prior.month}-{end_of_prior.year}'

    return pd.DataFrame(
        {
            'user': users,
            'award': awards,
            'date_id': [prior_month] * len(users),
        },
        columns=['user', 'award', 'date_id'],
    )

In [4]:
def get_entries(entry, date):
    """Collect the fields recorded for a single competition entry.

    Args:
        entry: Comment object; its ``author``, ``ups``, ``created_utc`` and
            ``body_html`` attributes are read.
        date: Competition identifier stored under ``date_id``.

    Returns:
        A dict mapping ``user``, ``date_id``, ``upvotes``, ``date_entry`` and
        ``link`` to one-element lists, so it can be passed straight to
        ``pd.DataFrame`` as a single row.
    """
    record = {
        'user': [str(entry.author)],
        'date_id': [date],
        'upvotes': [entry.ups],
    }

    # get day posted
    # NOTE(review): fromtimestamp() without a tz argument gives naive local
    # time of the machine running this - confirm that is the intended clock.
    record['date_entry'] = [dt.datetime.fromtimestamp(entry.created_utc)]

    # get first link in entry, falling back to 'N/A' when there are no anchors
    anchors = BeautifulSoup(entry.body_html, 'lxml').find_all('a')
    targets = [anchor['href'] for anchor in anchors]
    record['link'] = [targets[0] if targets else 'N/A']

    return record

In [5]:
# setup praw
# the API credentials are read from a local config.json
with open('config.json', 'r') as config_file:
    config = json.loads(config_file.read())

# only these three settings are forwarded to the client
credential_keys = ('client_id', 'client_secret', 'user_agent')
reddit = praw.Reddit(**{key: config[key] for key in credential_keys})

In [6]:
# search for monthly battles
sub = reddit.subreddit("dataisbeautiful")
battles = 'author:AutoModerator title:"Battle for the month of"'

# get a dataframe of battles, then one dict per battle for the loop below
battle_df = get_monthly_battles(battles)
battles = battle_df.to_dict(orient='records')

# lists to hold dataframes
win_frame = []
entries = []

# loop through battles collecting information
for battle in battles:
    date = battle['date_id']
    title = battle['title']
    submission = reddit.submission(id=battle['id'])

    # Expand every "load more comments" stub first. Otherwise the top-level
    # listing can contain MoreComments objects, which have neither `author`
    # nor `body_html` and would raise AttributeError below.
    submission.comments.replace_more(limit=None)

    for counter, entry in enumerate(submission.comments):
        if counter == 0:
            # The first top-level comment is read as the announcement of the
            # prior month's winners (presumably a stickied mod comment -
            # verify). It is never counted as an entry. The '1-2018' thread
            # is skipped, presumably because it is the first battle and has
            # no earlier winners - confirm.
            if date != '1-2018':
                winners = get_winners(entry.body_html, date)
                win_frame.append(winners)
            continue

        # get info on entry if entry wasnt deleted (deleted -> author is None)
        if entry.author:
            data = get_entries(entry, date)
            df = pd.DataFrame(data)
            entries.append(df)

win_df = pd.concat(win_frame)
entry_df = pd.concat(entries)

In [7]:
# save raw files (written in this order, without the index column)
raw_exports = (
    ('./data/raw/entries.csv', entry_df),
    ('./data/raw/winners.csv', win_df),
    ('./data/raw/competitions.csv', battle_df),
)
for raw_path, raw_frame in raw_exports:
    raw_frame.to_csv(raw_path, index=False)

In [8]:
# build the main dataframe from the entries:
#   1. attach award info per (user, date_id); left join keeps every entry
#   2. attach the competition's own columns per date_id
df = (
    entry_df
    .merge(win_df, how='left', on=['user', 'date_id'])
    .merge(battle_df, how='left', on='date_id')
)

In [9]:
# convert time to est (the default for /r/dataisbeautiful)
# NOTE(review): tz_localize only labels the naive values as US/Eastern, it
# does not shift them - confirm the stored times are already Eastern wall-clock.
for stamp_col in ('date_entry', 'date_posted'):
    df[stamp_col] = df[stamp_col].dt.tz_localize('US/Eastern')

# get the number of days after a competition posted that the user submitted
elapsed = df['date_entry'] - df['date_posted']
df['days_till_entered'] = elapsed.map(lambda gap: gap.components.days)

# keep only first entry if more than one submission
df = df.drop_duplicates(subset=['user', 'date_id'], keep='first')

In [10]:
# get a dataframe of unique first-time posters
# one row per competition holding the distinct users who entered it
users_df = df.groupby(['date_id', 'date_posted'])['user'].apply(lambda x: list(set(x))).reset_index()
users_df = users_df.sort_values(by='date_posted')

# Walk the competitions oldest-first and count users not seen in any earlier
# one. A set keeps each membership check O(1) instead of scanning a growing list.
seen_users = set()
date_ids = []
new_user_counts = []
for date_id, current in zip(users_df['date_id'], users_df['user']):
    # get count of users never before entered
    newcomers = set(current) - seen_users
    seen_users |= newcomers

    date_ids.append(date_id)
    new_user_counts.append(len(newcomers))

# explicit columns keep the frame mergeable on 'date_id' even when it is empty
users_df = pd.DataFrame(
    {'date_id': date_ids, 'new_users': new_user_counts},
    columns=['date_id', 'new_users'],
)

In [11]:
# count the rows of df per competition
total_entries = (
    df.groupby(['date_id'])
    .size()
    .rename('total_entries')
    .reset_index()
)

# merge new_users, then total_entries, into battle_df (left joins keep every battle)
battle_df = (
    battle_df
    .merge(users_df, how='left', on='date_id')
    .merge(total_entries, how='left', on='date_id')
)

In [12]:
# save cleaned/merged files (entries first, then competitions; no index column)
clean_exports = (
    ('./data/clean/dataisbeautiful_entries.csv', df),
    ('./data/clean/dataisbeautiful_competitions.csv', battle_df),
)
for clean_path, clean_frame in clean_exports:
    clean_frame.to_csv(clean_path, index=False)