Importing Libraries

In [1]:
import numpy as np
import requests
import pandas as pd
import datetime
import time
import re, regex

Connecting to API

In [2]:
# Pushshift endpoint for searching Reddit submissions; get_reddit sends its GET requests here.
base_url = 'https://api.pushshift.io/reddit/submission/search'

Current time for first request.

In [3]:
time.time()

1587366134.4083178

Writing a function that repeatedly requests posts from a subreddit, paging backwards in time, and collects the results into a single DataFrame.

In [73]:
def get_reddit(subreddit, n_iter, before=1587359145):
    """Download submissions from one subreddit, paging backwards in time.

    Sends up to ``n_iter`` GET requests to ``base_url``. Each request asks for
    up to 1000 submissions created before a cutoff timestamp; after every page
    the cutoff moves to the smallest ``created_utc`` on that page, so the next
    request continues further into the past.

    Parameters
    ----------
    subreddit : str
        Value sent as the API's ``subreddit`` parameter.
    n_iter : int
        Maximum number of requests (pages) to make.
    before : int, optional
        Unix timestamp used as the ``before`` cutoff of the first request.
        Defaults to 1587359145, so existing two-argument calls behave as
        they did when this value was hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``created_utc``, ``title``, ``selftext`` and ``subreddit``,
        pages concatenated in request order. Every page keeps its own
        0-based index, so index labels repeat across pages. If the first
        page is empty, an empty frame with those four columns is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    KeyError
        If the JSON payload has no ``'data'`` key.
    """
    columns = ["created_utc", "title", "selftext", "subreddit"]
    df_list = []
    current_time = before
    for _ in range(n_iter):
        res = requests.get(
            base_url,
            params={
                "subreddit": subreddit,
                "size": 1000,
                "lang": True,
                "before": current_time,
                "locked": False,
                "mod_deleted": False,
                "no_follow": False,
                "selftext:not": "[removed]"  # this will not show posts with 'removed' as text
            }
        )
        # Fail loudly on an error response instead of trying to parse it.
        res.raise_for_status()
        posts = res.json()['data']
        if not posts:
            # Nothing older than the cutoff: further requests would only
            # return the same empty page.
            break
        # reindex (rather than .loc) keeps the four columns even when a page
        # lacks one of them entirely; the missing column is filled with NaN.
        df = pd.DataFrame(posts).reindex(columns=columns)
        df_list.append(df)
        current_time = df.created_utc.min()
    if not df_list:
        # pd.concat raises on an empty list, so return an empty frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(df_list, axis=0)

In [74]:
# 10 requests, each asking for up to 1000 submissions from r/lifeprotips.
df_lpt = get_reddit("lifeprotips", 10)

In [75]:
df_lpt.isna().sum()

created_utc      0
title            0
selftext       626
subreddit        0
dtype: int64

Basic visual EDA

In [76]:
df_lpt

Unnamed: 0,created_utc,title,selftext,subreddit
0,1587359085,LPT: if an online video has no controls to ski...,,LifeProTips
1,1587359051,LPT I always judge people's maturity based on ...,,LifeProTips
2,1587358591,LPT REQUEST:- CAN A INDIAN INDIAN SEND A SCRIP...,,LifeProTips
3,1587358380,LPT Having small insect problems? Keep some st...,Obviously if you have a full on infestation ca...,LifeProTips
4,1587358145,"LPT: when painting in your home, wear your clo...",,LifeProTips
...,...,...,...,...
995,1582148446,LPT MOSTLY for men: Save money on a massage go...,Learned this while on a convention in Las Vega...,LifeProTips
996,1582148101,LPT: If you don’t recognize the first ingredie...,,LifeProTips
997,1582148059,LPT: Click and hold the back button on your ph...,,LifeProTips
998,1582146632,LPT for renters: always take pictures of your ...,This way you can't be charged for pre-existing...,LifeProTips


Pulling my 2nd subreddit

In [79]:
# Same pull as for the first subreddit: 10 requests of up to 1000 submissions each.
df_ulpt = get_reddit("unethicallifeprotips", 10)

In [80]:
df_ulpt

Unnamed: 0,created_utc,title,selftext,subreddit
0,1587353931,"ULPT: the digital age is just yes or no, or so...",,UnethicalLifeProTips
1,1587350160,ULPT: If you’re looking to buy something for s...,,UnethicalLifeProTips
2,1587348108,ULPT - Brand new printer for $20 at staples,I purchased a new printer just about 12 months...,UnethicalLifeProTips
3,1587347092,ULPT: Dogshit on your lawn? Shovel it onto the...,,UnethicalLifeProTips
4,1587344009,ULPT: If you ever have lots of garbage or larg...,,UnethicalLifeProTips
...,...,...,...,...
995,1567301008,ULPT Spray paint forward thinking and accepted...,"If they erase it, they look like a racist and/...",UnethicalLifeProTips
996,1567300352,ULPT: college student needing some pdf textboo...,Link for those who don't want to search for it...,UnethicalLifeProTips
997,1567294800,ULPT request: how do I make my dad lower his p...,So my dad basically always played music at 100...,UnethicalLifeProTips
998,1567293933,ULPT Request How to fake or get an easy doctor...,"Hello fellow degenerates,\n\nI'm looking for a...",UnethicalLifeProTips


Joining the data from the two subreddits

In [135]:
# Stack the two frames row-wise. Each input keeps its own index labels,
# so labels repeat in the combined frame until the index is reset.
df = pd.concat([df_lpt, df_ulpt])

In [136]:
df

Unnamed: 0,created_utc,title,selftext,subreddit
0,1587359085,LPT: if an online video has no controls to ski...,,LifeProTips
1,1587359051,LPT I always judge people's maturity based on ...,,LifeProTips
2,1587358591,LPT REQUEST:- CAN A INDIAN INDIAN SEND A SCRIP...,,LifeProTips
3,1587358380,LPT Having small insect problems? Keep some st...,Obviously if you have a full on infestation ca...,LifeProTips
4,1587358145,"LPT: when painting in your home, wear your clo...",,LifeProTips
...,...,...,...,...
995,1567301008,ULPT Spray paint forward thinking and accepted...,"If they erase it, they look like a racist and/...",UnethicalLifeProTips
996,1567300352,ULPT: college student needing some pdf textboo...,Link for those who don't want to search for it...,UnethicalLifeProTips
997,1567294800,ULPT request: how do I make my dad lower his p...,So my dad basically always played music at 100...,UnethicalLifeProTips
998,1567293933,ULPT Request How to fake or get an easy doctor...,"Hello fellow degenerates,\n\nI'm looking for a...",UnethicalLifeProTips


Dropping the `created_utc` timestamp column.

In [137]:
# Remove the timestamp column; the remaining columns are kept unchanged.
df = df.drop(columns=['created_utc'])

Dropping duplicates.

In [138]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()


In [139]:
df

Unnamed: 0,title,selftext,subreddit
0,LPT: if an online video has no controls to ski...,,LifeProTips
1,LPT I always judge people's maturity based on ...,,LifeProTips
2,LPT REQUEST:- CAN A INDIAN INDIAN SEND A SCRIP...,,LifeProTips
3,LPT Having small insect problems? Keep some st...,Obviously if you have a full on infestation ca...,LifeProTips
4,"LPT: when painting in your home, wear your clo...",,LifeProTips
...,...,...,...
995,ULPT Spray paint forward thinking and accepted...,"If they erase it, they look like a racist and/...",UnethicalLifeProTips
996,ULPT: college student needing some pdf textboo...,Link for those who don't want to search for it...,UnethicalLifeProTips
997,ULPT request: how do I make my dad lower his p...,So my dad basically always played music at 100...,UnethicalLifeProTips
998,ULPT Request How to fake or get an easy doctor...,"Hello fellow degenerates,\n\nI'm looking for a...",UnethicalLifeProTips


Resetting the index after dropping values.  

In [151]:
# Replace the current index with a fresh 0..n-1 range; the old labels are discarded.
df = df.reset_index(drop=True)

In [152]:
df

Unnamed: 0,title,selftext,subreddit
0,"if an online video has no controls to skip, p...",,LifeProTips
1,I always judge people's maturity based on the...,,LifeProTips
2,REQUEST:- CAN A INDIAN INDIAN SEND A SCRIPT T...,,LifeProTips
3,Having small insect problems? Keep some stick...,Obviously if you have a full on infestation ca...,LifeProTips
4,"when painting in your home, wear your clothes...",,LifeProTips
...,...,...,...
19885,Spray paint forward thinking and accepted phr...,"If they erase it, they look like a racist and/...",UnethicalLifeProTips
19886,college student needing some pdf textbooks? U...,Link for those who don't want to search for it...,UnethicalLifeProTips
19887,request: how do I make my dad lower his pc vo...,So my dad basically always played music at 100...,UnethicalLifeProTips
19888,How to fake or get an easy doctors note,"Hello fellow degenerates,\n\nI'm looking for a...",UnethicalLifeProTips


Dropping problematic rows that have no title and are likely deleted.  

In [62]:
# Discard rows whose title is missing; NaN in other columns is left alone here.
df = df.dropna(subset=['title'])

Filling NaN values with empty strings.

In [76]:
# Every remaining NaN becomes an empty string.
df = df.fillna("")

Combining Title and body text.  

In [78]:
# Join title and body into one text column, separated by a single space.
df['text'] = df['title'].str.cat(df['selftext'], sep=" ")

Dropping separate columns

In [80]:
# Remove the two source columns by name.
df = df.drop(columns=['title', 'selftext'])

Creating dummy variables for target column.

In [9]:
# One-hot encode the subreddit and drop the first category, leaving a single
# indicator column. dtype=int pins the indicator to 0/1: without it the dtype
# depends on the installed pandas version (bool in pandas >= 2.0), which would
# end up as True/False in the saved CSV.
df = pd.get_dummies(df, columns=['subreddit'], drop_first=True, dtype=int)

Renaming columns to my specifications.  

In [20]:
# Only the indicator column needs a new name; 'text' is left as it is.
df = df.rename(columns={'subreddit_UnethicalLifeProTips': 'unethical'})

Converting text to lower case.  

In [None]:
# Lower-case the whole text column via the vectorised string accessor.
df['text'] = df['text'].str.lower()

Removing obvious subreddit markers (the `lpt`/`ulpt` tags and the words `ethical`/`unethical`) from the text.

In [38]:
bad_words = ['lpt', 'lpt:', 'ulpt', 'ulpt:', 'ethical', 'unethical']

# Build one alternation from bad_words with the longest markers first.
# Order matters because the shorter markers are substrings of the longer
# ones: matching 'ethical' before 'unethical' would leave a stray 'un'
# behind, and 'lpt' before 'ulpt' would leave a stray 'u'.
marker_pattern = '|'.join(
    re.escape(word) for word in sorted(bad_words, key=len, reverse=True)
)

# A single vectorised pass removes every marker occurrence. Matching is by
# substring, so a marker inside a longer word (e.g. 'sculpt') is removed too.
df['text'] = df['text'].str.replace(marker_pattern, '', regex=True)


Saving cleaned dataset to CSV.  

In [40]:
# index=False leaves the row index out of the file, so the CSV holds only the DataFrame's columns.
df.to_csv("./data/reddit_df.csv", index=False)

One last quality check.  

In [39]:
df

Unnamed: 0,text,unethical
0,"if an online video has no controls to skip, p...",0
1,i always judge people's maturity based on the...,0
2,request:- can a indian indian send a script t...,0
3,having small insect problems? keep some stick...,0
4,"when painting in your home, wear your clothes...",0
...,...,...
19847,spray paint forward thinking and accepted phr...,1
19848,college student needing some pdf textbooks? u...,1
19849,request: how do i make my dad lower his pc vo...,1
19850,how to fake or get an easy doctors note hell...,1
