In [1]:
import numpy as np
import requests
import pandas as pd
import datetime
import time
import re, regex

In [2]:
base_url = 'https://api.pushshift.io/reddit/submission/search'

In [3]:
time.time()

1587720501.8997126

In [4]:
def get_reddit(subreddit, n_iter):
    df_list = []
    current_time = 1587359145
    for _ in range(n_iter):
        res = requests.get(
            base_url,
            params={
                "subreddit": subreddit,
                "size": 1000,
                "lang": True,
                "before": current_time,
                "locked": False, 
                "mod_deleted": False, 
                "no_follow": False, 
                "selftext:not": "[removed]"  #this will not show posts with 'removed' as text
            }
        )
        df = pd.DataFrame(res.json()['data'])
        df = df.loc[:, ["created_utc", "title", "selftext", "subreddit"]]
        df_list.append(df)
        current_time = df.created_utc.min()
    return pd.concat(df_list, axis=0)

In [5]:
df_scams = get_reddit("scams", 5)

In [6]:
df_scams.isna().sum()

created_utc    0
title          0
selftext       9
subreddit      0
dtype: int64

In [7]:
df_scams

Unnamed: 0,created_utc,title,selftext,subreddit
0,1587358920,Got this weird message. Guessing it's a scam?,,Scams
1,1587357186,When your friends post links to a FREE 2020 Je...,,Scams
2,1587356559,I’ve been scammed 5k,Sorry for any grammatical issues.\n\nShort sto...,Scams
3,1587354620,I got scammed-will I be able to get refund?,"Hi,\n\nI got problem.I bought something from o...",Scams
4,1587354426,"My dad found this website with ""great guitars""...",,Scams
...,...,...,...,...
995,1581366274,I think I just got scammed and feel so stupid,,Scams
996,1581365773,Is Google Opinion Rewards a Scam,So i found this app called \nGoogle Opinion Re...,Scams
997,1581365770,Is Google Opinion Rewards a Scam,So i found this app called \nGoogle Opinion Re...,Scams
998,1581365104,I'm going to Jail (Short),"First time poster, but just a quick one I had ...",Scams


In [78]:
#df_lpt.loc[(df_lpt['selftext'] == '[removed]')]

In [66]:
#df_lpt.drop(index=df_lpt.loc[(df_lpt['selftext'] == '[removed]')].index, inplace=True)

In [8]:
df_scams.drop(columns=['created_utc'], inplace=True)

In [9]:
df_scams.drop_duplicates(inplace=True)


In [10]:
df_scams

Unnamed: 0,title,selftext,subreddit
0,Got this weird message. Guessing it's a scam?,,Scams
1,When your friends post links to a FREE 2020 Je...,,Scams
2,I’ve been scammed 5k,Sorry for any grammatical issues.\n\nShort sto...,Scams
3,I got scammed-will I be able to get refund?,"Hi,\n\nI got problem.I bought something from o...",Scams
4,"My dad found this website with ""great guitars""...",,Scams
...,...,...,...
994,Always check the emails lol,,Scams
995,I think I just got scammed and feel so stupid,,Scams
996,Is Google Opinion Rewards a Scam,So i found this app called \nGoogle Opinion Re...,Scams
998,I'm going to Jail (Short),"First time poster, but just a quick one I had ...",Scams


In [11]:
df_scams.reset_index(drop=True, inplace=True)

In [12]:
df_scams

Unnamed: 0,title,selftext,subreddit
0,Got this weird message. Guessing it's a scam?,,Scams
1,When your friends post links to a FREE 2020 Je...,,Scams
2,I’ve been scammed 5k,Sorry for any grammatical issues.\n\nShort sto...,Scams
3,I got scammed-will I be able to get refund?,"Hi,\n\nI got problem.I bought something from o...",Scams
4,"My dad found this website with ""great guitars""...",,Scams
...,...,...,...
4843,Always check the emails lol,,Scams
4844,I think I just got scammed and feel so stupid,,Scams
4845,Is Google Opinion Rewards a Scam,So i found this app called \nGoogle Opinion Re...,Scams
4846,I'm going to Jail (Short),"First time poster, but just a quick one I had ...",Scams


In [14]:
df_scams.dropna(subset=['title'], inplace=True) 

In [15]:
df_scams.fillna(value="", inplace=True)

In [16]:
df_scams['text'] = df_scams['title'] + " " + df_scams['selftext']

In [17]:
df_scams.drop(columns=['title', 'selftext'], inplace=True)

In [18]:
df_scams = pd.get_dummies(df_scams, columns=['subreddit'], drop_first=True)

In [19]:
df_scams = df_scams.rename(columns= {'text':'text', 'subreddit_UnethicalLifeProTips': 'unethical'})

In [20]:
df_scams['text'] = df_scams['text'].str.lower()

In [21]:
df_scams.to_csv("./data/scam_df.csv", index=False)

In [22]:
df_scams

Unnamed: 0,text
0,got this weird message. guessing it's a scam?
1,when your friends post links to a free 2020 je...
2,i’ve been scammed 5k sorry for any grammatical...
3,i got scammed-will i be able to get refund? hi...
4,"my dad found this website with ""great guitars""..."
...,...
4843,always check the emails lol
4844,i think i just got scammed and feel so stupid
4845,is google opinion rewards a scam so i found th...
4846,"i'm going to jail (short) first time poster, b..."
