# Using Reddit's API for Predicting Comments

### Scraping Thread Info from Reddit.com

In [2]:
import requests
import json
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
import regex as re
from bs4 import BeautifulSoup 
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV, Lasso
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

  from numpy.core.umath_tests import inner1d


# Acquiring the information from Reddit

In [2]:
#comparing star wars vs. star trek reddits
url1 = "https://reddit.com/r/StarWars.json"
url2= "https://reddit.com/r/StarTrek.json"

In [3]:
#create loop to get 25 new posts at a time,
posts=[]
#initially set url with default address
after=None

#pull reddits for each url
for j in range(2):
    if j==0:
        url=url1
    else:
        url=url2
        
    after= None
    
    for i in range(25):

        #first time running loop on main url
        if after == None:
            current_url=url
        #change url page to next 
        else:
            current_url=url + '?after=' + after
        print(current_url)
        res=requests.get(current_url, headers={'User-agent': 'Sam 1.0'})
        #check web page status.  if not like 200, stop code
        if res.status_code !=200:
            print('Status error', res.status_code)
            break
        current_dict=res.json()
        #find all posts and add together
        current_posts=[p['data'] for p in current_dict['data']['children']]
        posts.extend(current_posts)
        #find the next url and set page to pull from next
        after=current_dict['data']['after']
        #time in between pulling each subreddit
        time.sleep(2)


    #export to csv file
    pd.DataFrame(posts).to_csv('./files/posts.csv', index=False)   

https://reddit.com/r/StarWars.json
https://reddit.com/r/StarWars.json?after=t3_9fxcdl
https://reddit.com/r/StarWars.json?after=t3_9ftxhu
https://reddit.com/r/StarWars.json?after=t3_9fhivw
https://reddit.com/r/StarWars.json?after=t3_9fn02b
https://reddit.com/r/StarWars.json?after=t3_9frkyp
https://reddit.com/r/StarWars.json?after=t3_9fjdeo
https://reddit.com/r/StarWars.json?after=t3_9fdgk7
https://reddit.com/r/StarWars.json?after=t3_9feak5
https://reddit.com/r/StarWars.json?after=t3_9f23f4
https://reddit.com/r/StarWars.json?after=t3_9f2j8z
https://reddit.com/r/StarWars.json?after=t3_9ew1di
https://reddit.com/r/StarWars.json?after=t3_9et7zq
https://reddit.com/r/StarWars.json?after=t3_9eomhv
https://reddit.com/r/StarWars.json?after=t3_9erkae
https://reddit.com/r/StarWars.json?after=t3_9e4ryc
https://reddit.com/r/StarWars.json?after=t3_9e4ps6
https://reddit.com/r/StarWars.json?after=t3_9e5zk3
https://reddit.com/r/StarWars.json?after=t3_9e0v5y
https://reddit.com/r/StarWars.json?after=t3_9dt

## Clean up data

In [4]:
#import in csv file with all the posts and create a dataframe
df=pd.read_csv('./files/posts.csv')

In [5]:
#create dummy columns for star wars vs star trek to create target varaiable
df['target']=df['subreddit'].map({'StarWars':1,"startrek":0})

In [6]:
#change column to lowercase
df['title'] = df.title.astype(str).str.lower()
#remove punctuation
df["title"] = df['title'].str.replace('[^\w\s]','')

In [7]:
#look for null titles
df['title'].isnull().sum()

0

In [8]:
#only 1 exists ok to change to "null"
df['title']=df['title'].replace(np.nan,'null')
df['title'].isnull().sum()

0

In [9]:
df.shape

(1253, 98)

In [10]:
df.columns

Index(['approved_at_utc', 'approved_by', 'archived', 'author',
       'author_cakeday', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'author_fullname',
       'banned_at_utc', 'banned_by', 'can_gild', 'can_mod_post', 'category',
       'clicked', 'content_categories', 'contest_mode', 'created',
       'created_utc', 'crosspost_parent', 'crosspost_parent_list',
       'distinguished', 'domain', 'downs', 'edited', 'gilded', 'hidden',
       'hide_score', 'id', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_self', 'is_video',
       'likes', 'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media',
       'media_embed', 'media_metadata', 'media_only', 

In [11]:
#find url address for each row
dfurl=df['url']

In [12]:
#find subredit id for each row
subredditid=df['subreddit_id']
df['subreddit_id']

0       t5_2qi4s
1       t5_2qi4s
2       t5_2qi4s
3       t5_2qi4s
4       t5_2qi4s
5       t5_2qi4s
6       t5_2qi4s
7       t5_2qi4s
8       t5_2qi4s
9       t5_2qi4s
10      t5_2qi4s
11      t5_2qi4s
12      t5_2qi4s
13      t5_2qi4s
14      t5_2qi4s
15      t5_2qi4s
16      t5_2qi4s
17      t5_2qi4s
18      t5_2qi4s
19      t5_2qi4s
20      t5_2qi4s
21      t5_2qi4s
22      t5_2qi4s
23      t5_2qi4s
24      t5_2qi4s
25      t5_2qi4s
26      t5_2qi4s
27      t5_2qi4s
28      t5_2qi4s
29      t5_2qi4s
          ...   
1223    t5_2qixm
1224    t5_2qixm
1225    t5_2qixm
1226    t5_2qixm
1227    t5_2qixm
1228    t5_2qixm
1229    t5_2qixm
1230    t5_2qixm
1231    t5_2qixm
1232    t5_2qixm
1233    t5_2qixm
1234    t5_2qixm
1235    t5_2qixm
1236    t5_2qixm
1237    t5_2qixm
1238    t5_2qixm
1239    t5_2qixm
1240    t5_2qixm
1241    t5_2qixm
1242    t5_2qixm
1243    t5_2qixm
1244    t5_2qixm
1245    t5_2qixm
1246    t5_2qixm
1247    t5_2qixm
1248    t5_2qixm
1249    t5_2qixm
1250    t5_2qi

In [14]:
#change created_utc dtype from float64 to int to properly analyze number format
df['created_utc'] = pd.DataFrame(df['created_utc'], dtype='int')
df['created_utc'].dtypes
time_stamp=df['created_utc']

In [15]:
#create list from time stamp column
time_stamp_list=time_stamp.tolist()
time_stamp_list[0:10]

[1531300998,
 1536670618,
 1537016991,
 1537019636,
 1537010479,
 1536971978,
 1536961286,
 1537010250,
 1536991295,
 1537021382]

In [16]:
time_stamp_count=time_stamp.count()
time_stamp.count()

1253

In [17]:
#function to convert utc timestamp to readable times

def pretty_date(time=False):

    from datetime import datetime
    now = datetime.now()
    if type(time) is int:
        diff = now - datetime.fromtimestamp(time)
    elif isinstance(time,datetime):
        diff = round(now - time,2)
    elif not time:
        diff = round(now - now,2)
    second_diff = diff.seconds
    day_diff = diff.days

    if day_diff < 0:
        return ''

    if day_diff == 0:
        if second_diff < 10:
            return "just now"
        if second_diff < 60:
            return str(second_diff) + " seconds ago"
        if second_diff < 120:
            return "a minute ago"
        if second_diff < 3600:
            return str(second_diff / 60) + " minutes ago"
        if second_diff < 7200:
            return "an hour ago"
        if second_diff < 86400:
            return str(second_diff / 3600) + " hours ago"
    if day_diff == 1:
        return "Yesterday"
    if day_diff < 7:
        return str(day_diff) + " days ago"
    if day_diff < 31:
        return str(day_diff / 7) + " weeks ago"
    if day_diff < 365:
        return str(day_diff / 30) + " months ago"
    return str(day_diff / 365) + " years ago"


In [18]:
pretty_date(1531300998)

'3.5 months ago'

In [19]:
#create loop to iterate through each utc timestamps and convert to 'normal' values
pretty_date_new=[]

for j in time_stamp_list:
    pretty_date_updated=pretty_date(j)
    pretty_date_new.append(pretty_date_updated)

In [20]:
#add to df
pretty_date_new_df=pd.DataFrame(pretty_date_new)
pdn = pd.Series(pretty_date_new)
df['time_passed']=pdn.values

In [21]:
#find titles of each post
titles=df['title']
df['title'].head()

0                                          on opinions
1                         solo home release megathread
2    i was scoutmaster for a bsa national youth lea...
3    say what you will about solo im just so glad w...
4                          the real star wars universe
Name: title, dtype: object

In [22]:
#find subreddit names
subreddit=df['subreddit'].value_counts()
df['subreddit'].value_counts()

StarWars    627
startrek    626
Name: subreddit, dtype: int64

In [23]:
#length of time thread has been up
time_passed=df['time_passed'].value_counts()
df['time_passed'].value_counts()

1.4 months ago                   104
1.4333333333333333 months ago    103
1.3666666666666667 months ago    101
1.3333333333333333 months ago     98
1.5666666666666667 months ago     98
1.5333333333333334 months ago     95
1.4666666666666666 months ago     88
1.6 months ago                    81
1.3 months ago                    69
1.6333333333333333 months ago     62
1.5 months ago                    52
1.9 months ago                    38
1.7 months ago                    37
1.7666666666666666 months ago     35
1.8 months ago                    35
1.6666666666666667 months ago     33
1.7333333333333334 months ago     29
1.8333333333333333 months ago     29
1.9333333333333333 months ago     26
1.8666666666666667 months ago     22
1.9666666666666666 months ago     16
2.0 months ago                     1
3.5 months ago                     1
Name: time_passed, dtype: int64

In [24]:
#find number of comments per thread
num_comments=df['num_comments']
df['num_comments'].head()

0    740
1    223
2    160
3    179
4     45
Name: num_comments, dtype: int64

In [25]:
#export as a cleaned csv file
pd.DataFrame(df).to_csv('./files/cleanposts.csv', index=False)   