# Data Collection

In [4]:
import time

import numpy as np
import pandas as pd
import requests

In [5]:
def reddit_to_csv(subreddit, filename, n_requests=1):
    """Scrape a subreddit's listing pages and save selected post fields to a CSV file.

    Parameters
    ----------
    subreddit : str
        Path of the subreddit to scrape, e.g. ``'r/zelda'``. Leading and
        trailing slashes are ignored when the request URL is built.
    filename : str
        Path of the CSV file to write.
    n_requests : int, default 1
        Maximum number of listing pages to request. Fewer requests are made
        if a request returns a non-200 status or the listing reports no
        further page.

    Returns
    -------
    None
        The collected posts are written to ``filename``. The file always has
        the same twelve columns, even when no posts were collected.
    """
    # Output column name -> key in each post's 'data' dict.
    field_map = {
        'subreddit': 'subreddit',
        'title': 'title',
        'post_paragraph': 'selftext',
        'clicked': 'clicked',
        'ups': 'ups',
        'downs': 'downs',
        'likes': 'likes',
        'category': 'category',
        'number_of_comments': 'num_comments',
        'score': 'score',
        'author_flair_css_class': 'author_flair_css_class',
        'subreddit_type': 'subreddit_type',
    }

    # Column order used in the saved CSV.
    column_order = ['subreddit', 'title', 'clicked', 'ups', 'downs',
                    'post_paragraph', 'likes', 'number_of_comments',
                    'category', 'score', 'author_flair_css_class',
                    'subreddit_type']

    # A custom User-agent avoids a 429 res.status_code.
    headers = {'User-agent': 'Eye eye bot 00'}

    # Strip surrounding slashes so 'r/Mario/' does not produce '//.json'.
    url = 'https://www.reddit.com/' + str(subreddit).strip('/') + '/.json'

    posts = []
    after = None

    for i in range(n_requests):
        print(i)

        # Pause between consecutive requests (not before the first one and
        # not after the last one).
        if i > 0:
            time.sleep(1)

        params = {} if after is None else {'after': after}

        # The timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(url, params=params, headers=headers, timeout=30)

        # Stop as soon as the API refuses a request.
        if res.status_code != 200:
            print(res.status_code)
            break

        listing = res.json()['data']
        for child in listing['children']:
            post = child['data']
            # .get() leaves a field empty instead of raising KeyError when a
            # post does not carry it.
            posts.append({column: post.get(key) for column, key in field_map.items()})

        # Without an 'after' token the next request would be sent with empty
        # params and fetch the first page again, so stop here.
        after = listing.get('after')
        if after is None:
            break

    # Passing `columns` fixes the column order and keeps the frame (and the
    # CSV header) well-formed even when `posts` is empty.
    posts_df = pd.DataFrame(posts, columns=column_order).drop_duplicates()

    # Save the DataFrame as a .csv file:
    posts_df.to_csv(str(filename), index=False, sep=",")


In [3]:
# Scrape r/Mario with up to 150 requests and save the posts as a CSV file.
# The subreddit path has no trailing slash (matching the r/zelda call), so the
# request URL does not contain '//' in front of '.json'.
reddit_to_csv(subreddit='r/Mario',
              n_requests=150,
              filename='mario_reddit_posts.csv')

0


NameError: name 'requests' is not defined

In [None]:
# Scrape r/zelda with up to 150 requests and write the posts to a CSV file.
reddit_to_csv(
    subreddit='r/zelda',
    filename='zelda_reddit_posts.csv',
    n_requests=150,
)

In [6]:
# Load the Mario posts from the CSV file named in the r/Mario scrape call
# and preview the first two rows.
mario_df = pd.read_csv('./mario_reddit_posts.csv')
mario_df.head(2)

Unnamed: 0,subreddit,title,clicked,ups,downs,post_paragraph,likes,number_of_comments,category,score,author_flair_css_class,subreddit_type
0,Mario,Super Mario Movie Trailer MEGATHREAD,False,374,0,Want to discuss about the movie trailer? You'r...,,894,,374,,public
1,Mario,"Karma farming posts get removed, egregious rep...",False,113,0,"First up, apologies for these out of date rule...",,7,,113,,public


In [7]:
# (rows, columns) of the Mario posts.
mario_df.shape

(2258, 12)

In [8]:
# Load the Zelda posts from the CSV file named in the r/zelda scrape call
# and preview the first two rows.
zelda_df = pd.read_csv('./zelda_reddit_posts.csv')
zelda_df.head(2)

Unnamed: 0,subreddit,title,clicked,ups,downs,post_paragraph,likes,number_of_comments,category,score,author_flair_css_class,subreddit_type
0,zelda,Today is Self-Post Sunday. Only self-posts are...,False,0,0,Self-Post Sundays are our main discussion day....,,0,,0,hylian,public
1,zelda,[All] Which of these Creeped You The Most in Z...,False,627,0,"Which of these locations, enemies, moments, or...",,258,,627,,public


**Append the Mario and Zelda files**

In [9]:
# Stack the Mario rows on top of the Zelda rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it both inputs keep their own
# labels starting at 0, so index labels would be duplicated.
df = pd.concat([mario_df, zelda_df], ignore_index=True)

In [10]:
# (rows, columns) of the combined frame.
df.shape

(4712, 12)

**CSV CHECKPOINT (check the data again).**

In [11]:
# Preview the first five rows of the combined frame.
df.head(5)

Unnamed: 0,subreddit,title,clicked,ups,downs,post_paragraph,likes,number_of_comments,category,score,author_flair_css_class,subreddit_type
0,Mario,Super Mario Movie Trailer MEGATHREAD,False,374,0,Want to discuss about the movie trailer? You'r...,,894,,374,,public
1,Mario,"Karma farming posts get removed, egregious rep...",False,113,0,"First up, apologies for these out of date rule...",,7,,113,,public
2,Mario,"If there are power ups in the movie, how do yo...",False,425,0,,,59,,425,,public
3,Mario,I'm still thinking about the Mario Movie so I ...,False,696,0,,,29,,696,,public
4,Mario,I understand the disappointment of Charles Mar...,False,381,0,,,94,,381,,public


In [12]:
# Row count of the Zelda frame, to compare against the combined total.
zelda_df.shape

(2454, 12)

In [13]:
# Row count of the Mario frame, to compare against the combined total.
mario_df.shape

(2258, 12)

In [14]:
# Expected number of rows in the combined frame.
len(zelda_df) + len(mario_df)

4712

In [15]:
# The row count here should equal the sum of the Zelda and Mario row counts.
df.shape

(4712, 12)

**Create a 'target' column (will equal 1 if the post's subreddit is Mario, and 0 if the post's subreddit is Zelda):**

In [16]:
# Label each post: 1 when its subreddit is 'Mario', 0 otherwise.
is_mario_post = df['subreddit'].eq('Mario')
df['target'] = np.where(is_mario_post, 1, 0)

**Check for missing values and drop uninformative columns:**

In [17]:
# Count the missing values in each column.
df.isnull().sum()

subreddit                    0
title                        0
clicked                      0
ups                          0
downs                        0
post_paragraph            3498
likes                     4712
number_of_comments           0
category                  4712
score                        0
author_flair_css_class    4078
subreddit_type               0
target                       0
dtype: int64

**The columns 'likes' and 'category' are entirely null, so they will be dropped. The column 'clicked' has no missing values, but every value is False, so I will drop it as well. The same goes for 'downs' and 'subreddit_type', whose values are all 0 and 'public', respectively.**

In [18]:
# Check how many distinct values 'clicked' takes and how often each occurs.
df['clicked'].value_counts()

False    4712
Name: clicked, dtype: int64

In [19]:
# Check how many distinct values 'downs' takes and how often each occurs.
df['downs'].value_counts()

0    4712
Name: downs, dtype: int64

In [20]:
# Check how many distinct values 'subreddit_type' takes and how often each occurs.
df['subreddit_type'].value_counts()

public    4712
Name: subreddit_type, dtype: int64

In [21]:
# Columns to remove from the combined frame.
df_drop_list = [
    'likes',
    'category',
    'clicked',
    'downs',
    'subreddit_type',
]

In [22]:
# Remove the columns named in df_drop_list from the combined frame.
df = df.drop(columns=df_drop_list)

In [23]:
# Shape after dropping the five columns in df_drop_list.
df.shape

(4712, 8)

In [27]:
# Remove rows that are duplicated across all remaining columns.
df = df.drop_duplicates()

In [29]:
# Shape after removing duplicate rows.
df.shape

(4710, 8)

In [30]:
# Checkpoint: save the combined, cleaned frame to disk without the index column.
df.to_csv('master_df.csv', index=False, sep=",")

# Read CSV in the model file

In [31]:
# Load the combined data back from the master_df.csv checkpoint file.
df = pd.read_csv('./master_df.csv')