# Project 3 - Web APIs and Natural Language Processing

## Using Web APIs to extract data from Reddit 

In [1]:
# Importing the libraries that I need. 

import requests
import pandas as pd

#### Getting 10,000 submissions from the Career Guidance subreddit

In [2]:
# Base URL of the Pushshift API endpoint used to search Reddit submissions
url = 'https://api.pushshift.io/reddit/search/submission'

In [93]:
# Query parameters for the request. The 'before' value was changed manually for each
# of the 10 requests; 'size' is the number of posts asked for per request.
params = dict(
    subreddit='careerguidance',
    size=1000,
    before=1574078453,
)

In [94]:
# Requesting content from the url. The query parameters are passed by keyword, and a
# timeout is set so that an unresponsive server raises an error instead of leaving
# the kernel waiting indefinitely.
response_career = requests.get(url, params=params, timeout=30)

In [95]:
# Checking the HTTP status code of the response (200 means the request succeeded).
response_career.status_code

200

In [96]:
# Parsing the JSON body of the response into Python objects
career_data = response_career.json()

In [97]:
# The posts themselves sit under the 'data' key of the parsed response
posts = career_data['data']

In [98]:
# Creating a data frame out of the posts returned by this request.
# NOTE(review): the "10" suffix presumably marks this as the tenth batch (the params
# cell says 'before' was changed 10 times) -- confirm.
career_data_df10 = pd.DataFrame(posts)

In [99]:
# Making sure I have 1000 rows, i.e. the full 'size' that was requested
career_data_df10.shape

(1000, 66)

In [100]:
# Checking the creation time of the last posts in this batch. This is the information used
# to fill in the 'before' parameter above when requesting the next batch.
career_data_df10['created_utc'].tail()

995    1573279004
996    1573278815
997    1573277132
998    1573276623
999    1573275878
Name: created_utc, dtype: int64

In [101]:
# Storing this batch under the name career10. career_data_df10 is already a DataFrame,
# so this only wraps it in a new DataFrame object; no rows or columns are added.
career10= pd.DataFrame(career_data_df10)

In [102]:
# Creating a list of the ten batch dataframes.
# NOTE(review): career1 ... career9 are not assigned anywhere in the cells shown here;
# presumably they were created by re-running the cells above with other 'before'
# values. Confirm they exist before running this cell on a fresh kernel.
career_df = [career1, career2, career3, career4, career5, career6, career7, career8, career9, career10]

In [103]:
# Concatenating them all into one dataframe.
# The batches do not all have the same columns, so pandas has to align them. Passing
# sort=True explicitly keeps the alphabetically sorted column order regardless of the
# pandas version (later cells index columns by position) and silences the
# "sort by default" FutureWarning.
# Each batch keeps its own row labels, so the labels 0-999 repeat in the result.
career_complete = pd.concat(career_df, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [104]:
# Checking that the concatenation actually returned a dataframe
type(career_complete)

pandas.core.frame.DataFrame

In [105]:
# Checking the size of the data frame: 10 batches of 1000 posts should give 10000 rows
career_complete.shape

(10000, 70)

In [106]:
# Looking at the first rows to get a sense of what the columns contain, so I can select those that I can use:
career_complete.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,subreddit_subscribers,subreddit_type,thumbnail,thumbnail_height,thumbnail_width,title,total_awards_received,url,whitelist_status,wls
0,[],False,KAMI_aka,,,,[],,,,...,128151,public,self,,,Can I pursue a master's in engineering managem...,0,https://www.reddit.com/r/careerguidance/commen...,all_ads,6
1,[],False,LostAMO,,,,[],,,,...,128147,public,self,,,Need advice on career change and from friends ...,0,https://www.reddit.com/r/careerguidance/commen...,all_ads,6
2,[],False,PMMeYourMortys,,,,[],,,,...,128142,public,self,,,Burnout: What freelance jobs can I do if I qui...,0,https://www.reddit.com/r/careerguidance/commen...,all_ads,6
3,[],False,NotJobObsessed,,,,[],,,,...,128140,public,self,,,Do I lack work ethic or am I being gaslighted?,0,https://www.reddit.com/r/careerguidance/commen...,all_ads,6
4,[],False,NoxiousToxic,,,,[],,,,...,128135,public,self,,,I was curious: Can you exchange pay for a plac...,0,https://www.reddit.com/r/careerguidance/commen...,all_ads,6


In [107]:
# Looking at rows 1-4 for one slice of columns (positions 59-65 here). I changed the column
# positions to explore them all, then noted which columns had useful information that I should keep.
career_complete.iloc[1:5, 59:66]

Unnamed: 0,subreddit_id,subreddit_subscribers,subreddit_type,thumbnail,thumbnail_height,thumbnail_width,title
1,t5_2t9i0,128147,public,self,,,Need advice on career change and from friends ...
2,t5_2t9i0,128142,public,self,,,Burnout: What freelance jobs can I do if I qui...
3,t5_2t9i0,128140,public,self,,,Do I lack work ethic or am I being gaslighted?
4,t5_2t9i0,128135,public,self,,,I was curious: Can you exchange pay for a plac...


In [108]:
# After looking through all the columns I chose 15 for further exploration. First, I'll get a list of all the columns
career_complete.columns

Index(['all_awardings', 'allow_live_comments', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'banned_by', 'can_mod_post', 'contest_mode', 'created_utc',
       'crosspost_parent', 'crosspost_parent_list', 'domain', 'edited',
       'full_link', 'gildings', 'id', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts',
       'over_18', 'parent_whitelist_status', 'permalink', 'pinned',
       'post_hint', 'preview', 'pwls', 'removed_by', 'removed_by_category',


In [109]:
# Names of the 15 columns I am interested in
career_keep_cols = [
    'author',
    'created_utc',
    'full_link',
    'is_video',
    'media_only',
    'num_comments',
    'num_crossposts',
    'over_18',
    'score',
    'selftext',
    'subreddit',
    'title',
    'url',
    'crosspost_parent',
    'crosspost_parent_list',
]

# Creating a subset of the career guidance data with only those columns
career_complete_selfeat = career_complete[career_keep_cols]

In [209]:
# Saving this subreddit selection as a csv in the current directory (index=False: the row index is not written)
career_complete_selfeat.to_csv('./career_df.csv', index=False)

#### Getting 10,000 submissions from the Higher Education subreddit

In [191]:
# Query parameters for the Higher Education request; as before, 'before' is the value
# that was changed between requests and 'size' is the number of posts per request.
params2 = dict(
    subreddit='highereducation',
    size=1000,
    before=1452881241,
)

In [192]:
# Requesting content from the url. The query parameters are passed by keyword, and a
# timeout is set so that an unresponsive server raises an error instead of leaving
# the kernel waiting indefinitely.
response_highered = requests.get(url, params=params2, timeout=30)

In [193]:
# Checking the HTTP status code of the response (200 means the request succeeded).
response_highered.status_code

200

In [194]:
# Parsing the JSON body of the response into Python objects
highered_data = response_highered.json()

In [195]:
# The posts themselves sit under the 'data' key of the parsed response
postshe = highered_data['data']

In [196]:
# Converting the posts returned by this request into a data frame
highered_data_df10 = pd.DataFrame(postshe)

In [197]:
# Making sure I have 1000 rows, i.e. the full 'size' that was requested
highered_data_df10.shape

(1000, 32)

In [198]:
# Checking the creation time of the last posts in this batch. This is the information used
# to fill in the 'before' parameter above when requesting the next batch.
highered_data_df10['created_utc'].tail()

995    1441202756
996    1441198833
997    1441192885
998    1441192140
999    1441184502
Name: created_utc, dtype: int64

In [199]:
# Storing this batch under the name highered10. highered_data_df10 is already a DataFrame,
# so this only wraps it in a new DataFrame object; no rows or columns are added.
highered10 = pd.DataFrame(highered_data_df10)

In [202]:
# Bringing all the dataframes together in a list to then concatenate.
# NOTE(review): highered1 ... highered9 are not assigned anywhere in the cells shown here;
# presumably they were created by re-running the cells above with other 'before'
# values. Confirm they exist before running this cell on a fresh kernel.
highered_list = [highered1, highered2, highered3, highered4, highered5, highered6, highered7, highered8, highered9, highered10]

In [203]:
# Concatenating all the batches into one dataframe.
# The batches do not all have the same columns, so pandas has to align them. Passing
# sort=True explicitly keeps the alphabetically sorted column order regardless of the
# pandas version and silences the "sort by default" FutureWarning.
# Each batch keeps its own row labels, so the labels 0-999 repeat in the result.
highered_complete = pd.concat(highered_list, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [204]:
# Checking that the concatenation actually returned a dataframe
type(highered_complete)

pandas.core.frame.DataFrame

In [205]:
# Checking the size of this dataframe: 10 batches of 1000 posts should give 10000 rows
highered_complete.shape

(10000, 89)

In [207]:
# Looking at the columns in this data set. It has more columns than the career guidance dataset, but
# the extra ones won't be used for comparison, so I am ignoring them.
highered_complete.columns

Index(['all_awardings', 'allow_live_comments', 'approved_at_utc', 'author',
       'author_cakeday', 'author_created_utc', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'author_fullname',
       'author_id', 'author_patreon_flair', 'author_premium', 'awarders',
       'banned_at_utc', 'banned_by', 'brand_safe', 'can_mod_post',
       'contest_mode', 'created_utc', 'crosspost_parent',
       'crosspost_parent_list', 'distinguished', 'domain', 'edited',
       'full_link', 'gilded', 'gildings', 'id', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media',
       'media_embed', 'media_metadata', 'media_only', 'no_foll

In [208]:
# Names of the 15 columns I need (the same ones kept for the career guidance data)
highered_keep_cols = [
    'author',
    'created_utc',
    'full_link',
    'is_video',
    'media_only',
    'num_comments',
    'num_crossposts',
    'over_18',
    'score',
    'selftext',
    'subreddit',
    'title',
    'url',
    'crosspost_parent',
    'crosspost_parent_list',
]

# Selecting only those columns from the higher education data
highered_complete_selfeat = highered_complete[highered_keep_cols]

In [210]:
# Saving this subreddit selection as a csv in the current directory (index=False: the row index is not written)
highered_complete_selfeat.to_csv('./highered_df.csv', index=False)

#### Concatenating the two subreddits for cleaning 

In [213]:
# Concatenating both dataframes into one (career guidance rows first, then higher education rows)
two_subreddits = pd.concat([career_complete_selfeat, highered_complete_selfeat])

In [214]:
# Making sure I did what I wanted: the first rows should be career guidance posts
two_subreddits.head()

Unnamed: 0,author,created_utc,full_link,is_video,media_only,num_comments,num_crossposts,over_18,score,selftext,subreddit,title,url,crosspost_parent,crosspost_parent_list
0,KAMI_aka,1580305052,https://www.reddit.com/r/careerguidance/commen...,False,False,0,0.0,False,1,Im in my final year of my undergraduate degree...,careerguidance,Can I pursue a master's in engineering managem...,https://www.reddit.com/r/careerguidance/commen...,,
1,LostAMO,1580304222,https://www.reddit.com/r/careerguidance/commen...,False,False,2,0.0,False,1,[removed],careerguidance,Need advice on career change and from friends ...,https://www.reddit.com/r/careerguidance/commen...,,
2,PMMeYourMortys,1580302245,https://www.reddit.com/r/careerguidance/commen...,False,False,1,0.0,False,1,I’m utterly burning out. Every day for the pas...,careerguidance,Burnout: What freelance jobs can I do if I qui...,https://www.reddit.com/r/careerguidance/commen...,,
3,NotJobObsessed,1580301838,https://www.reddit.com/r/careerguidance/commen...,False,False,2,0.0,False,1,"Sometime ago, we moved from the north east to ...",careerguidance,Do I lack work ethic or am I being gaslighted?,https://www.reddit.com/r/careerguidance/commen...,,
4,NoxiousToxic,1580300094,https://www.reddit.com/r/careerguidance/commen...,False,False,1,0.0,False,1,"If this isn’t the place to ask, I will thank t...",careerguidance,I was curious: Can you exchange pay for a plac...,https://www.reddit.com/r/careerguidance/commen...,,


In [215]:
# Checking the last rows, which should be higher education posts
two_subreddits.tail()

Unnamed: 0,author,created_utc,full_link,is_video,media_only,num_comments,num_crossposts,over_18,score,selftext,subreddit,title,url,crosspost_parent,crosspost_parent_list
995,ESB605,1441202756,https://www.reddit.com/r/highereducation/comme...,,,0,,False,3,,highereducation,"Survey Examines Cooperation Between Faculty, L...",https://www.insidehighered.com/quicktakes/2015...,,
996,percytrappe,1441198833,https://www.reddit.com/r/highereducation/comme...,,,1,,False,3,,highereducation,University Humor – Erskine Bowles,https://academicanchor.wordpress.com/2012/12/0...,,
997,rellotscire,1441192885,https://www.reddit.com/r/highereducation/comme...,,,0,,False,3,,highereducation,Are we nearing the end of college tuition pric...,http://www.washingtonpost.com/news/grade-point...,,
998,rellotscire,1441192140,https://www.reddit.com/r/highereducation/comme...,,,2,,False,9,,highereducation,Why Students With Smallest Debts Have the Larg...,http://www.nytimes.com/2015/09/01/upshot/why-s...,,
999,rellotscire,1441184502,https://www.reddit.com/r/highereducation/comme...,,,1,,False,11,,highereducation,Daily Marijuana Use Among College Students Hig...,http://detroit.cbslocal.com/2015/09/01/higher-...,,


In [216]:
# Making sure that I have 20000 rows (10000 per subreddit)
two_subreddits.shape

(20000, 15)

In [218]:
# Saving this dataframe as a csv (index=False: the row index is not written)
two_subreddits.to_csv('./two_subreddits.csv', index=False)