# Import and install useful libraries

In [1]:
!pip install praw -q
!pip install psaw -q

In [2]:
import datetime
import os

import pandas as pd
import praw
from google.colab import files
from psaw import PushshiftAPI
# Show full cell contents (no truncation of long comment texts) when displaying frames
pd.set_option('display.max_colwidth', None)
# Show every column instead of eliding the middle ones in wide frames
pd.set_option('display.max_columns', None)

In [3]:
!mkdir CSV_FILES   # folder for saving the csv
%cd /content/CSV_FILES

mkdir: cannot create directory ‘CSV_FILES’: File exists
/content/CSV_FILES


# Authentication
Follow [this article](https://towardsdatascience.com/scraping-reddit-data-1c0af3040768) to obtain your Reddit API client ID, client secret and user agent.

In [4]:
# Authenticate
# Credentials are read from environment variables so they do not have to be
# typed into (and saved with) the notebook. When a variable is not set the
# value falls back to the original empty-string placeholder.
my_client_id = os.environ.get('REDDIT_CLIENT_ID', '')
my_client_secret = os.environ.get('REDDIT_CLIENT_SECRET', '')
my_user_agent = os.environ.get('REDDIT_USER_AGENT', '')

# Pushshift client, used to search submissions by keyword
api = PushshiftAPI()
# PRAW client, used to download the comments of a submission.
# check_for_async=False turns off PRAW's asynchronous-environment check.
reddit = praw.Reddit(client_id=my_client_id, client_secret=my_client_secret, user_agent=my_user_agent, check_for_async=False)

# Extract comments by URL

In [5]:
# Function to scrape the comments of reddit threads and return them as parallel lists
def reddit_scraper(urls):
  """Collect the comments of the given Reddit submissions.

  Uses the module-level ``reddit`` client.

  Args:
    urls: iterable of submission URLs.

  Returns:
    A tuple ``(bodies, timestamps, scores)`` of three parallel lists holding,
    for each comment, its ``body``, ``created_utc`` and ``score``. Comments
    are appended submission by submission, in the order of ``urls``.
  """
  bodies, timestamps, scores = [], [], []
  for link in urls:
    thread = reddit.submission(url=link)
    # Expand the "load more comments" placeholders before flattening the tree
    thread.comments.replace_more(limit=None)
    flat_comments = thread.comments.list()
    bodies.extend(item.body for item in flat_comments)
    timestamps.extend(item.created_utc for item in flat_comments)
    scores.extend(item.score for item in flat_comments)
  return bodies, timestamps, scores

In [6]:
# Threads whose comments are scraped directly by URL
urls = [
    'https://www.reddit.com/r/asklatinamerica/comments/n4fsua/what_the_fuck_is_going_on_in_colombia/',
    'https://www.reddit.com/r/asklatinamerica/comments/nef330/colombians_and_non_colombians_are_you_aware_of/',
]

# reddit_scraper returns three parallel lists; transposing gives one row per
# comment with columns 0 (body), 1 (created_utc) and 2 (score)
df = pd.DataFrame(data=reddit_scraper(urls)).transpose()

In [7]:
# Preview the URL-scraped comments; columns 0, 1 and 2 hold body, created_utc and score
df

Unnamed: 0,0,1,2
0,[deleted],1.62023e+09,7
1,"The absolute state of the answers.\n\nShort response: We were protesting a tax reform and it's getting intense. Tax reform is done at this point it's pretty clear it's gonna continue because people are DONE with this government.\n\nPolice repression, military action, estate violence, it's all in full effect. \n\nLa policía está matando civiles en Cali. Y cada día está peor.",1.62013e+09,29
2,Nos emberracamos,1.62014e+09,3
3,"Is there a good online source to follow this situation? Also, is it just Cali or other cities as well?",1.62014e+09,2
4,Fucking Duque,1.62014e+09,3
...,...,...,...
93,If you generate billions of dollars but it all pools at the top then you still haven't solved poverty. Reducing inequality is very effective at bringing people out of poverty.,1.62127e+09,1
94,New Zealand doesn't border the world's largest drug consumer who also sells weapons to the cartels. Reducing the government's size and power in countries like Mexico or Colombia would be insane for that same reason. Cartels would just step up and assume full political power.,1.62134e+09,0
95,"Latin America is not producing ""billions of dollars"", if we reduce ineaquality to the GINI point of 0 (every person receives the same income), everybody would still be as poor as the lower classes of Europe. Iraq is two times more equal than Chile.",1.62132e+09,1
96,"I'm saying if you were to create all that money, it is completely useless if it doesn't reach 99% of the population. Which it won't, because trickle-down economics are bullshit.\n\nAnd there are plenty of Latin American billionaires.",1.62134e+09,1


# Extract comments by keywords

In [8]:
# Search window (epoch seconds) and function to get the comments from subreddit search results

# Search window as epoch seconds. The datetimes are made timezone-aware (UTC)
# because a naive datetime's .timestamp() is interpreted in the machine's local
# timezone, which would shift the window depending on where the notebook runs.
start_time = int(datetime.datetime(2021, 4, 1, tzinfo=datetime.timezone.utc).timestamp())
end_time = int(datetime.datetime(2021, 8, 29, tzinfo=datetime.timezone.utc).timestamp())


def get_comments(subreddit, keyword, limit=100):
  """Collect the comments of submissions matching a keyword in a subreddit.

  Submissions are searched through the module-level Pushshift client ``api``
  within the ``start_time``/``end_time`` window; the comments of every
  submission that has at least one comment are then fetched with
  ``reddit_scraper``.

  Args:
    subreddit: name of the subreddit to search.
    keyword: search query passed to Pushshift as ``q``.
    limit: maximum number of submissions requested from Pushshift
      (defaults to 100, the value previously hard-coded).

  Returns:
    A tuple ``(posts, date, score)`` of three parallel lists with the body,
    ``created_utc`` and score of each comment. All three lists are empty when
    the search returns nothing usable.
  """
  # Get the URLs
  submissions = list(api.search_submissions(after=start_time, before=end_time,
                             subreddit=subreddit,
                             q=keyword,
                             filter=['full_link', 'num_comments'],
                             limit=limit))
  sub = pd.DataFrame(submissions)

  # An empty search yields a frame without the requested columns; return empty
  # lists instead of failing with a KeyError on the filter below.
  if sub.empty or not {'full_link', 'num_comments'}.issubset(sub.columns):
    return [], [], []

  # Only posts with comments
  urls = sub.loc[sub['num_comments'] > 0, 'full_link'].tolist()

  # Get the comments (same procedure as for the URL-based extraction)
  return reddit_scraper(urls)


In [9]:
# Creating dataframes for different keywords searched in the 'Colombia' subreddit
# (one frame per keyword, one row per comment)
df1, df2, df3, df4 = (
    pd.DataFrame(get_comments('Colombia', term)).transpose()
    for term in ('paro', 'protesta', 'strike', 'ddhh')
)

In [10]:
# Data as a single dataframe: stack all frames, name the columns, convert the
# epoch-second timestamps to datetimes and order the comments chronologically
reddit_data = (
    pd.concat([df, df1, df2, df3, df4], ignore_index=True)
    .rename(columns={0: 'comment', 1: 'date', 2: 'score'})
    .assign(date=lambda frame: pd.to_datetime(frame['date'], unit='s'))
    .sort_values(by=['date'])
)
reddit_data

Unnamed: 0,comment,date,score
1639,[removed],2021-04-16 03:59:28,1
1641,Su post o comentario ha sido removido por violar la regla de auto-promoción o spam del sub.\n\nYour post or comment has been removed due to its violation of the self-promotion or spam rule of the sub.,2021-04-16 07:39:42,1
1640,El pan nuestro de cada día.,2021-04-16 15:00:21,1
1585,"Asista bajo su propio riesgo, si finalmente va a asistir, lleve un morral con agua, mascarillas adicionales, alcohol en gel y pañitos húmedos, una gorra, use zapatos cómodos. No lleve billetera, solo vaya con un documento de identidad y solo dinero en efectivo. Tampoco lleve celular de alta gama, las situaciones se salen muy fácil de control y si puede ir acompañado y avisando a algún familiar o amigo en que parte se encuentra y establezca un sitio de recogida en caso que la situación no de más.\n\nSi la situación se sale de control, retírese del lugar, en las manifestaciones violentas los que pagan el ""pato"" son los menos experimentados o los curiosos.",2021-04-23 19:39:04,1
1586,Watasandia,2021-04-24 04:11:21,1
...,...,...,...
107,"Por que proponen soluciones peores que los problemas actuales. \n\n""R3nt4 b4s1c4 y4 p4rc3r0s"" LOL",2021-08-26 17:13:16,-8
104,Yo también quisiera ir el mes de septiembre pero esta en el nivel 4 en el Travel Advisory. Dice que hay terrorismo.,2021-08-26 21:16:19,0
101,"No existe ""ilegalidad"" en el movimiento, el problema es cuando aparece la policía, usted puede unirse a la marcha sin problema, puede caminar por el anden al lado sin mezclarse con los marchantes, pero el problema es cuando aparece la policia, los negocios en general no cierran a menos que existan disturbios, por otro lado si necesita moverse es mejor evitar las vias donde hay marchas para no perder tiempo cruzando esas vias.",2021-08-26 23:26:52,2
103,"Por 4 pelagatos que van no va a pasar nada, es pura bulla, ya nadie apoya a esa gente que no estudia ni trabaja, solo hacen ""paro nacional"" cada 8 días",2021-08-27 04:15:48,1


In [11]:
# Save as csv
# The relative path resolves against the current working directory; the row
# index is written as the first column because `index` is left at its default.
reddit_data.to_csv('reddit_data.csv')