In [25]:
import requests
import re
import json
import time
import datetime
import sys
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv

# Populate the process environment from the .env file that find_dotenv() locates.
load_dotenv(find_dotenv())
# API key later substituted into the Google Places request URL; read from the
# MAP_API environment variable and None when that variable is not set.
maps_key = os.environ.get('MAP_API')

In [79]:
# Base Pushshift comment-search query (descending by created_utc, restricted to
# the u_Columbia1938 subreddit). get_25() appends a `before` bound to it.
q1 = 'https://api.pushshift.io/reddit/comment/search/?sort=desc&sort_type=created_utc&subreddit=u_Columbia1938'

# Loading in pre-saved Columbia reddit posts.
# The later flattening cell iterates this as a mapping whose values are lists
# of comment dicts. The file is read as UTF-8 explicitly so the result does not
# depend on the platform's default locale encoding.
with open('ex.json', 'r', encoding='utf-8') as f:
    ordered = json.load(f)

# Loading in pre-built encoding schema (looked up by response body text when
# the combined dataframe is built).
with open('encoding.json', 'r', encoding='utf-8') as f:
    encoding = json.load(f)

# Gets one page of reddit comments from the Columbia subreddit before a given date
def get_25(end_date):
    """Fetch Pushshift comments created before ``end_date``.

    The request URL is the module-level ``q1`` query with ``&before=<end_date>``
    appended. Both the bound and the final URL are echoed to stdout.

    Parameters
    ----------
    end_date
        Upper bound passed through ``str()`` into the ``before`` parameter.
        Presumably an epoch timestamp -- TODO confirm against callers. The
        page size (25 per the function name) is not set here, so it is
        whatever the API defaults to.

    Returns
    -------
    The decoded JSON body when the response status is 200, otherwise ``None``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, or when the server does not answer within the
        timeout (previously the call could block indefinitely).
    """
    print(end_date)
    url = q1 + '&before=' + str(end_date)
    print(url)
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        # Explicit rather than falling off the end of the function.
        return None
    return resp.json()

# Runs a location guess through the Google places API to get a lat and long
def ask_google(df, url_template=None, api_key=None, http_get=None):
    """Geocode every value of ``df['post_body_utc']`` via a find-place request.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``post_body_utc`` column holding the text to look up.
    url_template : str, optional
        Format string with two ``{}`` slots (query text, API key). Defaults to
        the module-level ``maps_url``, resolved at call time.
    api_key : str, optional
        Defaults to the module-level ``maps_key``.
    http_get : callable, optional
        ``http_get(url, timeout=...)`` returning an object with ``.json()``.
        Defaults to ``requests.get``; injectable so the function can be
        exercised without network access.

    Returns
    -------
    (lats, lons, names) : three lists, each with one entry per row of ``df``
        * match found -> latitude, longitude, lower-cased formatted address
          of the first candidate
        * no candidates -> ``'to_fix'`` in all three lists
        * body is ``'[deleted]'`` -> ``'to_delete'`` for lat/lon and
          ``'to_fix'`` for the name (no request is made)

    NOTE(review): deleted rows get ``'to_fix'`` (not ``'to_delete'``) as their
    name. That is preserved from the original; confirm it is intentional.
    """
    # Local import keeps this block self-contained within the notebook cell.
    from urllib.parse import quote

    if url_template is None:
        url_template = maps_url
    if api_key is None:
        api_key = maps_key
    if http_get is None:
        http_get = requests.get

    lats, lons, names = [], [], []
    # Iterating the column directly replaces per-row df.iloc[ind][...] lookups.
    for loc in df['post_body_utc']:
        if loc == '[deleted]':
            lats.append('to_delete')
            lons.append('to_delete')
            names.append('to_fix')
            continue

        # Percent-encode the whole guess. Replacing only spaces left
        # characters such as '?', '&', '#' and "'" free to corrupt the query.
        url = url_template.format(quote(loc, safe=''), api_key)
        resp = http_get(url, timeout=30).json()

        # A missing or empty candidate list is treated as "no match".
        candidates = resp.get('candidates') or []
        if not candidates:
            lats.append('to_fix')
            lons.append('to_fix')
            names.append('to_fix')
            continue

        best = candidates[0]
        location = best['geometry']['location']
        lats.append(location['lat'])
        lons.append(location['lng'])
        names.append(best['formatted_address'].lower())

    return lats, lons, names


# Empty dict placeholder; none of the visible cells read from or write to it.
second = dict()

In [27]:
# link_id values of the two conversations; the flattening cell keeps only
# comments whose 'link_id' equals one of these.
conv1, conv2 = 't3_qfiysg', 't3_py79a5'

In [28]:
start = time.time()

# Pushshift lookup for a single comment (by id) within a given submission.
PARENT_LOOKUP_URL = 'https://api.pushshift.io/reddit/comment/search/?link_id={}&ids={}'
# Seconds to wait for Pushshift before giving up instead of hanging forever.
PARENT_LOOKUP_TIMEOUT = 30


def _fetch_parent_pairs(replies, link_id, label):
    """Look up the parent comment of every reply in ``replies``.

    Returns a list of ``[reply, pushshift_json]`` pairs, one per request that
    came back with HTTP 200 (other statuses are skipped). ``label`` is only
    used in the progress message printed every 50 requests.
    """
    pairs = []
    total = len(replies)
    for idx, reply in enumerate(replies):
        url = PARENT_LOOKUP_URL.format(link_id, reply['parent_id'])
        resp = requests.get(url, timeout=PARENT_LOOKUP_TIMEOUT)
        if resp.status_code == 200:
            pairs.append([reply, resp.json()])
        if idx % 50 == 0 and idx != 0:
            print('Completed {} checks of {} for conversation {}'.format(idx, total, label))
    return pairs


def _has_parent(pair):
    """True when the Pushshift payload of ``pair`` carries at least one comment."""
    # Checking the 'data' list itself (rather than comparing the whole payload
    # to {'data': []}) also rejects empty results that carry extra keys, which
    # would otherwise fail later on ['data'][0].
    return bool(pair[1].get('data'))


def _reduce_pair(pair):
    """Keep only the fields needed downstream from a ``[reply, payload]`` pair.

    The parent comment (first entry of the payload's 'data') supplies the
    ``post_*`` fields; the reply supplies the ``response_*`` fields. Key order
    is significant: it becomes the column order of the later dataframe.
    """
    reply = pair[0]
    parent = pair[1]['data'][0]
    return {
        "post_time_utc": parent['created_utc'],
        "post_location_utc": parent['body'],
        "post_body_utc": parent['body'],
        "response_body": reply['body'],
        "response_time": reply['created_utc'],
    }


# flattening nested dicts: collect the comments belonging to each conversation
flattened_1 = []
flattened_2 = []
for group in ordered:
    for comment in ordered[group]:
        if comment['link_id'] == conv1:
            flattened_1.append(comment)
        elif comment['link_id'] == conv2:
            flattened_2.append(comment)

# pairing every saved comment of each 'game' with its parent comment from pushshift
pairs_1 = _fetch_parent_pairs(flattened_1, conv1, 1)
pairs_2 = _fetch_parent_pairs(flattened_2, conv2, 2)

# only keeping pairs whose parent comment was actually returned
full_pairs_1 = [pair for pair in pairs_1 if _has_parent(pair)]
full_pairs_2 = [pair for pair in pairs_2 if _has_parent(pair)]

print('conversation 1 pairs:', len(full_pairs_1), '\n conversation 2 pairs:', len(full_pairs_2), '\n', 'taking ', time.time() - start, 'seconds')

# selecting useful data from pushshift api response json objects (reducing feature count of future df)
reduced_pairs_1 = [_reduce_pair(pair) for pair in full_pairs_1]
reduced_pairs_2 = [_reduce_pair(pair) for pair in full_pairs_2]

Completed 50 checks of 193 for conversation 1
Completed 100 checks of 193 for conversation 1
Completed 150 checks of 193 for conversation 1
Completed 50 checks of 63 for conversation 2
conversation 1 pairs: 189 
 conversation 2 pairs: 62 
 taking  417.98276591300964 seconds


In [80]:
# Turning reduced pairs of reddit posts and Columbia responses into dataframes
pairs_1 = pd.DataFrame(reduced_pairs_1)
pairs_2 = pd.DataFrame(reduced_pairs_2)

# Adding a column to specify which reddit post these comments pertain to, post 1 or post 2.
pairs_1['root_post'] = 'post_1'
pairs_2['root_post'] = 'post_2'

# Concatenating the two dataframes now that we can undo it.
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each source frame's own labels.
combined_heavy = pd.concat([pairs_1, pairs_2], ignore_index=True)

# Ensuring no rows were lost
assert len(combined_heavy) == (len(pairs_1) + len(pairs_2))

# Encoding the Columbia responses via the pre-built schema. A response body
# missing from the schema raises KeyError rather than passing through silently.
combined_heavy['encoded'] = combined_heavy['response_body'].apply(lambda body: encoding[body])

# Converting the epoch timestamps to datetime objects.
# NOTE(review): datetime.fromtimestamp() without a tz argument returns naive
# *local* time, so post_time / response_time depend on the timezone of the
# machine running this cell. Kept as-is to preserve existing output; switch to
# an explicit UTC conversion if UTC is what is intended.
combined_heavy['post_time'] = combined_heavy['post_time_utc'].apply(datetime.datetime.fromtimestamp)

# Keeping the raw epoch value under a correctly named column ...
combined_heavy['response_time_utc'] = combined_heavy['response_time']

# ... and replacing response_time with its datetime form.
combined_heavy['response_time'] = combined_heavy['response_time_utc'].apply(datetime.datetime.fromtimestamp)

# Removing partially implemented vector encodings: keep only digits, '+' and '-'
combined_heavy['encoded_clean'] = combined_heavy['encoded'].apply(lambda code: re.sub('[^0-9+-]', '', code))

# Define Google Places API url (slots: url-encoded query text, API key)
maps_url = 'https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input={}&inputtype=textquery&fields=formatted_address%2Cname%2Cgeometry&key={}'

# Applying Google Places API to each reddit user guess to obtain a latitude and longitude.
# This issues one request per non-deleted row, so re-running the cell repeats every lookup.
lats, lons, names = ask_google(combined_heavy)

# Adding in new lat, lon and name columns (lists are assigned positionally)
combined_heavy['lat'] = lats
combined_heavy['lon'] = lons
combined_heavy['name'] = names

# Dropping the duplicate / intermediate columns and renaming the misnamed one.
# The rename targets the column by name instead of by its position.
combined_heavy = (
    combined_heavy
    .drop(columns=['post_location_utc', 'encoded'])
    .rename(columns={'post_body_utc': 'post_body'})
)

In [81]:
# Report the frame's (rows, columns), then preview its first five rows.
print(tuple(combined_heavy.shape))
combined_heavy.head(n=5)

(251, 11)


Unnamed: 0,post_time_utc,post_body,response_body,response_time,root_post,post_time,response_time_utc,encoded_clean,lat,lon,name
0,1635433244,Hell's canyon?,Warm!,2021-10-28 16:04:51,post_1,2021-10-28 08:00:44,1635462291,1,34.918075,-112.278501,"hell canyon, arizona, usa"
1,1635433325,[deleted],Getting colder!,2021-10-28 16:04:40,post_1,2021-10-28 08:02:05,1635462280,-1,to_delete,to_delete,to_fix
2,1635443314,Maine,Cold!,2021-10-28 16:03:17,post_1,2021-10-28 10:48:34,1635462197,-1,45.253783,-69.445469,"maine, usa"
3,1635452573,"Sterling, CO",This guess is a little bit chilly,2021-10-28 16:02:08,post_1,2021-10-28 13:22:53,1635462128,-1,40.625541,-103.207709,"sterling, co 80751, usa"
4,1635450547,"Joshua Tree, California",Cold!,2021-10-28 16:01:29,post_1,2021-10-28 12:49:07,1635462089,-1,34.134728,-116.313066,"joshua tree, ca 92252, usa"


In [82]:
# Persist the combined frame to disk; the row index is not written.
combined_heavy.to_csv(path_or_buf='reddit_data.csv', index=False)