## Requirements

In [77]:
from distutils.filelist import findall
import pandas as pd
from pmaw import PushshiftAPI
import datetime as dt
import os
import requests
import re
from bs4 import BeautifulSoup as bs
import spacy_udpipe

## Retreiving posts

#### Load Pushshift

In [78]:
api = PushshiftAPI()

#### Search parameters


##### Specify time frame

In [79]:
start_time = int(dt.datetime.timestamp(dt.datetime.strptime('2011-02-14 00:00:00', '%Y-%m-%d %H:%M:%S'))) #time EarthPorn was created
end_time = int(dt.datetime.timestamp(dt.datetime.strptime('2022-08-01 00:00:00', '%Y-%m-%d %H:%M:%S')))
current_time = int(dt.datetime.timestamp(dt.datetime.now()))

# Create string specifying time frame that can be used for file name when saving data as csv
search_time = '20110214-20220816' 

In [80]:
print(end_time)

1659304800


##### Specify subreddit and search limit

In [81]:
# Set subreddit and limit
subreddit = 'EarthPorn'
limit = 500

#### Query posts from pushshift using search_submissions with default parameters

Default parameters:  
max_ids_per_request = 500 (max)  
max_results_per_request = 100 (max)  
mem_safe = False -> stores responses in cache during operation if True  
safe_exit = False -> will safely exit if interupted by storing current responses and requests in the cache if True  
cache_dir -> path to cache responses in when mem_safe or safe_exit is enabled  

In [82]:
posts = api.search_submissions(subreddit=subreddit, limit=limit)
print(f'Retrieved {len(posts)} posts from Pushshift')

Retrieved 500 posts from Pushshift


#### Create data frame for posts

In [83]:
post_list = [post for post in posts]
posts_df = pd.DataFrame(post_list)

In [84]:
# Test amount of duplicates
test = posts_df["url"]
print(len(test))
print(len(test.drop_duplicates()))

500
496


In [85]:
root_folder = '/Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/'

def download(row):
   filename = root_folder + subreddit + '_' + row['id'] + '.jpg'

   # create folder if it doesn't exist
   os.makedirs(os.path.dirname(filename), exist_ok = True)

   url = row.url

   print(f"Downloading {url} to {filename}")
   r = requests.get(url, allow_redirects=True)
   with open(filename, 'wb') as f:
       f.write(r.content)

try:
    posts_df.apply(download, axis=1)
except:
    pass

Downloading https://i.redd.it/0p41a8xt1e1a1.jpg to /Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/EarthPorn_z1e7kd.jpg
Downloading https://i.redd.it/d680tr8xlf1a1.jpg to /Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/EarthPorn_z1e66r.jpg
Downloading https://i.redd.it/8ess1pn3sd1a1.jpg to /Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/EarthPorn_z1cicg.jpg
Downloading https://i.redd.it/vhq3nfw76f1a1.jpg to /Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/EarthPorn_z1bzmz.jpg
Downloading https://i.redd.it/iow0t7dikd1a1.jpg to /Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/EarthPorn_z1bfc2.jpg
Downloading https://i.redd.it/hz7nt84a1f1a1.jpg to /Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/EarthPorn_z1b8xx.jpg
Downloading https://i.redd.it/pxc939gs0f1a1.jpg to /Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/Data/EarthPorn_z1b6dg.jpg

Preview sample of posts data

In [86]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', 25)
print(posts_df)

     subreddit selftext author_fullname  gilded  \
0    EarthPorn              t2_1p6cmtae       0   
1    EarthPorn              t2_tllcjn6z       0   
2    EarthPorn              t2_21tfccf2       0   
3    EarthPorn              t2_a758y5rm       0   
4    EarthPorn               t2_ola2kwf       0   
..         ...      ...             ...     ...   
495  EarthPorn              t2_lvp4l9e3       0   
496  EarthPorn              t2_596txlac       0   
497  EarthPorn              t2_u80hogu3       0   
498  EarthPorn                t2_13na3y       0   
499  EarthPorn                 t2_mf97n       0   

                                                                                                                                                         title  \
0                                                                                        Waimea Canyon, Kauai - colorful even on a hazy day [OC] [5462 × 3660]   
1                                                              

#### Clean up data frame

##### Get list of all column names

In [87]:
posts_df.columns

Index(['subreddit', 'selftext', 'author_fullname', 'gilded', 'title',
       'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls',
       'link_flair_css_class', 'thumbnail_height', 'top_awarded_type',
       'hide_score', 'quarantine', 'link_flair_text_color', 'upvote_ratio',
       'author_flair_background_color', 'subreddit_type',
       'total_awards_received', 'media_embed', 'thumbnail_width',
       'author_flair_template_id', 'is_original_content', 'secure_media',
       'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed',
       'link_flair_text', 'score', 'is_created_from_ads_ui', 'author_premium',
       'thumbnail', 'edited', 'author_flair_css_class',
       'author_flair_richtext', 'gildings', 'post_hint', 'content_categories',
       'is_self', 'link_flair_type', 'wls', 'removed_by_category',
       'author_flair_type', 'domain', 'allow_live_comments', 'suggested_sort',
       'url_overridden_by_dest', 'view_count', 'archived', 'no_follow',


##### Remove unwanted columns

In [88]:
#posts_df.drop(columns = ['author_flair_css_class', 'author_flair_text'], inplace = True)

#posts_df.drop(columns = ['author_flair_css_class', 'author_flair_text', 'gilded', 'mod_reports', 'user_reports', 'brand_safe', 'contest_mode', 'spoiler', 'suggested_sort', 'author_flair_richtext', 'author_flair_type', 'can_mod_post', 'link_flair_richtext', 'link_flair_text_color', 'link_flair_type', 'rte_mode', 'subreddit_type', 'thumbnail_height', 'thumbnail_width', 'author_flair_background_color', 'author_flair_text_color', 'author_patreon_flair', 'gildings', 'is_robot_indexable', 'link_flair_background_color', 'send_replies', 'no_follow', 'updated_utc', 'all_awardings', 'allow_live_comments', 'author_premium', 'awarders', 'total_awards_received', 'treatment_tags', 'is_created_from_ads_ui', 'parent_whitelist_status', 'pwls', 'url_overridden_by_dest', 'whitelist_status', 'wls', 'removed_by_category', 'author_is_blocked', 'approved_at_utc', 'banned_at_utc', 'steward_reports', 'removed_by', 'poll_data', 'top_awarded_type', 'retrieved_on'], inplace = True)

##### Change column names and reorder columns

1. Create dictionary - 'old name' : 'new name'

In [89]:
column_names = {'id' : 'PostID',
                'subreddit' : 'Subreddit',
                'subreddit_id' : 'SubredditID',
                'created_utc' : 'PostTime',
                'title' : 'PostTitle',
                'author' : 'Username',
                'author_created_utc' : 'UserCreatedTime',
                'author_fullname' : 'AuthorName', 
                'domain' : 'ImageDomain',
                'full_link' : 'Link',
                'is_self' : 'IsTextPost',
                'media_embed' : 'EmbeddedMedia',
                'secure_media_embed' : 'SecureEmbeddedMedia',
                'num_comments' : 'CommentNumber', 
                'over_18' : 'NSFW',
                'permalink' : 'Permalink', 
                'score' : 'Upvotes', 
                'selftext' : 'PostText', 
                'thumbnail' : 'Thumbnail',
                'url' : 'ImageURL',
                'media' : 'Media',
                'secure_media' : 'SecureMedia',
                'stickied' : 'Stickied',
                'locked' : 'CommentsLocked',
                'post_hint' : 'PostHint',
                'preview' : 'Preview',
                'is_crosspostable' : 'IsCrosspostable',
                'is_reddit_media_domain' : 'IsRedditMediaDomain',
                'is_video' : 'IsVideo',
                'num_crossposts' : 'CrosspostsNumber', 
                'pinned' : 'Pinned',
                'crosspost_parent' : 'CrosspostParent',
                'crosspost_parent_list' : 'CrosspostParentList',
                'is_meta' : 'IsMeta',
                'is_original_content' : 'IsOriginal',
                'media_only' : 'OnlyMedia', 
                'subreddit_subscribers' : 'SubRedditSubscribers',
                'media_metadata' : 'MediaMetadata', 
                'upvote_ratio' : 'UpvoteRatio', 
                'gallery_data' : 'GalleryData', 
                'is_gallery' : 'IsGallery', 
                'author_cakeday' : 'AuthorBirthdate',
                'edited' : 'Edited', 
                'view_count' : 'ViewCount', 
                'author_id' : 'AuthorID',
                'og_description' : 'OGDescription',
                'og_title' : 'OGTitle'}

2. Rename columns using dictionary

In [92]:
posts_tidy_df = posts_df.rename(columns = column_names)
# Check to see if columns have been renamed
posts_tidy_df.columns 

Index(['Subreddit', 'PostText', 'AuthorName', 'gilded', 'PostTitle',
       'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls',
       'link_flair_css_class', 'thumbnail_height', 'top_awarded_type',
       'hide_score', 'quarantine', 'link_flair_text_color', 'UpvoteRatio',
       'author_flair_background_color', 'subreddit_type',
       'total_awards_received', 'EmbeddedMedia', 'thumbnail_width',
       'author_flair_template_id', 'IsOriginal', 'SecureMedia',
       'IsRedditMediaDomain', 'IsMeta', 'category', 'SecureEmbeddedMedia',
       'link_flair_text', 'Upvotes', 'is_created_from_ads_ui',
       'author_premium', 'Thumbnail', 'Edited', 'author_flair_css_class',
       'author_flair_richtext', 'gildings', 'PostHint', 'content_categories',
       'IsTextPost', 'link_flair_type', 'wls', 'removed_by_category',
       'author_flair_type', 'ImageDomain', 'allow_live_comments',
       'suggested_sort', 'url_overridden_by_dest', 'ViewCount', 'archived',
       'no_follow'

3. Reorder columns

In [104]:
sum(posts_tidy_df['removed_by_category'] != True)

500

In [67]:
#posts_tidy_df = posts_tidy_df[['Subreddit', 'SubredditID', 'PostTitle', 'PostID', 'PostTime', 'Username', 'Upvotes', 'CommentNumber', 'ImageDomain', 'ImageURL', 'UserCreatedTime', 'AuthorName', 'Permalink', 'Link', 'IsTextPost', 'PostText', 'EmbeddedMedia', 'Thumbnail', 'NSFW']]

#posts_tidy_df = posts_tidy_df[['Subreddit', 'SubredditID', 'PostTitle', 'PostID', 'PostTime', 'Username', 'Upvotes', 'CommentNumber', 'ImageDomain', 'ImageURL', 'AuthorName', 'Permalink', 'IsTextPost', 'PostText', 'EmbeddedMedia', 'Thumbnail', 'NSFW']]
                                       
posts_tidy_df = posts_tidy_df[['Subreddit', 'SubredditID', 'PostTitle', 'PostID', 'PostTime', 'Username', 'Upvotes', 'CommentNumber', 'ImageDomain', 'ImageURL', 'Permalink', 'IsTextPost', 'PostText', 'EmbeddedMedia', 'Thumbnail', 'NSFW']]

#posts_reordered_df = posts_renamed_df[['Subreddit', 'SubredditID', 'PostTitle', 'PostID', 'PostTime', 'Username', 'ViewCount', 'Upvotes', 'UpvoteRatio', 'CommentNumber', 'Edited', 'OGDescription', 'OGTitle', 'ImageDomain', 'ImageURL', 'Permalink', 'Link', 'IsTextPost', 'PostText', 'UserCreatedTime', 'AuthorID', 'AuthorName', 'AuthorBirthdate', 'IsVideo', 'IsMeta', 'IsOriginal', 'IsGallery', 'GalleryData', 'IsRedditMediaDomain', 'IsCrosspostable', 'CrosspostsNumber', 'CrosspostParent', 'CrosspostParentList', 'SubRedditSubscribers', 'OnlyMedia', 'MediaMetadata', 'EmbeddedMedia', 'SecureEmbeddedMedia', 'Media', 'SecureMedia', 'Thumbnail', 'Stickied', 'Pinned', 'PostHint', 'Preview', 'CommentsLocked', 'NSFW']]

Convert time stamp from UNIX to UTC

In [68]:
posts_tidy_df['PostTime'] = pd.to_datetime(posts_tidy_df['PostTime'], utc=True, unit='s')

#### Fix image URLS

In [69]:
#posts_tidy_df = posts_tidy_df.reindex(columns = posts_tidy_df.columns.tolist() + ['NewURL']) #create column for fixed urls

for index, row in posts_tidy_df.iterrows():
    if row['ImageDomain'] == 'flickr.com':
        print(row['ImageURL'])
        r = requests.get(row['ImageURL'])
        soup = bs(r.content)
        images = re.findall(r'(https:\/\/live\.staticflickr\.com\/[0-9][0-9][0-9][0-9]\/[a-zA-Z0-9_]+\.(?:png|jpg|jpeg|gif|png|svg))', str(soup))
        image_url = None
        for image in images:
            image_url = image
            break
        print(image_url)
        if image_url != None:
            posts_tidy_df.at[index, 'NewURL'] = image_url
    elif row['ImageDomain'] == 'imgur.com':
        posts_tidy_df.at[index, 'NewURL'] = re.sub(r'http://imgur.com', 'http://i.imgur.com', row['ImageURL']) + '.jpg'
    elif row['ImageDomain'] == 'i.imgur.com':
        posts_tidy_df.at[index, 'NewURL'] = row['ImageURL']
    elif row['ImageDomain'] == 'i.redd.it':
        posts_tidy_df.at[index, 'NewURL'] = row['ImageURL']
    else:
        continue

https://flickr.com/photos/atilla2008/52663768874/sizes/3k/
None


In [76]:
# Der er ikke så mange andre billeder som ikke har upvotes anderledes end 1
sum(posts_tidy_df['Upvotes'] != 1)

14

In [71]:
print("Rows before: ", len(posts_tidy_df))
posts_tidy_df = posts_tidy_df.dropna()
posts_tidy_df = posts_tidy_df.drop_duplicates()
print("Rows after: ", len(posts_tidy_df))

Rows before:  500


TypeError: unhashable type: 'dict'

## Save data frame and images

#### View data frame

In [None]:
pd.set_option('display.max_colwidth', None)
posts_tidy_df

Unnamed: 0,Subreddit,PostText,AuthorName,gilded,PostTitle,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,thumbnail_height,top_awarded_type,hide_score,quarantine,link_flair_text_color,UpvoteRatio,author_flair_background_color,subreddit_type,total_awards_received,EmbeddedMedia,thumbnail_width,author_flair_template_id,IsOriginal,SecureMedia,IsRedditMediaDomain,IsMeta,category,SecureEmbeddedMedia,link_flair_text,Upvotes,is_created_from_ads_ui,author_premium,Thumbnail,Edited,author_flair_css_class,author_flair_richtext,gildings,PostHint,content_categories,IsTextPost,link_flair_type,wls,removed_by_category,author_flair_type,ImageDomain,allow_live_comments,suggested_sort,url_overridden_by_dest,ViewCount,archived,no_follow,IsCrosspostable,Pinned,NSFW,Preview,all_awardings,awarders,OnlyMedia,can_gild,spoiler,CommentsLocked,author_flair_text,treatment_tags,removed_by,distinguished,SubredditID,link_flair_background_color,PostID,is_robot_indexable,Username,discussion_type,CommentNumber,send_replies,whitelist_status,contest_mode,author_patreon_flair,author_flair_text_color,Permalink,parent_whitelist_status,Stickied,ImageURL,SubRedditSubscribers,PostTime,CrosspostsNumber,Media,IsVideo,retrieved_utc,updated_utc,utc_datetime_str,AuthorBirthdate,NewURL


#### Save data frame as CSV

In [None]:
filename = '/Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/' + subreddit + '_' + search_time + '.csv'
posts_tidy_df.to_csv(filename, header=True, index=False, columns=list(posts_tidy_df.axes[1]))

#### Save images from URLs

In [None]:
#Save images from data frame URL column
root_folder = '/Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/'

def download(row):
   filename = root_folder + subreddit + '_' + row['PostID'] + '.jpg'

   # create folder if it doesn't exist
   os.makedirs(os.path.dirname(filename), exist_ok = True)

   url = row.NewURL

   print(f"Downloading {url} to {filename}")
   r = requests.get(url, allow_redirects=True)
   with open(filename, 'wb') as f:
       f.write(r.content)

try:
    posts_tidy_df.apply(download, axis=1)
except:
    pass

## Text cleaning and annotating features

> this might be useful later on to create a list of features mentioned in the text for each picture

#### Remove brackets and other characters

In [None]:
posts_clean_df = posts_tidy_df.rename(columns = column_names)
posts_clean_df['PostTitle'].replace(to_replace="\[(.*?)\]", value="", regex=True, inplace=True) 
posts_clean_df['PostTitle'].replace(to_replace="\(\d*?\s*[\u00D7?x?]\s*\d*?\)", value="", regex=True, inplace=True)
posts_clean_df['PostTitle'].replace(to_replace="\(", value="", regex=True, inplace=True)
posts_clean_df['PostTitle'].replace(to_replace="\)", value="", regex=True, inplace=True)
posts_clean_df['PostTitle'].replace(to_replace="-", value="", regex=True, inplace=True)

#### Load NLP model

In [None]:
spacy_udpipe.download("en")
nlp = spacy_udpipe.load("en")

Already downloaded a model for the 'en' language


##### Create new data frame for annotations

In [None]:
column_names = ['Sentence', 'Text ID', 'IDX', 'Text', 'Lemma', 'POS', 'Form', 'Dependency', 'Sentiment'] 
posts_annotated_df = pd.DataFrame(columns=column_names)

#### Create empty lists to store token values in

In [None]:
sent = []
i = []
idx = []
word = []
lemma = []
pos = []
tag = []
dep = []
sentiment = []
form = []

#### Tokenize post titles

In [None]:
for index, row in posts_clean_df.iterrows():
    text = row['PostTitle']
    doc = nlp(text)
    for token in doc:
        #print('Sentence:' + token.sent)
        sent.append(token.sent)
        i.append(token.i)
        idx.append(token.idx)
        word.append(token.text)
        lemma.append(token.lemma_)
        pos.append(token.pos_)
        form.append(token.morph.get("VerbForm"))
        tag.append(token.tag_)
        dep.append(token.dep_)
        sentiment.append(token.sentiment)
       

#### Add token annotations to data frame

In [None]:
posts_annotated_df['Sentence'] = sent
posts_annotated_df['Text ID'] = i
posts_annotated_df['Text'] = word
posts_annotated_df['Lemma'] = lemma
posts_annotated_df['POS'] = pos
posts_annotated_df['VerbForm'] = form
posts_annotated_df['Dependency'] = dep
posts_annotated_df['IDX'] = idx
posts_annotated_df['Sentiment'] = sentiment
posts_annotated_df['VerbForm'] = posts_annotated_df['VerbForm'].str[0]

print(posts_annotated_df)

AttributeError: Can only use .str accessor with string values!

##### Save annotations as CSV

In [None]:
filename = '/Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/' + subreddit + '_' + search_time + '.csv'
posts_annotated_df.to_csv(filename, header=True, index=False, columns=list(posts_annotated_df.axes[1]))

### Find features

#### Filter for nouns

In [None]:
nouns_df = posts_annotated_df[posts_annotated_df['POS'].str.contains("NOUN|PROPNOUN")]
features_df = nouns_df[nouns_df['Dependency'].str.contains('ROOT')]

#### Create list with features

In [None]:
features_list = features_df['Text'].tolist()

#### Create data frame with features and subreddit name

In [None]:
CES_features = pd.DataFrame()
CES_features['Features'] = features_list
CES_features['Subreddit'] = subreddit

#### Save features as CSV

In [None]:
#All_CES_features_updated.to_csv(features_filename, header=True, index=False, columns=list(All_CES_features_updated.axes[1]))

NameError: name 'All_CES_features_updated' is not defined

In [None]:
filename = '/Users/clarasofiechristiansen/Documents/Clara/DTU/Data_Fagprojekt/CES_features.csv'
CES_features.to_csv(filename, header=True, index=False, columns=list(CES_features.axes[1]))