# Scraping Reddit

### Imports

In [None]:
import praw
import pandas as pd
import datetime
from praw.models import MoreComments
import nltk
import nltk
nltk.download('vader_lexicon')
import nltk.sentiment

### Reddit API authentication:

In [None]:
import os

# NOTE(security): API credentials are committed in this notebook. The literals
# below are kept only as fallbacks so existing runs keep working; they should
# be revoked and replaced by the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET /
# REDDIT_USER_AGENT environment variables, which take precedence when set.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', 'BxrF2g9d3egQ4g'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET',
                                 'T-pkCmWCBxLjVlNZk88OjvcH7sk'),
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'SDS2019'),
)

### Download comments for movies on reddit with timestamp

In [None]:
# Function for getting Reddit comments for movie title
def get_reddit_comments(movie_title, max_submissions=5):
    """Collect comments from r/movies submissions matching a movie title.

    Searches the 'movies' subreddit for ``movie_title`` (sorted by relevance)
    and gathers the comments of the first ``max_submissions`` search results.

    Parameters
    ----------
    movie_title : str
        Search query; also stored in the 'movie' column of the result.
    max_submissions : int, optional
        Maximum number of search results whose comments are collected
        (default 5).

    Returns
    -------
    pandas.DataFrame
        One row per comment, indexed by 'date' (``created_utc`` interpreted
        as seconds since the epoch) and sorted ascending, with the columns
        'submission' (position of the submission in the search results),
        'comment_tree' (position in the flattened comment list),
        'comment' (comment body) and 'movie'.

    Errors raised while iterating the search results propagate to the
    caller; errors while reading one submission's comments are reported and
    the remaining comments of that submission are skipped.
    """
    # Get 'movies subreddit'
    subreddit = reddit.subreddit('movies')

    # Search Reddit
    searchresults = subreddit.search(movie_title, sort='relevance')

    # Each comment is stored as one complete tuple so the columns can never
    # get out of step if reading an attribute fails half-way through.
    rows = []
    for subidx, submission in enumerate(searchresults):

        # Stop once the requested number of submissions has been processed
        if subidx >= max_submissions:
            break

        try:
            # .list() flattens the comment forest; unexpanded MoreComments
            # placeholders remain in the list and are skipped below.
            for treeidx, comment in enumerate(submission.comments.list()):
                if isinstance(comment, MoreComments):
                    continue
                rows.append((subidx, treeidx, comment.body,
                             comment.created_utc))
        except Exception as exc:
            # Best effort: keep what was collected so far and move on to the
            # next submission (KeyboardInterrupt is deliberately not caught).
            print('Skipping remaining comments of submission', subidx,
                  'for', movie_title, '-', exc)

    # Store information in dataframe
    df = pd.DataFrame(rows,
                      columns=['submission', 'comment_tree', 'comment', 'date'])
    df['movie'] = movie_title
    df['date'] = pd.to_datetime(df['date'], unit='s')

    # Return dataframe of comments in chronological order
    return df.set_index(['date']).sort_index()

### Linguistic analysis of comments:

In [None]:
# Function for getting sentiment scores of comments
def sentiment(movie_df):
    """Return the mean VADER compound score and the number of comments.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Frame with a 'comment' column holding the comment texts.

    Returns
    -------
    tuple
        ``(mean_compound, n_comments)``. For a frame without any comments
        the result is ``(None, 0)``.
    """
    comments = movie_df['comment']

    # With no texts the score frame would have no 'compound' column and the
    # lookup below would raise KeyError, so report "no score" explicitly.
    if comments.empty:
        return None, 0

    vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()
    # polarity_scores returns one dict per comment; expand them into columns
    vader_df = pd.DataFrame(list(comments.apply(vader.polarity_scores)))
    vader_mean = vader_df['compound'].mean()
    return vader_mean, len(vader_df)

### Grabbing movies and release dates from movie dataframe

In [None]:
# Get dataframe with movies
# NOTE: unpickling can execute arbitrary code - only load 'full.pkl' if it
# comes from a trusted source.
df = pd.read_pickle('full.pkl')

# Keep movies from 2010 onwards. The explicit copy makes `df` an independent
# frame, so columns assigned to it later do not target a slice of the
# unfiltered data (which triggers pandas' SettingWithCopyWarning).
df = df[df['year'] >= 2010].copy()

# Convert dataframe columns into lists
movie_list = df['title'].tolist()
date_list = df['releaseDate'].tolist()

### Getting sentiment scores for all movies

In [None]:
# Award-related keywords counted in the comments. Matching is a
# case-insensitive substring test, so overlapping entries (e.g. 'Oscar' and
# 'Oscars') each contribute to the count.
words = ['Oscar', 'Oscars', 'Academy Award', 'Academy', 'Best Picture', 'Best Director', 'Best Actor', 'Best Actress',
         'Best Supporting Actor', 'Best Supporting Actress', 'Best Original Screenplay', 'Best Adapted Screenplay',
         'Best Animated Feature Film', 'Best Foreign Language Film', 'Golden Globe']

# Per-movie result lists. Every list must receive exactly one entry per
# movie, otherwise assigning them as columns of `df` below fails.
sentiments_before = []
sentiments_after = []
num_comments_before = []
num_comments_after = []
positive_words_before = []
positive_words_after = []

# Non-empty comment frames of all movies, concatenated once after the loop
frames_before = []
frames_after = []


def _count_award_mentions(comment_df):
    """Count keyword hits (one per keyword per comment); None if unavailable."""
    if comment_df is None:
        return None
    keywords = [word.lower() for word in words]
    try:
        return sum(keyword in text
                   for text in (comment.lower() for comment in comment_df['comment'])
                   for keyword in keywords)
    except (AttributeError, TypeError):
        # A non-text comment value; report the count as missing
        return None


def _window_sentiment(comment_df):
    """Return (mean compound score, number of comments) for one time window."""
    if comment_df is None:
        return None, None
    if comment_df.empty:
        # Nothing to score, but the comment count is known to be zero
        return None, 0
    try:
        return sentiment(comment_df)
    except Exception as exc:
        print('Sentiment scoring failed:', exc)
        return None, None


# Loop through all movies and get sentiment scores
i = 1
for movie, date in zip(movie_list, date_list):
    # Try to get the comments for the movie
    try:
        comments = get_reddit_comments(movie)
    except Exception as exc:
        print('Could not fetch comments for', movie, '-', exc)
        # Keep all six result lists aligned with the movie list
        for results in (sentiments_before, sentiments_after,
                        num_comments_before, num_comments_after,
                        positive_words_before, positive_words_after):
            results.append(None)
        continue

    # Start from "unavailable" so a failed step can never reuse the values
    # computed for the previous movie.
    comments_before = None
    comments_after = None

    # Comments up to one day before the release date
    try:
        comments_before = comments[:date - datetime.timedelta(days = 1)]
    except Exception as exc:
        print('Could not select pre-release comments for', movie, '-', exc)

    # Comments from the release date until 60 days after it
    try:
        comments_after = comments[date:date + datetime.timedelta(days = 60)]
    except Exception as exc:
        print('Could not select post-release comments for', movie, '-', exc)

    # Count award keywords before and after release
    positive_words_before.append(_count_award_mentions(comments_before))
    positive_words_after.append(_count_award_mentions(comments_after))

    # Remember the comment frames for the combined output files
    if comments_before is not None and not comments_before.empty:
        frames_before.append(comments_before)
    if comments_after is not None and not comments_after.empty:
        frames_after.append(comments_after)

    # Sentiment score and comment count for both windows
    mean_before, count_before = _window_sentiment(comments_before)
    mean_after, count_after = _window_sentiment(comments_after)
    sentiments_before.append(mean_before)
    sentiments_after.append(mean_after)
    num_comments_before.append(count_before)
    num_comments_after.append(count_after)

    # Progress: number of movies processed so far
    print(i)
    i += 1

# Store lists in dataframe
df['sentimentBefore'] = sentiments_before
df['sentimentAfter'] = sentiments_after
df['numCommentsBefore'] = num_comments_before
df['numCommentsAfter'] = num_comments_after
df['positiveWordsBefore'] = positive_words_before
df['positiveWordsAfter'] = positive_words_after

# Combine the per-movie comment frames (an empty frame if none were collected)
all_comments_before = pd.concat(frames_before) if frames_before else pd.DataFrame()
all_comments_after = pd.concat(frames_after) if frames_after else pd.DataFrame()

# Save dataframes
df.to_pickle('full_w_sentiment.pkl')
all_comments_before.to_pickle('all_comments_before.pkl')
all_comments_after.to_pickle('all_comments_after.pkl')