<a href="https://colab.research.google.com/github/NikolaiMiranda/reddit-sentiment-analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# Import Packages
import asyncio
import csv
import datetime
import logging
import time

import asyncpraw
import asyncprawcore
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [53]:
# Reddit API Credentials
from google.colab import userdata

# The credentials are looked up through Colab's `userdata` store under the
# keys 'clientID' and 'clientSecret', so they are never written in the notebook.
reddit_credentials = {
    'client_id': userdata.get('clientID'),
    'client_secret': userdata.get('clientSecret'),
    'user_agent': "SentimentAnalyzerColab1.0",
}

# Client object used by the later cells to talk to Reddit
reddit = asyncpraw.Reddit(**reddit_credentials)

In [54]:
# Input the subreddit you want to analyze
subreddit_name = input("Enter a subreddit name: ")

# Get the subreddit
subreddit = await reddit.subreddit(subreddit_name)

Enter a subreddit name: biltrewards


In [55]:
# Select the dates you want to analyze data for
# Loop until both dates parse and are in the correct order.
#
# Produces:
#   start_datetime / end_datetime   - naive datetimes for the chosen days
#                                     (end is set to 23:59:59 of the end day)
#   start_timestamp / end_timestamp - the same instants as Unix timestamps,
#                                     with the dates interpreted as UTC
while True:
    start_date_str = input("Enter start date (MM-DD-YYYY): ").strip()
    end_date_str = input("Enter end date (MM-DD-YYYY): ").strip()

    # Error checking for timestamps: only the parsing can raise ValueError,
    # so only the parsing lives inside the try block.
    try:
        start_datetime = datetime.datetime.strptime(start_date_str, "%m-%d-%Y")
        end_datetime = datetime.datetime.strptime(end_date_str, "%m-%d-%Y").replace(hour=23, minute=59, second=59)
    except ValueError:
        print("Invalid date format. Please use MM-DD-YYYY.")
        continue

    if end_datetime < start_datetime:
        print("End date must be after start date. Please try again.")
        continue # Ask for dates again if end date is before start date

    # These bounds are compared against Reddit's `created_utc` values, so the
    # dates must be read as UTC. Calling .timestamp() on a naive datetime would
    # instead use the kernel's local timezone and shift the window on any
    # machine that is not running in UTC.
    start_timestamp = start_datetime.replace(tzinfo=datetime.timezone.utc).timestamp()
    end_timestamp = end_datetime.replace(tzinfo=datetime.timezone.utc).timestamp()
    break # Exit loop if dates are valid and in correct order

Enter start date (MM-DD-YYYY): 07-20-2025
Enter end date (MM-DD-YYYY): 07-30-2025


In [56]:
# Initialize an empty list to store the data
data = []

# Begin fetching posts
async for submission in subreddit.new(limit=None): # Fetching all posts (no limit)

  try:
    await submission.load()
  except asyncprawcore.exceptions.NotFound:
    print(f"Submission {submission.id} not found (likely deleted). Skipping.")
    continue # Skip to the next submission
  except Exception as e:
    print(f"Error loading submission {submission.id}: {e}. Skipping this post.")
    continue # Skip to the next submission

  created_utc_ts = submission.created_utc
  if created_utc_ts < start_timestamp:
    break

  if start_timestamp <= created_utc_ts <= end_timestamp:
    submission_data = {
      'type': 'post',
      'id': submission.id,
      'title': submission.title,
      'text': submission.selftext,
      'score': submission.score,
      'num_comments': submission.num_comments,
      'created_utc': datetime.datetime.fromtimestamp(created_utc_ts),
      'url': submission.url,
      'author': submission.author.name if submission.author else '[deleted]',
      'parent_id': None
    }
    data.append(submission_data)

  # Fetch all comments for the post
  try:
    await submission.comments.replace_more(limit=None) # Fetching all comments (no limit)
    all_comments = await submission.comments.list()
  except Exception as e:
    print(f"Could not fetch comments for post {submission.id}: {e}")
    all_comments = []

  for comment in all_comments:
    # Make sure we have an actual comment and not a MoreComments object
    if not isinstance(comment, asyncpraw.models.MoreComments):
      comment_created_utc_ts = comment.created_utc
      # Only include comments within the date range as well
      if start_timestamp <= comment_created_utc_ts <= end_timestamp:
        comment_data = {
          'type': 'comment',
          'id': comment.id,
          'title': submission.title,
          'text': comment.body,
          'score': comment.score,
          'num_comments': 0,
          'created_utc': datetime.datetime.fromtimestamp(comment_created_utc_ts),
          'url': f"https://www.reddit.com{comment.permalink}",
          'author': comment.author.name if comment.author else '[deleted]',
          'parent_id': comment.parent_id
        }
        data.append(comment_data)

await reddit.close()

df = pd.DataFrame(data)

print(f"\nCollected {len(df[df['type'] == 'post'])} posts and {len(df[df['type'] == 'comment'])} comments within the specified date range.")


Collected 89 posts and 791 comments within the specified date range.


In [24]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment scores for a given text
def get_sentiment_scores(text):
    """Return the analyzer's polarity scores for ``text``.

    Values that are not ``str`` (e.g. NaN, None) are never passed to the
    analyzer; they get an all-zero result under the keys 'neg', 'neu',
    'pos' and 'compound' instead.
    """
    if isinstance(text, str):
        return analyzer.polarity_scores(text)
    return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}

# Score every row of the 'text' column; each result is a dict of scores
sentiment_dicts = df['text'].apply(get_sentiment_scores)

# Output column name -> key inside each score dict.
# 'compound' is the main aggregated score.
score_columns = {
    'Negative Sentiment Score': 'neg',
    'Neutral Sentiment Score': 'neu',
    'Positive Sentiment Score': 'pos',
    'Compound Sentiment Score': 'compound',
}

# Add one column per score, in the order listed above. The score dicts are kept
# in a local Series, so no intermediate column has to be dropped afterwards.
df = df.assign(**{
    column_name: sentiment_dicts.apply(lambda scores, key=score_key: scores[key])
    for column_name, score_key in score_columns.items()
})

# Categorize the compound score
def categorize_sentiment(compound_score):
    """Label a compound score as 'Positive', 'Negative' or 'Neutral'.

    A score of 0.05 or more is 'Positive', a score of -0.05 or less is
    'Negative', and anything else is 'Neutral'.
    """
    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'

# Label each row from its compound score
compound_scores = df['Compound Sentiment Score']
df['Sentiment Category'] = compound_scores.apply(categorize_sentiment)

print("Sentiment analysis complete! New sentiment columns added to DataFrame.")

# Show the first rows of the columns most useful for a quick sanity check
preview_columns = ['title', 'text', 'Compound Sentiment Score', 'Sentiment Category']
print(df[preview_columns].head())

Sentiment analysis complete! New sentiment columns added to DataFrame.
                                               title  \
0                               Partial Rent Payment   
1   New to BILT and confused (currently also moving)   
2              how to maximize bilt for rent rewards   
3  Bilt Travel Portal Down for Bookings prior to ...   
4                              New Bilt Cards coming   

                                                text  \
0  Hello,\n\nI've come to an agreement (documente...   
1  Hi, I've been interested in BILT for a while n...   
2  hi,\n\ni live in a co-op and the management co...   
3  Not sure why this portal is down...but can't a...   
4  I got this in my newsfeed today about the new ...   

   Compound Sentiment Score Sentiment Category  
0                   -0.5650           Negative  
1                    0.8689           Positive  
2                    0.7971           Positive  
3                   -0.5183           Negative  
4         

In [None]:
# Export dataframe as a CSV file

# Define the exact columns we want to include in our CSV.
selected_columns_original_names = [
    'id', 'type', 'title', 'text', 'score',
    'num_comments', 'created_utc', 'url', 'author', 'parent_id',
    'Negative Sentiment Score', 'Neutral Sentiment Score', 'Positive Sentiment Score',
    'Compound Sentiment Score', 'Sentiment Category'
]

# Create the DataFrame that will be exported to CSV, containing only selected columns.
# .copy() gives an independent frame, so the edits below never touch `df`
# and do not trigger SettingWithCopyWarning.
try:
    export_df = df[selected_columns_original_names].copy()
except KeyError as e:
    print(f"Error: One of the selected columns does not exist in your DataFrame: {e}")
    print("Please check `df.columns.tolist()` above and adjust `selected_columns_original_names` list.")
    raise  # re-raise the original KeyError with its traceback intact

# Handle potential missing text content (on the export copy only)
export_df['text'] = export_df['text'].fillna('')

# Rename columns for clarity. Only columns selected above are listed;
# 'id' and the sentiment columns keep their names.
rename_map = {
    'created_utc': 'Date Created UTC',
    'score': 'Content Score (Upvotes/Downvotes)',
    'num_comments': 'Number of Comments (for Posts)',
    'author': 'Author',
    'title': 'Post Title',
    'text': 'Content Text',
    'type': 'Content Type',
    'url': 'Content URL',
    'parent_id': 'Parent ID'
}

# Apply renaming
export_df = export_df.rename(columns=rename_map)

# Export to CSV
output_filename = f"{subreddit_name}_reddit_sentiment_data.csv"

# index=False: prevents pandas from writing the DataFrame index as a column in the CSV
# encoding='utf-8-sig': improved compatibility
# QUOTE_NONNUMERIC: quotes every field that is not purely numeric, which helps
# prevent fields from being misinterpreted due to commas and similar characters
export_df.to_csv(output_filename, index=False, encoding='utf-8-sig', quoting=csv.QUOTE_NONNUMERIC)

print("\nCSV export complete!")
print(f"\nYour data is ready for analysis at: {output_filename}")
print("\nNext steps: Open Your BI Tool of Choice and connect to this CSV file.")