In [1]:
import praw
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import csv
import json
from datetime import datetime, timedelta
import time
import os
import pickle  # For saving and loading the persistent count

In [2]:
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/pky.macpro/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
reddit = praw.Reddit(
    client_id='gxUTrYGyJ8Y2NntR5kOZQA',
    client_secret='dlsKv6hx2KUOvRvZzvxroSOmzOE84g',
    user_agent='sentiment analysis for upcoming election'
)

In [4]:
save_directory = "../DBdata/"
count_file_path = os.path.join(save_directory, "count.pkl")
NUM_DATABASES = 4

In [5]:
def load_or_initialize_count():
    today = datetime.now().strftime('%m%d%Y')
    if os.path.exists(count_file_path):
        with open(count_file_path, 'rb') as file:
            last_date, count = pickle.load(file)
            if last_date == today:
                return count
    return 1  # Reset or initialize

In [6]:
def update_count(count):
    today = datetime.now().strftime('%m%d%Y')
    with open(count_file_path, 'wb') as file:
        pickle.dump((today, count), file)

In [7]:
def hash_index(text):
    return sum(ord(c) for c in text) % (NUM_DATABASES - 1)

In [8]:
def analyze_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']

In [9]:
subreddit_name = 'politics'
keywords = 'Biden'
max_comments = 5
start_time = datetime.now()
time_limit = timedelta(minutes=3)
count = load_or_initialize_count()
submissions = {'submission': {}}

In [10]:
subreddit = reddit.subreddit(subreddit_name)

In [11]:
for submission in subreddit.hot(limit=100):  # Adjust the limit as necessary
    if any(keyword.lower() in submission.title.lower() for keyword in keywords):
        submission.comments.replace_more(limit=0)  # Load all comments
        for comment in submission.comments.list():
            if len(submissions['submission']) >= max_comments or datetime.now() - start_time > time_limit:
                print(f"Extraction ended. {len(submissions)} comments extracted.")
                print("Now executing the data save process.")
                update_count(count)
                break
            if comment.id not in submissions['submission']:
                submissions['submission'][comment.id] = {
                    'secondary_data_id': datetime.now().strftime('%m%d%Y') + f'_{len(submissions) + 1:03}',
                    'body': comment.body,
                    'title': submission.title,
                    'subreddit': subreddit_name,
                    'upvotes': comment.ups,
                    'downvotes': comment.downs,
                    'timestamp': comment.created_utc,
                    'permalink': f'https://reddit.com{comment.permalink}',
                    'sentiment_score': analyze_sentiment(comment.body),
                    'hash_index': hash_index(comment.body)
                }
                count += 1
                

Extraction ended. 1 comments extracted.
Now executing the data save process.
Extraction ended. 1 comments extracted.
Now executing the data save process.
Extraction ended. 1 comments extracted.
Now executing the data save process.
Extraction ended. 1 comments extracted.
Now executing the data save process.
Extraction ended. 1 comments extracted.
Now executing the data save process.
Extraction ended. 1 comments extracted.
Now executing the data save process.


KeyboardInterrupt: 

In [12]:
print(f"Extraction ended. {len(submissions['submission'])} comments extracted.")
print("Now executing the data save process.")
update_count(count)

Extraction ended. 5 comments extracted.
Now executing the data save process.


In [13]:
submissions

{'submission': {'kzs009m': {'secondary_data_id': '04162024_002',
   'body': 'Iowa Governor Kim Reynolds, a woman so vile she makes Dolores Umbridge look sweet and kind, announced a $900,000 grant to feed impoverished children over the summer. A grant that now every district in the state must compete for. This is after she turned down $29 million in federal funds to do the same thing.',
   'title': 'The "What happened in your state last week?" Megathread, Week 15',
   'subreddit': 'politics',
   'upvotes': 17,
   'downvotes': 0,
   'timestamp': 1713236407.0,
   'permalink': 'https://reddit.com/r/politics/comments/1c4yluj/the_what_happened_in_your_state_last_week/kzs009m/',
   'sentiment_score': 0.632,
   'hash_index': 0},
  'kztdzzc': {'secondary_data_id': '04162024_002',
   'body': "**Maryland**\n\nBaltimore Mayor Brandon Scott and State’s Attorney Ivan Bates [are beefing,](https://www.thebaltimorebanner.com/politics-power/local-government/brandon-scott-ivan-bates-feud-7ODZNHXFGJCPBNTN

In [13]:
len(submissions['submission'])

5

In [14]:
databases = {i: {} for i in range(NUM_DATABASES)}

In [None]:
for id, data in submissions['submission'].items():
    index = data['hash_index']
    databases[index][id] = data

In [None]:
databases

In [None]:
for index, data_dict in databases.items():
    json_path = os.path.join(save_directory, f'database{index}.json')
    csv_path = os.path.join(save_directory, f'database{index}.csv')

    if os.path.exists(json_path):
        with open(json_path, 'r+') as file:
            existing_data = json.load(file)
            existing_data.update(submissions)
            file.seek(0)
            file.truncate()
            json.dump(existing_data, file, indent=4)
    else:
        with open(json_path, 'w') as file:
            json.dump(data_dict, file, indent=4)

    if data_dict:
        keys = data_dict[next(iter(data_dict))].keys()  # Get keys from the first element
        if os.path.exists(csv_path):
            with open(csv_path, 'a', newline='') as file:
                writer = csv.DictWriter(file, fieldnames=keys)
                for data in data_dict.values():
                    writer.writerow(data)
        else:
            with open(csv_path, 'w', newline='') as file:
                writer = csv.DictWriter(file, fieldnames=keys)
                writer.writeheader()
                for data in data_dict.values():
                    writer.writerow(data)


In [14]:
for id, data_dict in submissions['submission'].items():
    index = data_dict['hash_index']
    json_path = os.path.join(save_directory, f'database{index}.json')

    if os.path.exists(json_path):
        with open(json_path, 'r+') as file:
            existing_data = json.load(file)
            existing_data.update(submissions)
            file.seek(0)
            file.truncate()
            json.dump(existing_data, file, indent=4)
    else:
        with open(json_path, 'w') as file:
            json.dump(submissions, file, indent=4)

print(f"Data saving completed. Total {len(submissions['submission'])} comments saved across databases.")


Data saving completed. Total 5 comments saved across databases.


In [None]:
len(submissions['submission'])

In [None]:
for id, data in submissions['submission'].items():
    index = data['hash_index']
    databases[index].append(data)
    databases[NUM_DATABASES - 1].append(data)  # Replicated data

In [None]:
databases.items()

In [None]:
for index, data in databases.items():
    json_path = os.path.join(save_directory, f'database{index}.json')
    csv_path = os.path.join(save_directory, f'database{index}.csv')

    if os.path.exists(json_path):
        with open(json_path, 'r+') as file:
            existing_data = json.load(file)
            existing_data.extend(data)
            file.seek(0)
            file.truncate()
            json.dump(existing_data, file, indent=4)
    else:
        with open(json_path, 'w') as file:
            json.dump(data, file, indent=4)

    # Write CSV
    keys = data[0].keys() if data else []
    if os.path.exists(csv_path):
        with open(csv_path, 'a', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=keys)
            writer.writerows(data)
    else:
        with open(csv_path, 'w', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=keys)
            writer.writeheader()
            writer.writerows(data)

print(f"Data saving completed. Total {len(submissions['submission'])} comments saved across databases.")

In [None]:
if __name__ == "__main__":
    subreddits = input("Would you want to specify the subreddit?: ").split(',')
    keywords = input("Any keywords for searching?: ").split(',')
    max_comments = int(input("How many searches do I execute? Please enter number only.:"))
    print(f"Search for {max_comments} reddit comments.")
    comments = fetch_comments(subreddits, keywords, max_comments)
    save_to_files(comments)
    print(f"Operation completed. {len(comments)} comments were found and processed.")
