In [9]:
import praw
import datetime
from datetime import datetime, timezone
import time
import pandas as pd
import redditwarp.SYNC
from praw.models import MoreComments
from tqdm import tqdm
from dotenv import load_dotenv
import os

In [10]:
# Read the Reddit API credentials: load_dotenv pulls the entries of the local
# 'reddit.env' file into the process environment, then each key is looked up
# (a key that is not set yields None). The 'USER_AGEND' spelling is kept as-is
# because both the environment key and the cell that builds the PRAW client use it.
load_dotenv('reddit.env')
TOKEN, SECRET, USER_AGEND = (
    os.getenv(key) for key in ('TOKEN', 'SECRET', 'USER_AGEND')
)


In [11]:
# redditwarp client, constructed without arguments
client = redditwarp.SYNC.Client()

# PRAW client, configured with the credentials read from the environment
reddit = praw.Reddit(user_agent=USER_AGEND, client_secret=SECRET, client_id=TOKEN)

# Explicitly flag the PRAW client as read-only
reddit.read_only = True

In [12]:
def get_utc_timestamp(date_str: str) -> int:
    """
    Return the Unix timestamp of midnight UTC on the given calendar date.

    Args:
        date_str (str): The date in 'YYYY-MM-DD' format.

    Returns:
        int: Whole seconds elapsed between 1970-01-01 00:00:00 UTC and
             00:00:00 UTC on ``date_str``.

    Raises:
        ValueError: If ``date_str`` does not match the '%Y-%m-%d' format.
    """
    # strptime yields a naive datetime at 00:00:00; tagging it as UTC (rather
    # than leaving it naive) keeps the result independent of the machine's
    # local timezone.
    midnight_utc = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    return int(midnight_utc.timestamp())




In [13]:
def get_submissions(subreddits:list, search_words:list, start_date:int, end_date:int,
                    max_per_word:int=25, min_score:int=10)->pd.DataFrame:
    """
    Searches subreddits for submissions matching the given search words and keeps those
    created inside a date range.

    Args:
        subreddits (list): A list of subreddit names (strings) to search in.
        search_words (list): A list of search words (strings); each one is searched in each subreddit.
        start_date (int): Inclusive lower bound as a Unix timestamp (seconds since the epoch, UTC).
        end_date (int): Inclusive upper bound as a Unix timestamp (seconds since the epoch, UTC).
        max_per_word (int): Maximum number of in-range submissions collected per
                            (subreddit, search word) pair. Defaults to 25.
        min_score (int): Only submissions with a score strictly greater than this value
                         are returned. Defaults to 10.

    Returns:
        pd.DataFrame: One row per unique submission, ordered by descending score, with the columns:
                      - 'id': The submission ID.
                      - 'date': The creation time in UTC as a 'YYYY-MM-DD HH:MM:SS' string.
                      - 'title': The submission's title.
                      - 'score': The submission's score.
                      - 'subreddit': The subreddit name that was searched (as passed in).
                      - 'topic': The search word that found the submission.
                      If nothing matches, an empty DataFrame with these columns is returned.

    Notes:
        - The search is issued with PRAW's default arguments, so the number of candidates examined
          per search word is bounded by PRAW's default listing limit (presumably 100 -- verify
          against the installed PRAW version).
        - A submission found through several search words is kept once, labelled with the topic of
          the row that sorts first by score.
        - The function sleeps for one second after each accepted submission (not after rejected
          ones) to stay clear of rate limits.
    """

    columns = ['id', 'date', 'title', 'score', 'subreddit', 'topic']
    result = []

    for subreddit in tqdm(subreddits, desc='Get Subreddits:'):
        for word in search_words:
            accepted = 0
            for submission in reddit.subreddit(subreddit).search(word):
                created = submission.created_utc
                if not (start_date <= created <= end_date):
                    continue

                # created_utc is a UTC epoch value; format it in UTC so the stored
                # date does not depend on the timezone of the machine running this.
                created_at = datetime.fromtimestamp(created, tz=timezone.utc)

                result.append({
                    'id': submission.id,
                    'date': created_at.strftime('%Y-%m-%d %H:%M:%S'),
                    'title': submission.title,
                    'score': submission.score,
                    'subreddit': subreddit,
                    'topic': word,
                })
                accepted += 1

                # Stop once enough submissions were collected for this search word
                if accepted >= max_per_word:
                    break
                time.sleep(1)

    # Without this guard, sort_values(by='score') raises KeyError on a frame
    # that was built from an empty list and therefore has no columns.
    if not result:
        return pd.DataFrame(columns=columns)

    df_result = (
        pd.DataFrame(result, columns=columns)
        .sort_values(by='score', ascending=False)
        .drop_duplicates(subset='id', keep='first')
    )

    # Keep only submissions whose score exceeds the threshold
    return df_result[df_result['score'] > min_score]


---

In [14]:
def get_comments(df: pd.DataFrame, max_comments: int = 50)->pd.DataFrame:
    """
    Fetches top-level comments for each post ID in the provided DataFrame and returns
    them alongside the post ID.

    Args:
        df (pd.DataFrame): A DataFrame that contains at least an 'id' column with the post IDs.
        max_comments (int): How many of the first top-level comment nodes (in the order returned
                            for sort='top') are inspected per post. Defaults to 50.

    Returns:
        pd.DataFrame: A new DataFrame with two columns:
                    - 'id': The post ID.
                    - 'comments': A list with the bodies of the inspected comments. Comments whose
                        body is one of the placeholders '[deleted]', '[removed]' or
                        'Removed by Reddit' are skipped, so the list may hold fewer than
                        ``max_comments`` entries.
                    Posts whose fetch failed are absent from the result. The two columns are
                    present even when no post could be fetched.

    Errors raised while fetching or reading a post's comments are printed and the post is
    skipped; they are not propagated to the caller.
    """

    placeholders = ['[deleted]', '[removed]', 'Removed by Reddit']
    results = []

    for post_id in tqdm(df['id'], desc='Get comments', total=len(df['id'])):
        try:
            tree_node = client.p.comment_tree.fetch(post_id, sort='top', limit=100)
            children = tree_node.children

            comments_data = []
            # Only the first `max_comments` top-level nodes are looked at
            for position in range(min(max_comments, len(children))):
                body = children[position].value.body
                if body not in placeholders:
                    comments_data.append(body)

            results.append({'id': post_id, 'comments': comments_data})

        except TimeoutError:
            print("TimeoutError")

        except Exception as e:
            # Best effort: report the problem and move on to the next post
            print(f"Error: {e}")

    # Naming the columns keeps 'id' available for a later merge even when
    # `results` is empty (empty input, or every fetch failed).
    df_comments = pd.DataFrame(results, columns=['id', 'comments'])

    return df_comments


In [15]:
def clean_comments(comments: list) -> list:
    """
    Drops '**User Report**' comments and flattens the remaining ones onto a single line.

    Args:
        comments (list): A list of comments (strings) to clean.

    Returns:
        list: The comments that do not contain the marker '**User Report**', in their
              original order, with every new line character replaced by a space and
              leading/trailing whitespace stripped. Letter case and whitespace inside
              the text are left untouched.
    """
    return [
        text.replace('\n', ' ').strip()
        for text in comments
        if '**User Report**' not in text
    ]

In [16]:
# Collection window (both bounds inclusive) as midnight-UTC Unix timestamps
start_date = get_utc_timestamp('2017-01-01')
end_date = get_utc_timestamp('2024-10-15')

# Company names used as search terms. Names containing umlauts are listed a
# second time in their ASCII transliteration at the end of the list.
dax40_companies = [
    "Adidas", "Airbus", "Allianz", "BASF", "Bayer", "Beiersdorf", 
    "BMW", "Brenntag", "Commerzbank", "Continental", "Covestro", 
    "Daimler Truck", "Deutsche Bank", "Deutsche Börse", "Deutsche Post", 
    "Deutsche Telekom", "E.ON", "Fresenius", "Fresenius Medical Care", 
    "Hannover Rück", "HeidelbergCement", "Henkel", "Infineon", "Linde", 
    "Mercedes-Benz Group", "Merck", "MTU Aero Engines", "Münchener Rück", 
    "Porsche AG", "Puma", "Qiagen", "Rheinmetall", "RWE", "SAP", "Sartorius", 
    "Siemens", "Siemens Healthineers", "Symrise", "Volkswagen", "Zalando", 'Muenchener Rueck', 'Deutsche Boerse', 'Hannover Rueck'
]

# Alternative: restrict the search to a hand-picked list of subreddits
#subreddits = ['Aktien', 'news', 'worldnews', 'stocks', 'wallstreetbets', 'finance', 'germany', 'market', 'stockmarket', 'investing', 'europe', 'economy', 'business', 'trading', 'phinvest']
subreddits = ['All']

# Index names plus company names; the bare term 'DAX' is not part of the list
search_word = ['DAX40', 'DAX30'] + dax40_companies


df_submissions = get_submissions(subreddits, search_word, start_date, end_date)
df_comments = get_comments(df_submissions)

# Inner join on the post ID: submissions without a comments row are dropped
df_merged = pd.merge(df_submissions, df_comments, on='id').sort_values(by='date', ascending=False)
df_merged['comments'] = df_merged['comments'].apply(clean_comments)
display(df_merged.head(10))
print(df_merged.shape)
# This code was running for 2 hours and 30 min

Get Subreddits:: 100%|██████████| 1/1 [18:44<00:00, 1124.75s/it]
Get comments: 100%|██████████| 649/649 [09:19<00:00,  1.16it/s]


Unnamed: 0,id,date,title,score,subreddit,topic,comments
576,1g3bvom,2024-10-14 11:33:27,Salarii Continental internship,32,All,Continental,[Wow ai expus o situatie ca in orice corporati...
337,1g3903i,2024-10-14 07:33:26,Income-Allianz deal off; government assesses i...,1321,All,Allianz,[These are the people who cared enough to voic...
89,1g2iikt,2024-10-13 07:15:12,I walked from Mexico to Canada on the Continen...,14769,All,Continental,"[Would love to know a more detailed story, foo..."
88,1g26mn5,2024-10-12 20:33:48,Nature’s Harvest: Capturing the Flow of Rubber...,15271,All,SAP,[What's the purpose of the first vertical cut?...
509,1g1gs54,2024-10-11 20:35:51,First Offer! 2025 Grad at Deutsche Bank,107,All,Deutsche Bank,[I keep trying to spread the word about bank d...
314,1g1ekwq,2024-10-11 18:59:25,Rheinmetall CEO Says Arms Boom Is the Biggest ...,1686,All,Rheinmetall,"[The biggest he's ever seen, so far, every non..."
534,1g19p5w,2024-10-11 15:23:26,Sartorius layoffs,79,All,Sartorius,"[their instruments break all the damn time, I ..."
174,1g0y5th,2024-10-11 03:07:03,Thrifted Adidas and Nike Hoodies I combine,6271,All,Adidas,[Join our Discord here: https://discord.gg/6aa...
296,1g0mhd0,2024-10-10 18:12:40,Deutsche Post erhält Auszeichnung als beste Po...,1975,All,Deutsche Post,[EIne Preisverleihungsgala des Weltpostvereins...
632,1g0ew6u,2024-10-10 11:31:06,Rheiner im Sale 🔥,14,All,Rheinmetall,"[Rheiner macht halt Rheiner Sachen😁 , ist völl..."


(649, 7)


In [17]:
# Persist the merged submissions and comments; the DataFrame index is written
# out as the first (unnamed) CSV column.
df_merged.to_csv(path_or_buf='reddit_all_data.csv', index=True)