In [7]:
import pandas as pd
import time
import logging
import os
import praw
import re
from prawcore.exceptions import TooManyRequests, RequestException
from dotenv import load_dotenv
from datetime import datetime

In [8]:
# Terms treated as references to Democrats: party names, politicians' names,
# nicknames, slogans and supporter labels. Multi-word entries are matched as
# plain substrings and single-word entries on word boundaries (see
# contains_keywords).
# NOTE(review): several single-word entries are everyday words ('potato',
# 'sandy', 'bartender', 'barry', 'joey', 'warren', 'brandon') and will
# presumably match many unrelated comments - confirm this is intended.
democrat_keywords = [
    # General Democrat references
    'democrat', 'dems', 'liberal', 'libs', 'left-wing', 'leftist', 'snowflake', 
    'blue wave', 'democratic party', 'progressive', 'woke', 'sjw', 'social justice warrior',

    # Joe Biden
    'biden', 'joe biden', 'sleepy joe', 'creepy joe', 'joey', 'the big guy', 'potato', 
    'senile joe', 'bidenflation', 'brandon', 'lets go brandon', 'bribem', 

    # Kamala Harris
    'kamala', 'harris', 'kamala harris', 'heels up', 'heels up harris', 'knee pads', 'cameltoe harris', 
    'kamal', 'kamalala', 'mala mala', 'cackling harris', 'veep harris', 'madam vp', 'vp harris',

    # Nancy Pelosi
    'pelosi', 'nancy pelosi', 'crazy nancy', 'auntie nancy', 'nancypants', 'nasty nancy', 'pelo-clown', 

    # Barack Obama
    'obama', 'barack obama', 'obummer', 'nobama', 'barry', 'barry soetoro',

    # Alexandria Ocasio-Cortez
    'aoc', 'alexandria ocasio-cortez', 'ocrazio', 'aocrazio', 'ocasiotard', 'aocloon', 'aoc clown', 'green new deal girl', 
    'sandy', 'bartender',

    # Bernie Sanders
    'bernie', 'bernie sanders', 'comrade sanders', 'bernout', 'feel the bern', 'grandpa socialism', 'old bernie', 'crazy bernie',

    # Elizabeth Warren
    'elizabeth warren', 'warren', 'pocahontas', 'fauxcahontas', 'lizzie warren', 'warren the warrior', 'chief warren',

    # Other prominent Democrats
    'hillary', 'hillary clinton', 'crooked hillary', 'killary', 'clinton', 'chelsea clinton', 'the clintons', 
    'adam schiff', 'schifty schiff', 'shifty schiff', 'jerry nadler', 'fat jerry',

    # Voters/supporters
    'dem voter', 'democrats supporter', 'lib voter', 'woke mob', 'sjw army', 'antifa', 'lefty', 'blm', 'black lives matter', 
    'democrat loyalist', 'progressive left', 'radical left', 'anarchist', 'feminazi', 'the squad'
]

# Normalise to lower case. Every literal above is already lower case, so this
# only guards against mixed-case entries being added later.
democrat_keywords = [keyword.lower() for keyword in democrat_keywords]

# Terms treated as references to Republicans: party names, politicians' names,
# nicknames, slogans and supporter labels. Multi-word entries are matched as
# plain substrings and single-word entries on word boundaries (see
# contains_keywords).
# NOTE(review): '45' matches any standalone "45" (ages, percentages, ...) and
# 'orange', 'donald', 'mitch', 'militia' are everyday words - confirm intended.
republican_keywords = [
    # General Republican references
    'republican', 'gop', 'right-wing', 'rightie', 'conservative', 'maga', 'red wave', 'republican party',
    'patriot', 'nationalist', 'the right', 'alt-right', 'alt right', 'chud', 'repubtard', 'repugs',

    # Donald Trump
    'trump', 'donald trump', 'donald', 'the donald', 'orange man', 'orangutan', 'orangeman bad', 'drumpf', 'trumpster',
    'trumptard', 'trumpkin', 'the cheeto', 'cheeto jesus', 'maga king', 'god emperor', 'tangerine tyrant', '45',
    # The trailing comma after 'orange' is required: without it Python joins
    # the adjacent literals into 'orangemelania' and both keywords are lost.
    'donald dump', 'trumpanzee', 'trumplethinskin', 'trumpenstein', 'orange',

    # Other Trump family members
    'melania', 'melania trump', 'ivanka', 'ivanka trump', 'eric trump', 'donald jr', 'don jr', 'tiffany trump', 'barron trump',

    # Ted Cruz
    'ted cruz', 'lyin ted', 'cruz missile', 'creepy cruz', 'texas senator',

    # Mitch McConnell
    'mitch mcconnell', 'mcconnell', 'mitch the turtle', 'mitch', 'moscow mitch', 'the turtle',

    # Ron DeSantis
    'ron desantis', 'desantis', 'deathsantis', 'florida man', 'governor desantis', 'ron the con', 'rondesantis', 'rondan',

    # Lindsey Graham
    'lindsey graham', 'lady g', 'graham cracker', 'closet graham', 'senator graham', 'miss lindsey',

    # Other prominent Republicans
    'mike pence', 'pence', 'iron mike', 'deputy dog', 'kevin mccarthy', 'sean hannity', 'tucker carlson',
    'matt gaetz', 'gaetz', 'fox news', 'gutfeld', 'tom cotton', 'josh hawley', 'lauren boebert', 'marjorie taylor greene',
    'mgt', 'green new deal reject',

    # Voters/supporters
    'maga voter', 'maga mob', 'trump voter', 'trumpist', 'patriot', 'republican loyalist', 'trump supporter',
    'deplorable', 'qanon', 'trump army', 'right-winger', 'christian conservative', 'militia', 'gun rights', '2nd amendment'
]

# Normalise to lower case and drop repeated entries ('patriot' is listed in two
# groups) while keeping the original order.
republican_keywords = list(dict.fromkeys(keyword.lower() for keyword in republican_keywords))

In [9]:
import re
# Logging setup: INFO and above is echoed to the notebook output and appended
# to the scraper's log file.
_log_file_path = r"C:\Users\34616\Documents\4GEEKS\datos_gordos\reddit\Scraping_results\final_scraper.log"
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler(_log_file_path, mode='a'),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)


# Functions to check for relevant keywords
def contains_keywords(text, keyword_list):
    """Return True if ``text`` mentions at least one entry of ``keyword_list``.

    Matching is case-insensitive on both sides. A keyword containing a space
    is treated as a phrase and matched as a plain substring; any other keyword
    must appear as a whole word (regex word boundaries), so 'trump' does not
    match 'trumpets'.

    Args:
        text: The text to inspect. Anything that is not a non-empty string
            (None, NaN, ...) is treated as containing no keywords.
        keyword_list: Iterable of keyword strings. Empty keywords are ignored.

    Returns:
        bool: True on the first keyword found, False otherwise.
    """
    if not isinstance(text, str) or not text:
        return False

    # Lower-case once instead of once per keyword.
    lowered_text = text.lower()

    for keyword in keyword_list:
        needle = keyword.lower()
        if not needle:
            # An empty pattern would match every text.
            continue

        if ' ' in needle:
            # Phrase keyword: look for the whole phrase.
            if needle in lowered_text:
                return True
        # Single-word keyword: match the whole word only, never a substring.
        elif re.search(r'\b' + re.escape(needle) + r'\b', lowered_text):
            return True

    return False

def contains_republican_keyword(text):
    """Report whether ``text`` mentions any entry of ``republican_keywords``."""
    republican_hit = contains_keywords(text, keyword_list=republican_keywords)
    return republican_hit

def contains_democrat_keyword(text):
    """Report whether ``text`` mentions any entry of ``democrat_keywords``."""
    democrat_hit = contains_keywords(text, keyword_list=democrat_keywords)
    return democrat_hit

# Lower-case phrases that mark bot / moderator boilerplate or cross-posts.
_BOT_PHRASES = (
    "i am a bot", "this bot", "automoderator",
    "bot created", "beep boop", "bleep bloop",
    "bot detected", "this is a reminder from the bots",
    "your post has been removed", "your comment has been removed",
    "moderator action", "subreddit rules",
    "thank you for your submission", "follow the subreddit rules",
    "please read our community guidelines", "crosspost", "x-post",
)

# A subreddit reference such as "r/politics" or "/r/politics". The "r" must not
# be the tail of a word, so ordinary prose like "either/or" or "his/her/their"
# is not mistaken for a reference.
_SUBREDDIT_REFERENCE = re.compile(r'(?<![a-z0-9_])r/[a-z0-9_]{2,}')

# Any http:// or https:// link.
_URL_SCHEME = re.compile(r'https?://')

# Texts shorter than this many characters are discarded.
_MIN_CONTENT_LENGTH = 20


def is_relevant_content(text):
    """Decide whether a comment body is worth keeping.

    A text is rejected when it (case-insensitively) contains a bot/moderator
    phrase, references a subreddit, contains an http(s) link, or is shorter
    than 20 characters.

    Args:
        text: The comment body. Non-string values are rejected.

    Returns:
        bool: True if the text passes every filter, False otherwise.
    """
    if not isinstance(text, str):
        return False

    text = text.lower()

    if any(phrase in text for phrase in _BOT_PHRASES):
        return False

    if _SUBREDDIT_REFERENCE.search(text):
        return False

    if _URL_SCHEME.search(text):
        return False

    # The length check also covers filler replies such as 'thanks', 'lol',
    # 'ok' and 'i agree', so no separate list is needed.
    if len(text) < _MIN_CONTENT_LENGTH:
        return False

    return True

def relabel_posts(df, label_column='labels'):
    """Label every row of ``df`` by the party keywords found in its text.

    Each value of ``df['text']`` is labelled 'Both', 'Democrat', 'Republican'
    or 'Neutral' depending on which keyword lists it matches. Rows whose text
    is not a string (for example NaN after reading a CSV) are labelled
    'Neutral' instead of raising.

    Args:
        df: DataFrame with a 'text' column. It is modified in place.
        label_column: Name of the column that receives the labels. It defaults
            to 'labels', the name this function has always written.
            NOTE(review): the scraper stores its own label in a column named
            'label'; pass ``label_column='label'`` to overwrite that column
            rather than add a second one.

    Returns:
        The same DataFrame object, with ``label_column`` set.
    """
    def _label_for(text):
        # Missing or non-text cells cannot match any keyword.
        if not isinstance(text, str):
            return 'Neutral'

        democrat_found = contains_democrat_keyword(text)
        republican_found = contains_republican_keyword(text)

        if democrat_found and republican_found:
            return 'Both'
        if democrat_found:
            return 'Democrat'
        if republican_found:
            return 'Republican'
        return 'Neutral'

    # Update the DataFrame with the new labels
    df[label_column] = [_label_for(text) for text in df['text']]
    return df

In [10]:
# Communities visited by the scraper, in this order.
subreddits = [
    'politics',
    'PoliticalDiscussion',
    'Conservative',
    'Liberal',
    'ModeratePolitics',
    'Ask_Politics',
    'Democrats',
    'Republican',
]

# Column layout of every saved record.
columns = ['text', 'submission_type', 'subreddit', 'label']

# Backoff settings read by backoff_sleep: the wait in seconds is
# initial_backoff * backoff_factor ** attempt, capped at max_backoff.
initial_backoff = 5
backoff_factor = 2
max_backoff = 300

# CSV file that save_progress appends to.
output_file = r"C:\Users\34616\Documents\4GEEKS\datos_gordos\reddit\Scraping_results\final_scraping_result.csv"

def save_progress(df_buffer):
    """Append the buffered records to the CSV at ``output_file``.

    The header row is written only when the file does not exist yet or is
    still empty, so a pre-created or truncated file still gets its header.
    Saving is best-effort: any failure is logged with its traceback and not
    re-raised, so a write error does not abort the scrape.

    Args:
        df_buffer: DataFrame of records to append. Empty frames are skipped.

    Returns:
        None.
    """
    try:
        if not df_buffer.empty:
            logging.info("Saving current progress...")
            # Checking existence alone would leave an empty file without a header.
            needs_header = (
                not os.path.isfile(output_file)
                or os.path.getsize(output_file) == 0
            )
            df_buffer.to_csv(output_file, mode='a', header=needs_header, index=False)
            logging.info(f"Saved {len(df_buffer)} records successfully.")
    except Exception as e:
        # logging.exception keeps the traceback alongside the message.
        logging.exception(f"Error saving progress: {e}")

def backoff_sleep(attempt):
    """Pause before the next retry.

    The wait is ``initial_backoff * backoff_factor ** attempt`` seconds, never
    more than ``max_backoff``; it is logged before sleeping.

    Args:
        attempt: Zero-based count of retries already made.
    """
    uncapped_delay = initial_backoff * (backoff_factor ** attempt)
    delay_seconds = uncapped_delay if uncapped_delay <= max_backoff else max_backoff
    logging.info(f"Rate limit hit. Sleeping for {delay_seconds} seconds...")
    time.sleep(delay_seconds)


In [13]:
# Reddit API client. Variables from a .env file are loaded first so the
# REDDIT_ID / REDDIT_SECRET lookups below can see them.
load_dotenv()
user_agent = "final scrape /u/speedylean"
_reddit_credentials = {
    'client_id': os.getenv('REDDIT_ID'),
    'client_secret': os.getenv('REDDIT_SECRET'),
}
reddit = praw.Reddit(user_agent=user_agent, **_reddit_credentials)


In [14]:


def _fetch_with_backoff(fetch, description, max_attempts=None):
    """Call ``fetch()`` and return its result, retrying with backoff on API errors.

    ``TooManyRequests`` and ``RequestException`` are logged and retried after
    ``backoff_sleep``; any other exception is logged and ends the attempt.

    Args:
        fetch: Zero-argument callable performing the request.
        description: Text used in log messages, e.g. "comments".
        max_attempts: Maximum number of calls, or None to retry indefinitely.

    Returns:
        The value returned by ``fetch()``, or None if it could not be obtained.
    """
    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        try:
            return fetch()
        except TooManyRequests as e:
            logging.warning(f"Rate limit hit when fetching {description}: {e}")
        except RequestException as e:
            logging.error(f"Request exception when fetching {description}: {e}")
        except Exception as e:
            logging.error(f"Unhandled exception when fetching {description}: {e}")
            return None
        backoff_sleep(attempt)
        attempt += 1

    logging.error(f"Giving up fetching {description} after {max_attempts} attempts")
    return None


def _expand_comments(post):
    """Resolve every "load more comments" stub of ``post`` and return its comments as a flat list."""
    post.comments.replace_more(limit=None)
    return post.comments.list()


def scrape_subreddit(subreddit_name, limit=None, max_attempts=None):
    """Scrape the comments of a subreddit's hot posts and save the labelled ones.

    Comments that fail ``is_relevant_content`` or match neither keyword list
    are skipped. A comment matching the Democrat keywords is labelled
    'Democrat' even if it also matches Republican keywords; otherwise a
    Republican match is labelled 'Republican'. Records are flushed through
    ``save_progress`` every 1000 records and once more at the end.

    If the post listing cannot be fetched the function logs the error and
    returns without scraping. If one post's comments cannot be fetched, that
    post is skipped, so a failed fetch never reuses another post's comments.

    Args:
        subreddit_name: Name of the subreddit, without the "r/" prefix.
        limit: Maximum number of hot posts to request (None lets PRAW decide).
        max_attempts: Maximum tries per API request before giving up; None
            (the default) retries indefinitely.

    Returns:
        None.
    """
    save_threshold = 1000
    buffer = []

    subreddit = reddit.subreddit(subreddit_name)
    logging.info(f"Now scraping r/{subreddit_name}")

    # Fetching posts with error handling
    posts = _fetch_with_backoff(
        lambda: list(subreddit.hot(limit=limit)),
        f"posts from r/{subreddit_name}",
        max_attempts,
    )
    if posts is None:
        # Nothing to iterate over; the failure has already been logged.
        return
    logging.info(f"Fetched {len(posts)} posts from r/{subreddit_name}")

    for post in posts:
        comments = _fetch_with_backoff(
            lambda: _expand_comments(post), "comments", max_attempts
        )
        if comments is None:
            # Skip this post instead of re-processing stale comments.
            continue

        # Processing comments
        for comment in comments:
            body = comment.body
            if not is_relevant_content(body):
                continue

            if contains_democrat_keyword(body):
                label = 'Democrat'
            elif contains_republican_keyword(body):
                label = 'Republican'
            else:
                continue

            buffer.append({
                'text': body,
                'submission_type': 'comment',
                'subreddit': subreddit_name,
                'label': label
            })

            # Save progress every save_threshold records
            if len(buffer) >= save_threshold:
                save_progress(pd.DataFrame(buffer, columns=columns))
                buffer = []

    # Save remaining records
    if buffer:
        save_progress(pd.DataFrame(buffer, columns=columns))

In [12]:
def main(subreddits, limit=10000):
    """Scrape each subreddit in turn, logging progress.

    Args:
        subreddits: Iterable of subreddit names (without the "r/" prefix).
        limit: Maximum number of hot posts requested per subreddit, passed
            through to ``scrape_subreddit``. Defaults to 10000.
            NOTE(review): the logged run fetched 810 posts from r/politics
            with this default, so the effective ceiling appears to come from
            Reddit rather than from this value.
    """
    for subreddit_name in subreddits:
        scrape_subreddit(subreddit_name, limit=limit)
        logging.info(f"Finished scraping r/{subreddit_name}")

    logging.info("Scraping complete.")


# Run the scraper over every configured subreddit. This is long-running and
# needs the `reddit` client and the scraping functions to be defined first.
main(subreddits)

2024-09-12 19:06:08,627 - INFO - Now scraping r/politics
2024-09-12 19:06:17,901 - INFO - Fetched 810 posts from r/politics
