# In [None]:
import time
import logging
import os
import praw
import pandas as pd
import re
from prawcore.exceptions import TooManyRequests, RequestException
from dotenv import load_dotenv
from datetime import datetime

# Trump filters: names, nicknames and slang terms treated as a reference to
# Donald Trump. Every entry is lower-cased so it can be compared against
# lower-cased comment text.
trump_keywords = list(map(str.lower, [
    'donald trump',
    'trump',
    'donald',
    'donaldtrump',
    'orange',
    'duck',
    'donaldduck',
    'donald duck',
    'donnybaby',
    'donnyboy',
    'donnybrook',
    'trumpinator',
    'trumpamaniac',
    'trumpastrophie',
    'trumpocalypse',
    'trumpenstein',
    'trumpletoes',
    'tricky trump',
    'pumpkin',
    'corn',
    'president trump',
    'drumpf',
    'the donald',
    'mr. trump',
    'potus',
]))

# Trump-ally filters: each row lists one public figure as
# 'first last', 'last', 'first' and (where used) the concatenated 'firstlast'.
trumpers_keywords = [
    'mike pence', 'pence', 'mike', 'mikepence',
    'ron desantis', 'desantis', 'ron', 'rondesantis',
    'marjorie taylor greene', 'mtg', 'greene', 'marjorie',
    'lauren boebert', 'boebert', 'lauren',
    'ted cruz', 'cruz', 'ted', 'tedcruz',
    # The last entry used to be a second 'mcconnell'; every other row ends
    # with the concatenated full name, so the duplicate was a typo.
    'mitch mcconnell', 'mcconnell', 'mitch', 'mitchmcconnell',
    'lindsey graham', 'graham', 'lindsey', 'lindseygraham',
    'kevin mccarthy', 'mccarthy', 'kevin', 'kevinmccarthy',
    'rudy giuliani', 'giuliani', 'rudy', 'rudygiuliani',
    'steve bannon', 'bannon', 'steve', 'stevebannon',
    'michael flynn', 'flynn', 'michael', 'michaelflynn',
    'roger stone', 'roger', 'stone', 'rogerstone',
    'matt gaetz', 'gaetz', 'matt', 'mattgaetz',
    'sean hannity', 'hannity', 'sean', 'seanhannity',
]
# Normalise to lower case so entries can be compared with lower-cased text.
trumpers_keywords = [keyword.lower() for keyword in trumpers_keywords]

# Kamala filters: names and nicknames treated as a reference to Kamala
# Harris. Some entries are written in mixed case; all of them are lower-cased
# below so they can be compared against lower-cased comment text.
kamala_keywords = list(map(str.lower, [
    'kamala harris',
    'kamala',
    'harris',
    'kamalaharris',
    'vice president harris',
    'vp harris',
    'kammie',
    'kammy',
    'kamalalal',
    'kamalita',
    'mrs. harris',
    'ms. harris',
    'Comrade Kamala',
    'Crazy Kamala',
    'Laffin Kamala',
    'Lying Kamala Harris',
    'Kamabla',
]))

# Harris-ally filters: names (full, last, first, concatenated) of public
# figures grouped under the 'Kamalers' label. Lower-cased on construction so
# entries can be compared against lower-cased comment text.
kamalers_keywords = list(map(str.lower, [
    'nancy pelosi', 'pelosi', 'nancy', 'nancypelosi',
    'chuck schumer', 'schumer', 'chuck', 'chuckschumer',
    'elizabeth warren', 'warren', 'elizabeth', 'elizabethwarren',
    'bernie sanders', 'sanders', 'bernie', 'berniesanders',
    'aoc', 'alexandria ocasio-cortez', 'ocasio-cortez', 'alexandria',
    'pete buttigieg', 'buttigieg', 'pete', 'petebuttigieg',
    'gavin newsom', 'newsom', 'gavin', 'gavinnewsom',
    'cory booker', 'booker', 'cory', 'corybooker',
    'stacey abrams', 'abrams', 'stacey', 'staceyabrams',
    'hillary', 'michelle obama', 'michelle', 'michelleobama',
    'keisha lance bottoms', 'keisha', 'lance bottoms', 'keisha lance',
]))

# Logging configuration: INFO-level records are sent both to the console and
# to a log file opened in append mode, using one shared timestamped format.
_console_handler = logging.StreamHandler()
_file_handler = logging.FileHandler(
    r"C:\Users\34616\Documents\4GEEKS\datos_gordos\reddit\Scraping_results\all_hotscraperv2.log",
    mode='a',
)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[_console_handler, _file_handler],
)

# Initializing Reddit API.
# load_dotenv() runs first so that the environment lookups below can see
# whatever it loaded (presumably from a local .env file).
load_dotenv()
user_agent = "political opinion 2.0 by /u/speedylean"
reddit = praw.Reddit(
    user_agent=user_agent,
    client_id=os.environ.get('REDDIT_ID'),
    client_secret=os.environ.get('REDDIT_SECRET'),
)

# Functions to check for relevant keywords
def _contains_any_keyword(text, keywords):
    """Return True if *text* contains any of *keywords* as a whole word/phrase.

    Matching is case-insensitive (the text is lower-cased; the keyword lists
    are already lower-case) and anchored on word boundaries. Plain substring
    matching produced large numbers of false positives with short keywords:
    'ted' matched "voted", 'ron' matched "wrong", 'corn' matched "corner".
    Word boundaries still accept punctuation-adjacent uses such as
    "trump's" or "(pence)".

    Args:
        text: Comment text to inspect.
        keywords: Iterable of lower-case keywords or multi-word phrases.

    Returns:
        bool: True when at least one keyword occurs in *text*.
    """
    lowered = text.lower()
    # re.escape keeps characters such as '.' and '-' in keywords literal.
    return any(
        re.search(rf'\b{re.escape(keyword)}\b', lowered) is not None
        for keyword in keywords
    )


def contains_trump_keyword(text):
    """Return True if *text* mentions any entry of ``trump_keywords``."""
    return _contains_any_keyword(text, trump_keywords)


def contains_trumper_keyword(text):
    """Return True if *text* mentions any entry of ``trumpers_keywords``."""
    return _contains_any_keyword(text, trumpers_keywords)


def contains_kamala_keyword(text):
    """Return True if *text* mentions any entry of ``kamala_keywords``."""
    return _contains_any_keyword(text, kamala_keywords)


def contains_kamaler_keyword(text):
    """Return True if *text* mentions any entry of ``kamalers_keywords``."""
    return _contains_any_keyword(text, kamalers_keywords)

# Lower-case phrases that mark a comment as bot output, moderation
# boilerplate or a cross-post notice. Built once instead of on every call.
_BOT_PHRASES = (
    "i am a bot", "this bot", "automoderator",
    "bot created", "beep boop", "bleep bloop",
    "bot detected", "this is a reminder from the bots",
    "your post has been removed", "your comment has been removed",
    "moderator action", "subreddit rules",
    "thank you for your submission", "follow the subreddit rules",
    "please read our community guidelines", "crosspost", "x-post",
)

# Subreddit reference such as "r/politics" or "/r/politics". The leading word
# boundary keeps ordinary slashed words ("her/his", "for/against") from being
# mistaken for a reference, which the former plain "r/" substring test did.
_SUBREDDIT_REF = re.compile(r'\br/\w')

# Any http:// or https:// link.
_URL = re.compile(r'http[s]?://')

# Comments shorter than this (ignoring surrounding whitespace) are dropped.
_MIN_CONTENT_LENGTH = 20


def is_relevant_content(text):
    """Decide whether a comment body is worth keeping for the dataset.

    A comment is rejected when it contains a bot/moderation phrase, refers to
    a subreddit, contains a URL, or is shorter than 20 characters once
    surrounding whitespace is stripped.

    Args:
        text: Raw comment body.

    Returns:
        bool: True if the comment passes every filter, False otherwise.
    """
    text = text.lower()

    if any(phrase in text for phrase in _BOT_PHRASES):
        return False

    if _SUBREDDIT_REF.search(text):
        return False

    if _URL.search(text):
        return False

    # The old explicit list ('thanks', 'lol', 'ok', 'i agree') was dead code:
    # every one of those strings is already below the minimum length.
    if len(text.strip()) < _MIN_CONTENT_LENGTH:
        return False

    return True

# Subreddits to scrape, visited in this order.
subreddits = [
    'politics',
    'PoliticalDiscussion',
    'Conservative',
    'Liberal',
    'ModeratePolitics',
    'Ask_Politics',
    'Democrats',
    'Republican',
]

# Column order of the output DataFrame / CSV.
columns = [
    'text',
    'submission_type',
    'subreddit',
    'label',
]

# Rate-limit backoff: first wait (seconds), cap on any single wait (seconds)
# and the multiplier applied per retry.
initial_backoff, max_backoff, backoff_factor = 5, 300, 2

# Output file; the name embeds the start time (YYYYMMDD_HHMMSS) of this run.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
output_file = rf"C:\Users\34616\Documents\4GEEKS\datos_gordos\reddit\Scraping_results\all_hotscrape_v2p10000_{timestamp}.csv"

def save_progress(df_buffer):
    """Append the buffered records to the CSV at ``output_file``.

    An empty DataFrame is ignored. The header row is written only when the
    output file does not exist yet, so repeated calls build one CSV. Any
    exception raised along the way is logged and not propagated.

    Args:
        df_buffer: pandas DataFrame holding the records to append.
    """
    try:
        if df_buffer.empty:
            return
        logging.info("Saving current progress...")
        # First write creates the file and therefore needs the header row.
        write_header = not os.path.isfile(output_file)
        df_buffer.to_csv(output_file, mode='a', header=write_header, index=False)
        logging.info(f"Saved {len(df_buffer)} records successfully.")
    except Exception as e:
        logging.error(f"Error saving progress: {e}")

def backoff_sleep(attempt):
    """Sleep for an exponentially growing delay, capped at ``max_backoff``.

    The delay is ``initial_backoff * backoff_factor ** attempt`` seconds,
    limited to ``max_backoff``.

    Args:
        attempt: Zero-based retry counter; higher values wait longer.
    """
    delay = initial_backoff * (backoff_factor ** attempt)
    if delay > max_backoff:
        delay = max_backoff
    sleep_time = delay
    logging.info(f"Rate limit hit. Sleeping for {sleep_time} seconds...")
    time.sleep(sleep_time)

def _fetch_with_backoff(action, fetch, *args):
    """Call ``fetch(*args)``, retrying with backoff on transient API errors.

    ``TooManyRequests`` and ``RequestException`` are treated as transient: the
    error is logged, ``backoff_sleep`` is called and the request is retried.
    There is no upper bound on the number of retries. Any other exception is
    logged and ends the attempt.

    Args:
        action: Phrase inserted into the log messages, e.g. "fetching comments".
        fetch: Callable that performs the request.
        *args: Positional arguments forwarded to ``fetch``.

    Returns:
        Whatever ``fetch`` returns, or ``None`` if an unexpected exception
        made the fetch fail.
    """
    attempt = 0
    while True:
        try:
            return fetch(*args)
        except TooManyRequests as e:
            logging.warning(f"Rate limit hit when {action}: {e}")
        except RequestException as e:
            logging.error(f"Request exception when {action}: {e}")
        except Exception as e:
            logging.error(f"Unhandled exception when {action}: {e}")
            return None
        # Only the two transient handlers fall through to here.
        backoff_sleep(attempt)
        attempt += 1


def _expand_comments(post):
    """Expand the comment forest of *post* and return it as a flat list."""
    post.comments.replace_more(limit=None)
    return post.comments.list()


def _label_for(text):
    """Return the first matching label for *text*, or None if nothing matches.

    Checks run in priority order: Trump, Trumpers, Kamala, Kamalers.
    """
    if contains_trump_keyword(text):
        return 'Trump'
    if contains_trumper_keyword(text):
        return 'Trumpers'
    if contains_kamala_keyword(text):
        return 'Kamala'
    if contains_kamaler_keyword(text):
        return 'Kamalers'
    return None


def scrape_subreddit(subreddit_name, limit=10000):
    """Scrape labelled comments from the hot posts of one subreddit.

    For every hot post, all comments are loaded, filtered with
    ``is_relevant_content`` and labelled by keyword. Matching comments are
    buffered and flushed to disk through ``save_progress`` every 1000 records
    and once more at the end.

    A fetch that fails with an unexpected exception is now skipped cleanly.
    Previously a failed post listing left ``posts`` unbound, which crashed the
    loop below, and a failed comment fetch either crashed the same way or
    silently re-processed the previous post's comments, duplicating records.

    Args:
        subreddit_name: Name of the subreddit, without the "r/" prefix.
        limit: Maximum number of hot posts requested from the listing.
    """
    save_threshold = 1000
    buffer = []

    subreddit = reddit.subreddit(subreddit_name)
    logging.info(f"Now scraping r/{subreddit_name}")

    # Fetching posts with error handling
    posts = _fetch_with_backoff(
        f"fetching posts from r/{subreddit_name}",
        lambda: list(subreddit.hot(limit=limit)),
    )
    if posts is None:
        # The failure has already been logged; nothing to scrape here.
        return
    logging.info(f"Fetched {len(posts)} posts from r/{subreddit_name}")

    for post in posts:
        comments = _fetch_with_backoff("fetching comments", _expand_comments, post)
        if comments is None:
            # Skip this post instead of reusing the previous post's comments.
            continue

        # Processing comments
        for comment in comments:
            body = comment.body
            if not is_relevant_content(body):
                continue

            label = _label_for(body)
            if label is None:
                continue

            buffer.append({
                'text': body,
                'submission_type': 'comment',
                'subreddit': subreddit_name,
                'label': label
            })

            # Save progress every save_threshold records
            if len(buffer) >= save_threshold:
                save_progress(pd.DataFrame(buffer, columns=columns))
                buffer = []

    # Save remaining records
    if buffer:
        save_progress(pd.DataFrame(buffer, columns=columns))

def main(subreddits):
    """Scrape each subreddit in turn, logging after each one and at the end.

    Args:
        subreddits: Iterable of subreddit names to pass to ``scrape_subreddit``.
    """
    for name in subreddits:
        scrape_subreddit(name, limit=10000)
        logging.info(f"Finished scraping r/{name}")

    logging.info("Scraping complete.")


# Start the scrape only when this code is executed directly (as a script or a
# notebook cell, where __name__ is "__main__"), so that importing the module
# does not kick off a full scraping run as a side effect.
if __name__ == "__main__":
    main(subreddits)
