In [3]:
import os

import praw
import pandas as pd

# Reddit API credentials.
# The client id and secret are read from the environment instead of being
# written into the notebook, so they do not end up in version control or in
# shared copies of this file. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# before running this cell; a missing variable raises KeyError naming it.
# NOTE(review): any key pair that was previously committed in this notebook
# should be treated as leaked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="CryptoSentimentApp",
)

def fetch_reddit_posts(coin, num_posts, subreddit_name="cryptocurrency", client=None):
    """
    Search a subreddit for posts about a cryptocurrency and return them as a DataFrame.

    Parameters:
    - coin (str): The cryptocurrency to search for; lower-cased before being
      used as the search query.
    - num_posts (int): Value passed as the search ``limit``. The result may
      hold fewer rows if the search yields fewer posts.
    - subreddit_name (str): Subreddit to search. Defaults to "cryptocurrency",
      which was previously hard-coded.
    - client: Object exposing ``subreddit(name).search(query, limit=...)``.
      When omitted, the notebook-level ``reddit`` client is used.

    Returns:
    - pd.DataFrame: One row per post with the columns Post_ID, Title, Content,
      Upvotes, Comments, URL and Timestamp. Timestamp holds the post's
      ``created_utc`` value unchanged. An empty search yields an empty frame
      that still carries these columns.
    """
    # Resolve the global lazily so callers (and tests) can inject their own
    # client without the notebook-level one having to exist.
    api = reddit if client is None else client
    query = coin.lower()

    columns = ["Post_ID", "Title", "Content", "Upvotes", "Comments", "URL", "Timestamp"]
    rows = [
        (
            post.id,
            post.title,
            post.selftext,      # Post body text
            post.score,         # Upvotes
            post.num_comments,  # Number of comments
            post.url,           # Post link
            post.created_utc,   # Creation timestamp as reported by the API
        )
        for post in api.subreddit(subreddit_name).search(query, limit=num_posts)
    ]
    return pd.DataFrame(rows, columns=columns)


# Request Bitcoin posts, preview the first rows, and persist the result.
# NOTE(review): the variable and file names say "100" but 1000 posts are requested.
# to_csv keeps its default of writing the row index as an unnamed first column;
# the cleaning cell below drops that column after the CSV is read back.
df_bitcoin_100 = fetch_reddit_posts(coin="Bitcoin", num_posts=1000)
print(df_bitcoin_100.head(n=5))
df_bitcoin_100.to_csv(path_or_buf='csv_bitcoin_100.csv')


   Post_ID                                              Title  \
0  1gqafju  Bitcoin has followed a consistent 4-year cycle...   
1  1iq3fe9         Me In 2009 Instead of Buying Bitcoin (BTC)   
2  1h6yoqp  On February 9th 2011 Bitcoin first touched $1....   
3  1ik2qgu  Explaining Bitcoin 12 Years Ago When It Was Wo...   
4  1hbsf6a  This Anonymous guy received $50 worth of Bitco...   

                        Content  Upvotes  Comments  \
0                                   3572       700   
1                                  17764       309   
2                                   7822       495   
3                                   8419       293   
4  Imagine hodling for 13 Years     7415       452   

                                    URL     Timestamp  
0  https://i.redd.it/95px1ns8in0e1.jpeg  1.731496e+09  
1   https://i.redd.it/denbcysakbje1.png  1.739632e+09  
2   https://i.redd.it/m7ll0go40y4e1.png  1.733366e+09  
3   https://i.redd.it/hd0ul5cglrhe1.png  1.738955e+09  
4  h

In [4]:
import pandas as pd
import re
from datetime import datetime

def clean_reddit_data(df):
    """
    Clean a raw Reddit posts DataFrame without modifying the input.

    Steps, in order:
    1. Column names are lower-cased and spaces replaced with underscores.
    2. ``timestamp`` is converted from UNIX seconds to datetime.
    3. ``title`` and ``content`` are lower-cased, stripped of URLs and of every
       character that is not an ASCII letter, digit or whitespace, then trimmed.
       Non-string values (e.g. NaN) are left untouched.
    4. Rows with a duplicate cleaned title, then rows with a duplicate URL, are
       dropped (the first occurrence is kept).
    5. The ``unnamed:_0`` column, left behind when a CSV written with its index
       is read back, is dropped if present.

    Parameters:
    - df (pd.DataFrame): Raw Reddit dataset with at least Title, Content, URL
      and Timestamp columns (any letter case).

    Returns:
    - pd.DataFrame: Cleaned dataset; the original row index labels are preserved.
    """

    def clean_text(text):
        # Only strings are normalised; missing values pass through unchanged.
        if isinstance(text, str):
            text = text.lower()
            text = re.sub(r"http\S+", "", text)         # Remove URLs
            text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
            text = text.strip()
        return text

    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.lower().str.replace(" ", "_")
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], unit="s")

    cleaned["title"] = cleaned["title"].apply(clean_text)
    cleaned["content"] = cleaned["content"].apply(clean_text)

    # Chained, non-inplace calls keep the data lineage explicit.
    cleaned = (
        cleaned
        .drop_duplicates(subset=["title"])
        .drop_duplicates(subset=["url"])
    )

    # errors="ignore": the index artefact only exists after a CSV round trip, so
    # a frame passed straight from the fetch step must not raise KeyError here.
    cleaned = cleaned.drop(columns=["unnamed:_0"], errors="ignore")

    return cleaned

# Reload the saved posts from disk, run them through the cleaning step,
# and preview the first rows of the cleaned frame.
df_bitcoin_100 = pd.read_csv(filepath_or_buffer='csv_bitcoin_100.csv')
df_clean = clean_reddit_data(df=df_bitcoin_100)
print(df_clean.head(n=5))


   post_id                                              title  \
0  1gqafju  bitcoin has followed a consistent 4year cycle ...   
1  1iq3fe9           me in 2009 instead of buying bitcoin btc   
2  1h6yoqp  on february 9th 2011 bitcoin first touched 1 l...   
3  1ik2qgu  explaining bitcoin 12 years ago when it was wo...   
4  1hbsf6a  this anonymous guy received 50 worth of bitcoi...   

                        content  upvotes  comments  \
0                           NaN     3572       700   
1                           NaN    17764       309   
2                           NaN     7822       495   
3                           NaN     8419       293   
4  imagine hodling for 13 years     7415       452   

                                    url           timestamp  
0  https://i.redd.it/95px1ns8in0e1.jpeg 2024-11-13 11:04:46  
1   https://i.redd.it/denbcysakbje1.png 2025-02-15 15:13:54  
2   https://i.redd.it/m7ll0go40y4e1.png 2024-12-05 02:39:33  
3   https://i.redd.it/hd0ul5cglrhe1.