Clean Q2 reddit files

In [7]:
import csv
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Cleans the input text by removing unwanted characters,
    normalizing whitespace, and lemmatizing words.
    
    Args:
        text (str): The input text to clean.
    
    Returns:
        str: The cleaned text.
    """
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    words = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    return ' '.join(words)

def preprocess_reddit_data(filename):
    """
    Reads Reddit data from a CSV file, cleans the posts and comments,
    and returns a DataFrame suitable for analysis. Only cleans post_title and comment_body.
    
    Args:
        filename (str): The path to the CSV file containing Reddit data.
    
    Returns:
        pd.DataFrame: A DataFrame containing cleaned posts and comments without duplicating post titles.
    """
    # Load the data from CSV
    df = pd.read_csv(filename)

    # Initialize a list to store processed data
    structured_data = []

    # Track post titles to avoid duplication
    post_titles_added = set()

    # Clean only the post titles and comments
    for index, row in df.iterrows():
        cleaned_post_title = clean_text(row['post_title'])

        # Add post title only if it hasn't been added before
        if cleaned_post_title not in post_titles_added:
            structured_data.append({
                'type': 'post',
                'post_title': cleaned_post_title,
                'post_created_date': row['post_created_date'],
                'post_url': row['post_url'],
                'post_subreddit': row['post_subreddit'],
                'comment_body': '',  # Empty for post rows
                'comment_author': '',  # Empty for post rows
                'comment_created_date': ''  # Empty for post rows
            })
            post_titles_added.add(cleaned_post_title)

        # Clean and add comments only if there's a valid comment body
        comment_body = row['comment_body']
        comment_author = row['comment_author']
        comment_created_date = row['comment_created_date']

        if isinstance(comment_body, str) and comment_body.strip():  # Ensure there's a comment
            structured_data.append({
                'type': 'comment',
                'post_title': '',  # Empty for comment rows to avoid duplication
                'post_created_date': '',  # Empty for comment rows
                'post_url': '',  # Empty for comment rows
                'post_subreddit': row['post_subreddit'],  # Keep subreddit for comments
                'comment_body': clean_text(comment_body),
                'comment_author': comment_author,
                'comment_created_date': comment_created_date
            })

    # Convert to DataFrame for easier manipulation
    structured_df = pd.DataFrame(structured_data)

    return structured_df


reddit_data_filename = 'Q2/reddit_asian.csv'  # Update with your file path
cleaned_reddit_data = preprocess_reddit_data(reddit_data_filename)

# Save cleaned data to a new CSV file
cleaned_reddit_data.to_csv('Q2/cleaned_reddit_asian.csv', index=False)

print("Cleaned Reddit data saved to 'cleaned_reddit_western.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cleaned Reddit data saved to 'cleaned_reddit_western.csv'.
