In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class ContentCleaner:
    def __init__(self):
        # Download required NLTK data
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        
        # Initialize blacklist of terms to filter out
        self.blacklist = {
            # Technical web/file terms
            'http', 'https', 'www', 'com', 'html', 'htm',
            'amp', 'rt', 'url', 'href', 'src', 'png', 'jpg',
            'jpeg', 'gif', 'pdf', 'xml', 'php', 'asp', 'js',
            'css', 'img', 'pic', 'download', 'click', 'link',
            
            # Platform-specific terms
            'reddit', 'youtube', 'twitter', 'facebook', 'instagram',
            'upvote', 'downvote', 'karma', 'post', 'thread', 
            'subreddit', 'edit', 'tldr', 'subscribe', 'channel',
            'video', 'tweet', 'comment', 'mod', 'moderator',
            
            # Common stopwords
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
            'you', 'your', 'yours', 'yourself', 'yourselves',
            'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
            'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
            'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
            'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
            'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
            'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
            'against', 'between', 'into', 'through', 'during', 'before',
            'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
            'out', 'on', 'off', 'over', 'under', 'again', 'further',
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
            'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
            'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
            'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
            'don', 'should', 'now',
            
            # Numbers and basic words
            'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
            'first', 'second', 'third', 'fourth', 'fifth',
            'would', 'like', 'much', 'more', 'most', 'some', 'any', 'every', 'all'
        }

    def clean_text(self, text):

        # Convert to lowercase
        text = str(text).lower()
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        # Tokenize
        word_tokens = word_tokenize(text)
        
        # Remove blacklisted words
        filtered_words = [word for word in word_tokens if word not in self.blacklist]
        
        # Join words back together
        return ' '.join(filtered_words)

    def clean_dataframe(self, df):

        cleaned_df = df.copy()
        
        # Clean text content
        cleaned_df['cleaned_content'] = cleaned_df[2].apply(self.clean_text)
        
        return cleaned_df

# Read and clean data
cleaner = ContentCleaner()

    # Read CSV file without headers
    df = pd.read_csv('reddit_fomo_data.csv', header=None)
    
    # Clean the data
    cleaned_df = cleaner.clean_dataframe(df)
    
    # Save cleaned data
    cleaned_df.to_csv('Analysis/c_reddit_fomo_data.csv', index=False)
    
    print("Data cleaning completed successfully")
    print("\nFirst few rows of cleaned content:")
    print(cleaned_df['cleaned_content'].head())
    


Data cleaning completed successfully

First few rows of cleaned content:
0    letter fomo dear friend doesn suffer fomo ment...
1    fomo life m ve never clubbing proper group fri...
2    happy cakeday r fomo today re let look back me...
3    anyone covid fomo feel m missing many social t...
4    deal fomo hey everyone ve dealing fomo ever si...
Name: cleaned_content, dtype: object
