In [9]:
import csv
import re
from collections import Counter
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')  # for lemmatization
nltk.download('punkt')  # for tokenization

# Initialize NLTK components
tweetTokenizer = nltk.tokenize.TweetTokenizer()
tweetLemmatizer = WordNetLemmatizer()
lPunct = list(string.punctuation)
lStopwords = stopwords.words('english') + lPunct + ['via']

# Define irrelevant terms to filter out
irrelevant_terms = {'im', 'i\'m', 'one', 'tri', 'thi', 'ani', 'it', 'it\'', 'ha', 'also', 'ab', 'get', 'thank', 
                    'go', 'ask', 'think', 'like', 'feel', 'would', 'wa', '...', '<br>', 'href', '</a>', '“', '”', 
                    'u', '❤', '😂', '<3', '>', '<', '.', ',', '!', '@', '#', '$', '%', '^', '&', '*', 
                    '(', ')', 'you', 'i', 'the', 'is', 'a', 'that', 'to', 'and', 'of', 'in',
                    'for', 'on', 'with', 'as', 'are', 'this', 'was', 'by', 'at', 'be', 'but',
                    'not', 'from', 'an', 'or', 'which', 'that', 'we', 'so', 'if', 'they', 'he',
                    'she', 'their', 'my', 'his', 'her', 'its', 'there', 'when', 'who', 'what',
                    'where', 'how', 'than', 'then', 'because', 'more', 'like', 'these', 'just'}

# Load CSV file
fCsvName = 'Q1/preprocessed_unrealistic_asian.csv'

# Set the number of common terms to display
freqNum = 50

# Initialize NLTK components
tweetTokeniser = nltk.tokenize.TweetTokenizer()
lPunct = list(string.punctuation)
lStopwords = stopwords.words('english') + lPunct + ['via']
tweetLemmatizer = WordNetLemmatizer()
# Initialize term frequency counter and processed posts list
termFreqCounter = Counter()
processedPosts = []

# Text preprocessing function
def processText(text, tokenizer, lemmatizer, stopwords):
    text = text.lower()
    lTokens = tokenizer.tokenize(text)
    lTokens = [token.strip() for token in lTokens]
    lLemmatizedTokens = set([lemmatizer.lemmatize(tok) for tok in lTokens])
    return [tok for tok in lLemmatizedTokens if tok not in stopwords and not tok.isdigit()]

# Process CSV file
with open(fCsvName, 'r', newline='', encoding='utf-8') as f:
    csv_reader = csv.DictReader(f)
    
    for submission in csv_reader:
        # Process post title
        postTitle = submission.get('post_title', '')
        postTitle = re.sub(u"(\u2018|\u2019|\u2014)", "", postTitle)
        lTokens = processText(text=postTitle, tokenizer=tweetTokeniser, lemmatizer=tweetLemmatizer, stopwords=lStopwords)
        filteredTokens = [token for token in lTokens if token not in irrelevant_terms]
        termFreqCounter.update(filteredTokens)
        processedPosts.append(" ".join(filteredTokens)) 
        
        # Process comment body
        commentBody = submission.get('comment_body', '')
        commentBody = re.sub(u"(\u2018|\u2019|\u2014)", "", commentBody)
        lTokens = processText(text=commentBody, tokenizer=tweetTokeniser, lemmatizer=tweetLemmatizer, stopwords=lStopwords)
        filteredTokens = [token for token in lTokens if token not in irrelevant_terms]
        termFreqCounter.update(filteredTokens)
        processedPosts.append(" ".join(filteredTokens)) 
            
       
for term, count in termFreqCounter.most_common(freqNum):
    print(f"{term}: {count}")

# Save preprocessed posts to a new CSV file
with open('preprocessed_asian_stereotypes_reddit_posts.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Processed Text'])  # Header
    for post in processedPosts:
        writer.writerow([post])

print("Processed posts saved to 'preprocessed_asian_stereotypes_reddit_posts.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed posts saved to 'preprocessed_asian_stereotypes_reddit_posts.csv'.


In [17]:
import csv
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')  # for lemmatization
nltk.download('punkt')  # for tokenization

# Initialize NLTK components
tweetLemmatizer = WordNetLemmatizer()
lStopwords = stopwords.words('english') + list(string.punctuation) + ['via']

# Load existing CSV file
fCsvName = 'Q1/preprocessed_unrealistic_western_stereotype__data.csv'
outputCsvName = 'preprocessed_cleaned_western_stereotypes.csv'

# Text preprocessing function
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove special symbols and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lower case
    text = text.lower()
    return text

# Initialize a list for processed texts
processed_texts = []

# Process the existing CSV file
with open(fCsvName, 'r', newline='', encoding='utf-8') as f:
    csv_reader = csv.DictReader(f)
    
    for row in csv_reader:
        original_text = row.get('Processed Text', '')
        cleaned_text = preprocess_text(original_text)
        processed_texts.append(cleaned_text)

# Save the cleaned processed texts to a new CSV file
with open(outputCsvName, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Processed Text'])  # Header
    for text in processed_texts:
        writer.writerow([text])

print(f"Processed texts saved to '{outputCsvName}'.")


Processed texts saved to 'preprocessed_cleaned_western_stereotypes.csv'.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import csv
import json
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Lemmatizer and Stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Text Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove non-alphanumeric characters
    text = re.sub(r'\W', ' ', text)
    # Tokenize, lemmatize, and remove stop words
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    # Join the tokens back into a string
    return ' '.join(tokens)

# Function to save preprocessed data to CSV
def save_preprocessed_to_csv(posts, filename='clean_reddit_data.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Write header in CSV
        writer.writerow(['post_title', 'post_score', 'post_created_date', 'post_url', 'comment_body', 'comment_created_date'])

        # Write cleaned data
        for post in posts:
            for comment in post['comments']:
                writer.writerow([
                    preprocess_text(post['title']),       # Cleaned title
                    post['score'],
                    post['created_date'],
                    post['url'],
                    preprocess_text(comment['body']),     # Cleaned comment
                    comment['created_date']
                ])

# Function to save preprocessed data to JSON
def save_preprocessed_to_json(posts, filename='clean_reddit_data.json'):
    cleaned_posts = []

    for post in posts:
        cleaned_post = {
            'title': preprocess_text(post['title']),  # Cleaned title
            'score': post['score'],
            'created_date': post['created_date'],
            'url': post['url'],
            'comments': []
        }

        for comment in post['comments']:
            cleaned_comment = {
                'body': preprocess_text(comment['body']),  # Cleaned comment
                'created_date': comment['created_date']
            }
            cleaned_post['comments'].append(cleaned_comment)

        cleaned_posts.append(cleaned_post)

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(cleaned_posts, f, indent=4)

# Example usage (assuming you already have fetched posts)
save_preprocessed_to_csv(posts, 'cleaned_reddit_data.csv')
save_preprocessed_to_json(posts, 'cleaned_reddit_data.json')


In [None]:
q2 cleaning 

In [9]:
import nltk

# Download the punkt tokenizer
nltk.download('punkt')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
import csv
import re
from collections import Counter
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Cleans the input text by removing unwanted characters,
    normalizing whitespace, and lemmatizing words.
    
    Args:
        text (str): The input text to clean.
    
    Returns:
        str: The cleaned text.
    """
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    words = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    return ' '.join(words)

def preprocess_reddit_data(filename):
    """
    Reads Reddit data from a CSV file, cleans the posts and comments,
    and returns a DataFrame suitable for ICM analysis.
    
    Args:
        filename (str): The path to the CSV file containing Reddit data.
    
    Returns:
        pd.DataFrame: A DataFrame containing cleaned posts and comments.
    """
    # Load the data from CSV
    df = pd.read_csv(filename)

    # Clean the post titles and comments
    df['cleaned_post_title'] = df['post_title'].apply(clean_text)
    df['cleaned_comment_body'] = df['comment_body'].apply(clean_text)


    return df[['cleaned_post_title', 'cleaned_comment_body']]

# Example usage
reddit_data_filename = 'Q2/reddit_western.csv'
cleaned_reddit_data = preprocess_reddit_data(reddit_data_filename)

# Save cleaned data to a new CSV file
cleaned_reddit_data.to_csv('Q2/cleaned_reddit_western.csv', index=False)

print("Cleaned Reddit data saved to 'cleaned_reddit_western_inclusivity.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\joell/nltk_data'
    - 'C:\\Users\\joell\\anaconda3\\nltk_data'
    - 'C:\\Users\\joell\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\joell\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\joell\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
