In [15]:
import os
import json
import pandas as pd
import io
import datetime
import logging
import zstandard as zstd
import re 

In [16]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [17]:
def stream_zst_file(file_path):
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)  # 2 GB
        with dctx.stream_reader(fh) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            for line in text_stream:
                yield line

In [18]:
def read_keywords(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            keywords = [line.strip().lower().replace('_', ' ') for line in file]
        return keywords
    except Exception as e:
        logging.error(f"Error reading keywords file {file_path}: {e}")
        return []

In [19]:
def contains_keyword(content, keywords):
    pattern = r'\b(' + '|'.join([re.escape(keyword) for keyword in keywords]) + r')\b'
    return re.search(pattern, content) is not None

In [20]:
def filter_posts(file_path, subreddits, keywords):
    data = []
    for line in stream_zst_file(file_path):
        try:
            post = json.loads(line)
            title = post.get('title', '').lower()  
            content = post.get('selftext', '').lower() 

            if post.get('subreddit') in subreddits and (contains_keyword(title, keywords) or contains_keyword(content, keywords)):
                data.append({
                    'title': post.get('title', ''),
                    'author': post.get('author', ''),
                    'content': post.get('selftext', ''),
                    'id': post.get('id', ''),
                    'score': post.get('score', 0),
                    'created_utc': datetime.datetime.fromtimestamp(post.get('created_utc', 0)).strftime('%Y-%m-%d %H:%M:%S'),
                    'url': post.get('url', ''),
                    'num_comments': post.get('num_comments', 0),
                    'subreddit': post.get('subreddit', '')
                })
        except json.JSONDecodeError:
            logging.error(f"JSONDecodeError for line: {line}")
        except Exception as e:
            logging.error(f"Error processing line in {file_path}: {e}")
    return data


In [21]:
def main():
    directory_path = 'E:\\Torrents\\reddit\\submissions\\2023'
    subreddits = ['askSingapore', 'NTU', 'nus', 'SGExams', 'singapore', 'SingaporeRaw','NationalServiceSG']  
    keywords_file_path = "C:\\Users\\ntu-s\\OneDrive - Nanyang Technological University\\sherry\\extended_keywords_to keep [7jun].txt"

    keywords = read_keywords(keywords_file_path)
    print(keywords)

    for month in range(1, 13):  
        output_csv = f"C:\\Users\\ntu-s\\OneDrive - Nanyang Technological University\\sherry\\filtered_posts_{month:02}.csv" 
        all_data = []
        for filename in os.listdir(directory_path):
            if filename.startswith(f'RS_2023-{month:02}') and filename.endswith('.zst'):
                file_path = os.path.join(directory_path, filename)>
                posts = filter_posts(file_path, subreddits, keywords)
                all_data.extend(posts)


        df = pd.DataFrame(all_data)
        df.to_csv(output_csv, index=False)
        logging.info(f"Filtered data saved to '{output_csv}'.")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()

['abandonment', 'abandoned', 'alcoholism', 'alcoholic', 'angst', 'anguish', 'anxieties', 'anxiety', 'anxiousness', 'anxious', 'bi polar disorder', 'bipolar depression', 'bipolar disorder', 'burnout', 'commit suicide', 'cruel', 'defeat', 'depressed', 'depressing', 'depression', 'depressive', 'depressive illness', 'depressive illnesses', 'desolation', 'despair', 'despondency', 'destructive', 'destruction', 'devastated', 'devastating', 'disappointed', 'disappointment', 'disappointing', 'disorder', 'distraught', 'disheartening', 'distressing', 'emptiness', 'fail', 'failed', 'failing', 'failure', 'failures', 'fear', 'frustrated', 'frustrating', 'grief', 'heartbreaking', 'hectic', 'helplessness', 'hopeless', 'hopelessness', 'hurt', 'hurts', 'hurting', 'inability', 'inconsequential', 'isolated', 'isolation', 'lonely', 'loneliness', 'loser', 'losers', 'meaningless', 'meaninglessness', 'mental illness', 'mental health', 'nerve racking', 'nerve wracking', 'nervousness', 'painful', 'pointless', '

2024-06-20 15:07:42,532 - INFO - Filtered data saved to 'C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\filtered_posts_01.csv'.
2024-06-20 15:27:30,279 - INFO - Filtered data saved to 'C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\filtered_posts_02.csv'.
2024-06-20 15:50:40,991 - INFO - Filtered data saved to 'C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\filtered_posts_03.csv'.
2024-06-20 16:18:43,259 - INFO - Filtered data saved to 'C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\filtered_posts_04.csv'.
2024-06-20 16:40:11,981 - INFO - Filtered data saved to 'C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\filtered_posts_05.csv'.
2024-06-20 17:05:55,479 - INFO - Filtered data saved to 'C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\filtered_posts_06.csv'.
2024-06-20 17:34:30,419 - INFO - Filtered data saved to 'C:\Users\ntu-s\OneDrive - Nanyang Technological University\sh