In [1]:
import zstandard
import os 
import json
import sys
import csv
from datetime import datetime
import logging.handlers


In [4]:

log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
log.addHandler(logging.StreamHandler())

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
    chunk = reader.read(chunk_size)
    bytes_read += chunk_size
    if previous_chunk is not None:
        chunk = previous_chunk + chunk
    try:
        return chunk.decode()
    except UnicodeDecodeError:
        if bytes_read > max_window_size:
            raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
        return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)

def read_lines_zst(file_name):
    with open(file_name, 'rb') as file_handle:
        buffer = ''
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
        while True:
            chunk = read_and_decode(reader, 2**27, (2**29) * 2)
            if not chunk:
                break
            lines = (buffer + chunk).split("\n")
            for line in lines[:-1]:
                yield line, file_handle.tell()
            buffer = lines[-1]
        reader.close()

if __name__ == "__main__":
    zst_file_name = 'RS_2021-01'
    input_file_path = '../../submissions/' + zst_file_name + '.zst'
    output_file_path = '../submissions/csv/2021/'+ zst_file_name +'.csv'

    # List of subreddits to filter by
    subreddits = ['NOLA', 'acadiana', 'centrallouisiana', 'houma', 'louisiana', 
                  'batonrouge', 'neworleans', 'Thibodaux', 'newyork', 'Adirondacks', 
                  'bronx', 'Brooklyn', 'Bushwick', 'hudsonvalley', 'OrangeCountyNY', 
                  'Queens', 'Westchester', 'Buffalo', 'ForestHills', 'longisland', 
                  'nyc', 'parkslope', 'williamsburg', 'newjersey', 'Hoboken', 
                  'jerseycity', 'Newark', 'SouthJersey', 'asburypark', 'NewBrunswickNJ', 
                  'texas', 'Austin', 'CorpusChristi', 'galveston', 'pflugerville', 
                  'sanantonio', 'corpus', 'Denton', 'houston', 'TriCitiesWA', 'Washington', 'oregon', 'Seattle', 'SeattleWA', 'Portland']

    fields = ['id', 'author', 'title', 'subreddit', 'created_utc', 'url', 'num_comments', 'score', 'selftext']

    file_size = os.stat(input_file_path).st_size
    file_lines = 0
    file_bytes_processed = 0
    bad_lines = 0

    with open(output_file_path, "w", encoding='utf-8', newline="") as output_file:
        writer = csv.writer(output_file)
        writer.writerow(fields)  # Write the header

        for line, file_bytes_processed in read_lines_zst(input_file_path):
            try:
                obj = json.loads(line)
                if obj.get('subreddit', '').lower() in (sub.lower() for sub in subreddits):
                    writer.writerow([str(obj.get(field, '')).replace("\n", " ") for field in fields])
            except json.JSONDecodeError as err:
                bad_lines += 1
            except KeyError as err:
                log.info(f"Missing field in line: {err}")
                bad_lines += 1

            file_lines += 1
            if file_lines % 100000 == 0:
                log.info(f"Processed {file_lines:,} lines: {bad_lines:,} bad lines : {(file_bytes_processed / file_size) * 100:.2f}% complete")

    log.info(f"Complete : {file_lines:,} lines processed with {bad_lines:,} bad lines.")

Processed 100,000 lines: 0 bad lines : 0.36% complete
Processed 100,000 lines: 0 bad lines : 0.36% complete
Processed 100,000 lines: 0 bad lines : 0.36% complete
Processed 200,000 lines: 0 bad lines : 0.71% complete
Processed 200,000 lines: 0 bad lines : 0.71% complete
Processed 200,000 lines: 0 bad lines : 0.71% complete
Processed 300,000 lines: 0 bad lines : 1.04% complete
Processed 300,000 lines: 0 bad lines : 1.04% complete
Processed 300,000 lines: 0 bad lines : 1.04% complete
Processed 400,000 lines: 0 bad lines : 1.26% complete
Processed 400,000 lines: 0 bad lines : 1.26% complete
Processed 400,000 lines: 0 bad lines : 1.26% complete
Processed 500,000 lines: 0 bad lines : 1.57% complete
Processed 500,000 lines: 0 bad lines : 1.57% complete
Processed 500,000 lines: 0 bad lines : 1.57% complete
Processed 600,000 lines: 0 bad lines : 1.91% complete
Processed 600,000 lines: 0 bad lines : 1.91% complete
Processed 600,000 lines: 0 bad lines : 1.91% complete
Processed 700,000 lines: 0 b

In [4]:

log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
log.addHandler(logging.StreamHandler())

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
    chunk = reader.read(chunk_size)
    bytes_read += chunk_size
    if previous_chunk is not None:
        chunk = previous_chunk + chunk
    try:
        return chunk.decode()
    except UnicodeDecodeError:
        if bytes_read > max_window_size:
            raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
        return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)

def read_lines_zst(file_name):
    with open(file_name, 'rb') as file_handle:
        buffer = ''
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
        while True:
            chunk = read_and_decode(reader, 2**27, (2**29) * 2)
            if not chunk:
                break
            lines = (buffer + chunk).split("\n")
            for line in lines[:-1]:
                yield line, file_handle.tell()
            buffer = lines[-1]
        reader.close()

if __name__ == "__main__":
    zst_file_name = 'RC_2021-08' 
    input_file_path = '../../comments/' +zst_file_name+ '.zst'
    output_file_path = '../comments/csv/'+zst_file_name+'.csv'
    subreddit_to_filter = 'austin'  # The subreddit you want to filter by
    subreddits = ['NOLA', 'acadiana', 'centrallouisiana', 'houma', 'louisiana', 
                  'batonrouge', 'neworleans', 'Thibodaux', 'newyork', 'Adirondacks', 
                  'bronx', 'Brooklyn', 'Bushwick', 'hudsonvalley', 'OrangeCountyNY', 
                  'Queens', 'Westchester', 'Buffalo', 'ForestHills', 'longisland', 
                  'nyc', 'parkslope', 'williamsburg', 'newjersey', 'Hoboken', 
                  'jerseycity', 'Newark', 'SouthJersey', 'asburypark', 'NewBrunswickNJ', 
                  'texas', 'Austin', 'CorpusChristi', 'galveston', 'pflugerville', 
                  'sanantonio', 'corpus', 'Denton', 'houston', 'TriCitiesWA', 'Washington', 'oregon', 'Seattle', 'SeattleWA', 'Portland']

    fields = ['id', 'author', 'body', 'link_id', 'parent_id', 'subreddit', 'subreddit_id', 'name', 'created_utc']

    file_size = os.stat(input_file_path).st_size
    file_lines = 0
    file_bytes_processed = 0
    bad_lines = 0

    with open(output_file_path, "w", encoding='utf-8', newline="") as output_file:
        writer = csv.writer(output_file)
        writer.writerow(fields)  # Write the header

        for line, file_bytes_processed in read_lines_zst(input_file_path):
            try:
                obj = json.loads(line)
                if obj.get('subreddit').lower() in (sub.lower() for sub in subreddits):
                    writer.writerow([str(obj.get(field, '')).replace("\n", " ") for field in fields])
            except json.JSONDecodeError as err:
                bad_lines += 1
            except KeyError as err:
                log.info(f"Missing field in line: {err}")
                bad_lines += 1

            file_lines += 1
            if file_lines % 100000 == 0:
                log.info(f"Processed {file_lines:,} lines: {bad_lines:,} bad lines : {(file_bytes_processed / file_size) * 100:.2f}% complete")

    log.info(f"Complete : {file_lines:,} lines processed with {bad_lines:,} bad lines.")


Processed 100,000 lines: 0 bad lines : 0.09% complete
Processed 100,000 lines: 0 bad lines : 0.09% complete
Processed 200,000 lines: 0 bad lines : 0.13% complete
Processed 200,000 lines: 0 bad lines : 0.13% complete
Processed 300,000 lines: 0 bad lines : 0.18% complete
Processed 300,000 lines: 0 bad lines : 0.18% complete
Processed 400,000 lines: 0 bad lines : 0.22% complete
Processed 400,000 lines: 0 bad lines : 0.22% complete
Processed 500,000 lines: 0 bad lines : 0.26% complete
Processed 500,000 lines: 0 bad lines : 0.26% complete
Processed 600,000 lines: 0 bad lines : 0.30% complete
Processed 600,000 lines: 0 bad lines : 0.30% complete
Processed 700,000 lines: 0 bad lines : 0.34% complete
Processed 700,000 lines: 0 bad lines : 0.34% complete
Processed 800,000 lines: 0 bad lines : 0.39% complete
Processed 800,000 lines: 0 bad lines : 0.39% complete
Processed 900,000 lines: 0 bad lines : 0.43% complete
Processed 900,000 lines: 0 bad lines : 0.43% complete
Processed 1,000,000 lines: 0

In [6]:
import pandas as pd
test_data = pd.read_csv('data/comments/comments_2021-11.csv', nrows=1000)
test_sub_data = pd.read_csv('reddit/submissions/RS_2021-11.csv', nrows=1000)


In [2]:
test_data.to_csv('data/comments/comments_2021-11_min.csv', index=False)



In [7]:
test_sub_data.to_csv('reddit/submissions/submissions_2021-11_min.csv', index=False)

In [5]:
subreddits = ['NOLA', 'acadiana', 'centrallouisiana', 'houma', 'louisiana', 
                  'batonrouge', 'neworleans', 'Thibodaux', 'newyork', 'Adirondacks', 
                  'bronx', 'Brooklyn', 'Bushwick', 'hudsonvalley', 'OrangeCountyNY', 
                  'Queens', 'Westchester', 'Buffalo', 'ForestHills', 'longisland', 
                  'nyc', 'parkslope', 'williamsburg', 'newjersey', 'Hoboken', 
                  'jerseycity', 'Newark', 'SouthJersey', 'asburypark', 'NewBrunswickNJ', 
                  'texas', 'Austin', 'CorpusChristi', 'galveston', 'pflugerville', 
                  'sanantonio', 'corpus', 'Denton', 'houston']

# all items of subreddits list are converted to lower case
subreddits = [item.lower() for item in subreddits]

# write the updated list to a text file
with open('subreddits.txt', 'w') as filehandle:
    for listitem in subreddits:
        filehandle.write('%s\n' % listitem)