In [None]:
#https://www.reddit.com/r/pushshift/comments/11ef9if/separate_dump_files_for_the_top_20k_subreddits/
#License: No license specified, the work may be protected by copyright.

#https://academictorrents.com/details/c398a571976c78d346c325bd75c47b82edf6124e
#https://github.com/Watchful1/PushshiftDumps

In [1]:
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers

In [2]:
from pathlib import Path
print(Path.cwd())

/Users/selinzobu/Desktop/1SCRAPE DATA


In [None]:
#BEFORE 2017-03-01 - 2019-12-31   13910 submissions
#AFTER  2020-01-01 - 2021-02-28   14112 submissions

In [45]:
# put the path to the input file
input_file = "/Users/selinzobu/Desktop/1SCRAPE DATA/insomnia_submissions.zst"
# put the name or path to the output file. 
output_file = "/Users/selinzobu/Desktop/1SCRAPE DATA/11BEFORECOinsomnia_submissions"
output_format = "csv"
is_submission = "submission" in input_file

In [46]:
# only output items between these two dates
from_date = datetime.strptime("2017-03-01", "%Y-%m-%d")
to_date = datetime.strptime("2019-12-31", "%Y-%m-%d")

In [19]:
# the field to filter on, the values to filter with and whether it should be an exact match
# some examples:
#
# return only objects where the author is u/watchful1 or u/spez
# field = "author"
# values = ["watchful1","spez"]
# exact_match = True
#
# return only objects where the title contains either "stonk" or "moon"
# field = "title"
# values = ["stonk","moon"]
# exact_match = False
#
# return only objects where the body contains either "stonk" or "moon". For submissions the body is in the "selftext" field, for comments it's in the "body" field
#field = "selftext"
#values = ["sleep","insomnia"]
#exact_match = False

In [30]:
# sets up logging to the console as well as a file
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
	os.makedirs("logs")
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)

In [31]:
def write_line_zst(handle, line):
	handle.write(line.encode('utf-8'))
	handle.write("\n".encode('utf-8'))

def write_line_json(handle, obj):
	handle.write(json.dumps(obj))
	handle.write("\n")

In [32]:
def write_line_csv(writer, obj, is_submission):
	output_list = []
	output_list.append(str(obj['score']))
	output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
	if is_submission:
		output_list.append(obj['title'])
	output_list.append(f"u/{obj['author']}")
	output_list.append(f"https://www.reddit.com{obj['permalink']}")
	if is_submission:
		if obj['is_self']:
			if 'selftext' in obj:
				output_list.append(obj['selftext'])
			else:
				output_list.append("")
		else:
			output_list.append(obj['url'])
	else:
		output_list.append(obj['body'])
	writer.writerow(output_list)

In [33]:
def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)

In [34]:
def read_lines_zst(file_name):
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)

			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()

In [47]:
if __name__ == "__main__":
	output_path = f"{output_file}.{output_format}"

	writer = None
	if output_format == "zst":
		handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
	elif output_format == "txt":
		handle = open(output_path, 'w', encoding='UTF-8')
	elif output_format == "csv":
		handle = open(output_path, 'w', encoding='UTF-8', newline='')
		writer = csv.writer(handle)
	else:
		log.error(f"Unsupported output format {output_format}")
		sys.exit()

#	values = [value.lower() for value in values]  # convert to lowercase

	file_size = os.stat(input_file).st_size
	file_bytes_processed = 0
	created = None
#	matched_lines = 0
	bad_lines = 0
	total_lines = 0
	for line, file_bytes_processed in read_lines_zst(input_file):
		total_lines += 1
		if total_lines % 100000 == 0:
			log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")

		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))

			if created < from_date:
				continue
			if created > to_date:
				continue

#			field_value = obj[field].lower()
#			matched = False
#			for value in values:
#				if exact_match:
#					if value == field_value:
#						matched = True
#						break
#				else:
#					if value in field_value:
#						matched = True
#						break
#			if not matched:
#				continue

#			matched_lines += 1
			if output_format == "zst":
				write_line_zst(handle, line)
			elif output_format == "csv":
				write_line_csv(writer, obj, is_submission)
			elif output_format == "txt":
				write_line_json(handle, obj)
		except (KeyError, json.JSONDecodeError) as err:
			bad_lines += 1

	handle.close()
	log.info(f"Complete : {total_lines:,} : {bad_lines:,}")
    
    
#2023-03-13 20:58:12,808 - INFO: Complete : 55,726 : 0 submissions
#2023-03-13 21:05:49,852 - INFO: Complete : 340,130 : 0 comments

2023-03-26 19:58:22,936 - INFO: Complete : 55,726 : 0
2023-03-26 19:58:22,936 - INFO: Complete : 55,726 : 0
2023-03-26 19:58:22,936 - INFO: Complete : 55,726 : 0
