#### The cell below converts the raw BSON file produced by mongodump into a readable JSON file. This JSON file still uses the default MongoDB document format, with types such as datetime and ObjectId.

In [2]:
from bson.json_util import dumps
from bson import decode_file_iter
# Decode every BSON document of the mongodump output into an in-memory list.
with open("out/posts-main.bson", "rb") as bson_file:
  documents = [entry for entry in decode_file_iter(bson_file)]

# Serialise the list with bson.json_util.dumps and store it as indented text.
with open("./out/dump-raw.json", "w", encoding="utf-8") as json_file:
  json_file.write(dumps(documents, indent=2))

#### The cell below defines a function that converts each entry in the JSON data into plain JSON with primitive types

In [3]:
from bson import ObjectId
from datetime import datetime
def _to_primitive(value):
    """Return ``value`` with every ObjectId/datetime replaced by a string, recursing into dicts and lists."""
    if isinstance(value, ObjectId):
        return str(value)
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, dict):
        return {key: _to_primitive(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_to_primitive(item) for item in value]
    return value


def simplify(doc):
    """Build a copy of ``doc`` whose values are JSON-serialisable primitives.

    Conversions applied to each value:
      * ``ObjectId`` -> ``str(value)``
      * ``datetime`` -> ``value.isoformat()``
      * ``dict`` / ``list`` -> converted recursively, so ObjectId/datetime
        values nested in sub-documents or arrays are converted as well
        (left as-is they would make a later ``json.dump`` raise TypeError).
      * anything else is passed through unchanged.

    Args:
        doc: a mapping exposing ``.items()`` (one decoded document).

    Returns:
        A new ``dict``; ``doc`` itself is not modified.
    """
    return {k: _to_primitive(v) for k, v in doc.items()}

#### The cell below iterates through each entry of the previous JSON data, converts its MongoDB-specific data types into plain JSON primitives, and writes the result to a new file

In [4]:
from bson.json_util import loads
import json
# Parse the dump with bson.json_util.loads.
with open("./out/dump-raw.json", "r", encoding="utf-8") as raw_file:
  rawjson = loads(raw_file.read())

# Run every parsed entry through simplify() so only primitive values remain.
rawjson = list(map(simplify, rawjson))

# Write the simplified entries; ensure_ascii=False keeps non-ASCII text unescaped.
with open("./out/dump-formatted.json", "w", encoding="utf-8") as formatted_file:
  json.dump(rawjson, formatted_file, indent=2, ensure_ascii=False)
  

In [6]:
import re
import jaconv
from datetime import datetime
def cleantext(document):
  """Normalise ``document["content"]`` in place and return the same document.

  Steps, in order:
    1. ``jaconv.z2h`` with ``digit=True, ascii=True, kana=False``.
    2. Remove every token starting with ``http`` or ``www`` (up to the next
       whitespace).
    3. Replace the literal two-character sequences backslash+n and
       backslash+r with a space.
    4. Collapse whitespace runs to a single space and trim both ends.
    5. Lower-case the result.

  Raises:
    KeyError: if ``document`` has no ``"content"`` key.
  """
  normalized = jaconv.z2h(document["content"], kana=False, digit=True, ascii=True)

  # Strip URL-like tokens.
  normalized = re.sub(r"http\S+|www\S+", '', normalized)

  # '\\n' / '\\r' are a backslash followed by a letter (escaped text), not
  # real line breaks; real newlines are handled by the \s+ collapse below.
  for escaped in ('\\n', '\\r'):
    normalized = normalized.replace(escaped, ' ')

  # Squeeze any whitespace run into one space, trim, then lower-case.
  normalized = re.sub(r'\s+', ' ', normalized).strip().lower()

  document["content"] = normalized
  return document


def cleanStructure(document):
  """Reshape one cleaned record into the final output schema.

  Args:
    document: dict with the keys ``tweet_id``, ``time`` (ISO-8601 string),
      ``author``, ``content``, ``comment_count``, ``repost_count``,
      ``like_count`` and ``view_count``.

  Returns:
    A new dict with ``tweet_id``, ``time`` (integer epoch seconds),
    ``author`` (every ``@`` removed), ``content`` and a ``metadata`` dict
    holding the four counters under ``comment``/``repost``/``like``/``view``.

  Raises:
    KeyError: if any expected key is missing.
    ValueError: if ``time`` is not a valid ISO-8601 string.
  """
  from datetime import timezone  # local import keeps this block self-contained

  parsed_time = datetime.fromisoformat(document["time"])
  if parsed_time.tzinfo is None:
    # datetime.timestamp() reads a naive value as *local* time, which makes the
    # result depend on the machine's timezone. Treat offset-less strings as UTC
    # instead. NOTE(review): assumes the value originated from a BSON datetime
    # (stored in UTC) - confirm against the data source.
    parsed_time = parsed_time.replace(tzinfo=timezone.utc)

  newDocument = {
    "tweet_id": document["tweet_id"],
    "time": int(parsed_time.timestamp()),
    "author": document["author"].replace("@", ""),
    "content": document["content"],
    "metadata": {
      "comment": document["comment_count"],
      "repost": document["repost_count"],
      "like": document["like_count"],
      "view": document["view_count"]
    }
  }
  return newDocument

In [7]:
import json
# Load the primitive-typed records from dump-formatted.json.
# The encoding was previously misspelt "utf=8", which resolves to UTF-8 only
# through codec-name normalisation; use the canonical spelling.
with open("./out/dump-formatted.json", "r", encoding="utf-8") as formattedjson:
  documentArray = json.load(formattedjson)

# Normalise the text of every record first, then reshape it into the final schema.
documentArray = [cleantext(doc) for doc in documentArray]
documentArray = [cleanStructure(doc) for doc in documentArray]

# ensure_ascii=False keeps non-ASCII text unescaped in the output file.
with open("./out/cleaned-formatted.json", "w", encoding="utf-8") as writejson:
  json.dump(documentArray, writejson, indent=2, ensure_ascii=False)

In [8]:
import json
# Read the cleaned records back in.
with open("out/cleaned-formatted.json", "r", encoding="utf-8") as f:
  document = json.load(f)

# Write one JSON object per line (JSON Lines).
# Mode "w" instead of "a": the input always holds the full dataset, so appending
# would duplicate every record each time this cell is re-run.
# ensure_ascii=False matches the other JSON files written by this notebook.
with open("out/cleaned-formatted.jsonl", "w", encoding="utf-8") as f:
  for doc in document:
    f.write(json.dumps(doc, ensure_ascii=False) + "\n")