Skip to content

Commit

Permalink
truncate body for post.jsonl if needed
Browse files (browse the repository at this point in the history)
  • Loading branch information
RoCry committed Jun 13, 2022
1 parent 5303c6e commit 5563cd8
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions tracker.py
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,7 @@ def _split_file(path: str, chunk_count: int):
while os.path.exists(".".join(comps[:-1]) + f".{offset}.json"):
offset += 1
for i, lines in enumerate(chunks(total_lines, chunk_size)):
- new_path = ".".join(comps[:-1]) + f".{i+offset}.json"
+ new_path = ".".join(comps[:-1]) + f".{i + offset}.json"
with open(new_path, "a") as f: # append to file
f.writelines(lines)
os.remove(path)
@@ -195,5 +195,16 @@ def append_to_file(self, key: int, path: str, dicts: list[dict]):
) as hf:
for d in dicts:
s = json.dumps(d, ensure_ascii=False)
- f.write(s + "\n")
hf.write(s + "\n")
+
+ # truncate body if needed
+ if "body" in d:
+ body = d["body"].split("\n")
+ if len(body) > 800:
+ logger.info(
+ f"Truncated body of id: {d['id']}, title: {d['title']} from {len(body)}"
+ )
+ d["body"] = "\n".join(body[:400])
+ s = json.dumps(d, ensure_ascii=False)
+
+ f.write(s + "\n")

0 comments on commit 5563cd8

Please sign in to comment.