In [None]:
import pandas as pd

# Load the full cleaned dataset; lines=True parses the file as JSON Lines
# (one JSON record per line).
df = pd.read_json("dataset/df_clean.json", lines=True)

# Emit the (rows, columns) shape followed by a preview of the first five rows.
print(df.shape, df.head(), sep="\n")


(37120, 37)
                               name  \
0                             Parsr   
1  Yet-Another-EfficientDet-Pytorch   
2                               nit   
3                           new.css   
4                      golang-notes   

                                         description  stargazerCount  \
0  Transforms PDF, Documents and Images into Enri...            5755   
1  The pytorch re-implement of the official effic...            5200   
2                                 Git of Web3 assets            4452   
3  A classless CSS framework to write modern webs...            3946   
4                     Go source code analysis(zh-cn)            3941   

   forkCount            createdAt            updatedAt             pushedAt  \
0        306  2019-08-05T12:43:53  2024-09-01T17:24:35  2023-12-03T13:27:21   
1       1268  2020-04-06T03:27:06  2024-08-22T08:04:26  2021-10-24T02:13:31   
2          8  2022-04-20T08:21:14  2024-08-08T16:18:20  2024-07-29T14:17:11   
3   

In [4]:
# STEP 1 — Robust JSON peek & sample load (single cell)

import os, json, itertools
import pandas as pd

# --- 0) Config ---
PATH = "dataset/df_clean.json"   # dataset file inspected by this cell
EXPECTED_ROWS = 37000            # anticipated full row count; only feeds the memory projection

# Stop immediately with a readable message if the dataset is missing.
assert os.path.exists(PATH), f"File not found: {PATH}"

# --- 1) File info ---
size_bytes = os.path.getsize(PATH)
size_mb = size_bytes / (1024**2)   # bytes -> MiB
print(f"[INFO] File size: {size_mb:.2f} MB")

# --- 2) Quick raw peek (first 5 lines) ---
# islice stops after five lines, so the rest of the file is never read here.
head_lines = []
with open(PATH, 'r', encoding='utf-8') as f:
    for raw_line in itertools.islice(f, 5):
        head_lines.append(raw_line.rstrip('\n'))

print("\n[PEEK] First lines (truncated to 160 chars):")
for line_no, text in enumerate(head_lines, start=1):
    # Mark lines that were cut so a truncated record is not mistaken for a whole one.
    ellipsis = '...' if len(text) > 160 else ''
    print(f"{line_no:02d}: {text[:160]}{ellipsis}")

# --- 3) Heuristic: JSON Lines vs single JSON document ---
# The first non-whitespace character alone cannot tell JSON Lines apart from a
# single (possibly pretty-printed) JSON object, because both start with '{'.
# The first non-blank line is therefore test-parsed as well: the file is
# treated as JSON Lines only if that line is a complete JSON object by itself.
first_non_blank_line = ""
with open(PATH, 'r', encoding='utf-8') as f:
    for raw_line in f:
        if raw_line.strip():
            first_non_blank_line = raw_line.strip()
            break

first_non_ws_char = first_non_blank_line[:1]   # '' when the file has no content
if not first_non_ws_char:
    # Fail with a clear message instead of an opaque JSON decode error later on.
    raise ValueError(f"File is empty or whitespace-only: {PATH}")

try:
    first_line_is_object = isinstance(json.loads(first_non_blank_line), dict)
except ValueError:   # json.JSONDecodeError is a subclass of ValueError
    first_line_is_object = False

is_jsonl = first_line_is_object                          # one complete object per line
is_array = (first_non_ws_char == '[')                    # a single JSON array
is_object = (first_non_ws_char == '{') and not is_jsonl  # one object spread over several lines

if is_jsonl:
    detected_format = 'JSON Lines'
elif is_array:
    detected_format = 'JSON array'
elif is_object:
    detected_format = 'JSON object'
else:
    detected_format = 'Unknown'
print(f"\n[DETECT] Format: {detected_format}")

# --- 4) Sample load (safe) ---
if is_jsonl:
    # Read only the first chunk for a fast schema check without loading the
    # whole file. The reader is used as a context manager so the underlying
    # file handle is closed as soon as the sample has been taken.
    with pd.read_json(PATH, lines=True, chunksize=20000) as chunk_iter:
        df_sample = next(chunk_iter)
else:
    # A single JSON document (array or object): it has to be parsed as a whole.
    with open(PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, list):
        df_sample = pd.json_normalize(data)
    elif isinstance(data, dict):
        # Use the first list-valued key as the record collection if there is
        # one; otherwise normalize the dict itself as a single record.
        list_keys = [k for k, v in data.items() if isinstance(v, list)]
        key = list_keys[0] if list_keys else None
        base = data if key is None else data[key]
        df_sample = pd.json_normalize(base)
    else:
        raise ValueError("Unexpected JSON structure: not a list or dict.")

print(f"\n[SAMPLE] shape={df_sample.shape}")
# Transposed so that wide records read top-to-bottom instead of being wrapped.
print("\n[SAMPLE HEAD — transposed for readability]\n", df_sample.head(3).T)

# --- 5) Memory projection for full dataset ---
# Extrapolates the sample's per-row memory footprint to the expected row count.
# deep=True asks pandas to inspect object columns rather than only count
# their pointers.
mem_sample = df_sample.memory_usage(deep=True).sum()
rows = len(df_sample)
if rows > 0:
    bpr = mem_sample / rows   # average bytes per row in the sample
    # Fall back to the sample size when EXPECTED_ROWS is unset (None or 0), and
    # report the very same number that was used for the estimate.
    target_rows = EXPECTED_ROWS or rows
    est_total_bytes = bpr * target_rows
    print(f"\n[MEM] ~{bpr/1024:.1f} KB per row (sample)")
    print(f"[MEM] Projected for {target_rows:,} rows: ~{est_total_bytes/(1024**2):.1f} MB")
else:
    # Nothing to extrapolate from; say so instead of silently printing nothing.
    print("\n[MEM] Sample is empty; skipping memory projection")

# --- 6) Persist columns for manual review (optional) ---
# Write the sample's column labels to a CSV file, one label per row and
# without the index column.
column_labels = df_sample.columns.to_series()
column_labels.to_csv("schema_columns.csv", index=False)
print("\n[OK] Saved column names to schema_columns.csv")


[INFO] File size: 34.51 MB

[PEEK] First lines (truncated to 160 chars):
01: {"name":"Parsr","description":"Transforms PDF, Documents and Images into Enriched Structured Data","stargazerCount":5755,"forkCount":306,"createdAt":"2019-08-05...
02: {"name":"Yet-Another-EfficientDet-Pytorch","description":"The pytorch re-implement of the official efficientdet with SOTA performance in real time and pretraine...
03: {"name":"nit","description":"Git of Web3 assets","stargazerCount":4452,"forkCount":8,"createdAt":"2022-04-20T08:21:14","updatedAt":"2024-08-08T16:18:20","pushed...
04: {"name":"new.css","description":"A classless CSS framework to write modern websites using only HTML.","stargazerCount":3946,"forkCount":127,"createdAt":"2020-05...
05: {"name":"golang-notes","description":"Go source code analysis(zh-cn)","stargazerCount":3941,"forkCount":667,"createdAt":"2018-04-04T05:21:51","updatedAt":"2024-...

[DETECT] Format: JSON Lines

[SAMPLE] shape=(20000, 37)

[SAMPLE HEAD — transposed for