<a href="https://colab.research.google.com/github/Ravhihz/Sentinanalytica/blob/main/Crawling_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Mount Google Drive and prepare the workspace ===
from google.colab import drive

import atexit
import datetime as _dt
import json
import os
import pathlib
import signal
import threading

drive.mount('/content/drive')

# Main working folder on Drive; the process chdir()s into it so that the
# relative paths used afterwards resolve inside this folder.
ROOT = "/content/drive/MyDrive/MBG_Scrape"   # main location
os.makedirs(ROOT, exist_ok=True)
os.chdir(ROOT)
print("Working dir:", os.getcwd())

# === Provide the X (Twitter) auth token ===
# SECURITY: the token must never be hard-coded in the notebook, because the
# notebook (and its git history) is shared. Any token that was previously
# committed in this cell should be treated as compromised and revoked.
# The token is taken from the environment when already set; otherwise it is
# requested interactively without being echoed or stored in the notebook.
import os
from getpass import getpass

if not os.environ.get("TWITTER_AUTH_TOKEN"):
    os.environ["TWITTER_AUTH_TOKEN"] = getpass("TWITTER_AUTH_TOKEN: ").strip()

# Explicit raise instead of `assert`: asserts are removed when Python runs
# with -O, which would let an empty token slip through.
if not os.environ["TWITTER_AUTH_TOKEN"]:
    raise RuntimeError("Set dulu os.environ['TWITTER_AUTH_TOKEN']")


# === General state utilities (crash-resistant) ===
def atomic_write_json(path, data):
    """Serialize *data* as indented JSON and atomically replace *path* with it.

    The JSON is first written to a uniquely named temporary file in the same
    directory as *path*, flushed and fsync'ed, and only then moved over the
    destination with ``os.replace``. A reader therefore sees either the old
    file or the complete new one, never a partially written file.

    Args:
        path: Destination file (``str`` or ``os.PathLike``).
        data: Any JSON-serializable object.

    Raises:
        TypeError / ValueError: If *data* cannot be serialized. The existing
            destination file is left untouched and the temporary file is
            removed.
        OSError: If the temporary file cannot be created, written or moved.
    """
    import tempfile  # local import: not part of the file's top-level imports

    path = os.fspath(path)
    directory = os.path.dirname(path) or "."
    # A unique temp name (instead of a fixed "<path>.tmp") keeps two
    # concurrent writers from truncating or renaming each other's temp file.
    fd, tmp = tempfile.mkstemp(
        prefix=os.path.basename(path) + ".", suffix=".tmp", dir=directory
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Do not leave half-written temp files behind on any failure.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise

def make_state(STATEF):
    """Create a resumable progress state bound to the JSON file *STATEF*.

    Returns:
        A tuple ``(state, load, save)``:

        * ``state`` - dict with keys ``done_files`` (list), ``total_rows``
          (int) and ``last_run`` (ISO timestamp string or ``None``).
        * ``load()`` - refresh ``state`` from *STATEF* when the file exists,
          print a short status line, and return ``state``.
        * ``save()`` - stamp ``last_run`` with the current UTC time and
          write ``state`` to *STATEF* via ``atomic_write_json``.

    ``load()`` updates the dict in place, so the ``state`` object returned
    here, the one returned by ``load()`` and the one written by ``save()``
    are always the same object.
    """
    def _defaults():
        return {"done_files": [], "total_rows": 0, "last_run": None}

    state = _defaults()

    def load():
        if os.path.exists(STATEF):
            with open(STATEF, encoding="utf-8") as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError(f"{STATEF} does not contain a JSON object")
            # Mutate in place (no rebinding) so every holder of `state`
            # sees the loaded values; keys missing from the file keep
            # their defaults instead of raising KeyError later.
            state.clear()
            state.update(_defaults())
            state.update(loaded)
            print(f"▶️ resume: {STATEF} | windows={len(state['done_files'])} | rows={state['total_rows']}")
        else:
            print(f"▶️ start fresh (no {STATEF})")
        return state

    def save():
        # timezone.utc is the same object as datetime.UTC but also exists
        # on Python versions older than 3.11.
        state["last_run"] = _dt.datetime.now(_dt.timezone.utc).isoformat()
        atomic_write_json(STATEF, state)
        print(f"💾 checkpoint → {STATEF} | windows={len(state['done_files'])} | rows={state['total_rows']}")

    return state, load, save

def start_checkpoint_timer(save_fn, interval=300):
    """Call *save_fn* every *interval* seconds from a background daemon thread.

    The first call happens *interval* seconds after start; each following
    call happens *interval* seconds after the previous one finished.

    Args:
        save_fn: Zero-argument callable that performs the checkpoint.
        interval: Seconds to wait between calls.

    Returns:
        A ``threading.Event``. Calling ``.set()`` on it stops the periodic
        checkpoint (the thread exits at its next wake-up). Callers that
        ignore the return value get a timer that runs until the process
        exits.
    """
    stop = threading.Event()

    def loop():
        # Event.wait() returns False on timeout (time to checkpoint) and
        # True as soon as stop.set() is called (time to quit).
        while not stop.wait(interval):
            try:
                save_fn()
            except Exception as exc:
                # A failed checkpoint must not end the periodic saving.
                print(f"  [WARN] periodic checkpoint failed: {type(exc).__name__} → {exc}")

    worker = threading.Thread(target=loop, name="checkpoint-timer", daemon=True)
    worker.start()
    return stop


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Working dir: /content/drive/MyDrive/MBG_Scrape


In [None]:
import os, datetime as dt, subprocess, shlex, pandas as pd, glob, random, time, shutil, re, pathlib
from pandas.errors import EmptyDataError, ParserError

# ====== STATE ======
STATEF = "state_text.json"
state, load_state, save_state = make_state(STATEF)
state = load_state()

# Re-running this cell must not stack checkpoint timers: a timer started by
# an earlier execution still holds that run's state object and would keep
# overwriting STATEF with outdated numbers at every interval.
# 1) If the previous execution kept a stop handle (any object with .set()),
#    signal it.
_prev_checkpoint = globals().get("_checkpoint_handle")
if hasattr(_prev_checkpoint, "set"):
    _prev_checkpoint.set()
# 2) Cancel pending threading.Timer objects whose callback is the
#    checkpoint `tick` function; cancelling the pending timer breaks the
#    self-rescheduling chain.
for _t in threading.enumerate():
    if isinstance(_t, threading.Timer) and getattr(_t.function, "__name__", "") == "tick":
        _t.cancel()

# start_checkpoint_timer may return None; the handle is only used above when
# it exposes .set().
_checkpoint_handle = start_checkpoint_timer(save_state, 300)

# Drop the exit hook registered by a previous execution of this cell: it is
# bound to that run's save function and, because atexit runs hooks in
# last-registered-first order, it would run last and overwrite the fresh
# checkpoint.
_stale_exit_hook = globals().get("_on_exit")
if _stale_exit_hook is not None:
    atexit.unregister(_stale_exit_hook)


def _on_exit(*args):
    """Save a final checkpoint when the runtime exits or is signalled.

    Called with no arguments by ``atexit`` and with ``(signum, frame)`` when
    used as a signal handler. After the best-effort save, a signal is turned
    back into its usual effect (``KeyboardInterrupt`` for SIGINT,
    ``SystemExit`` for anything else) so that interrupting or terminating
    the run still stops it instead of being silently swallowed.
    """
    try:
        print("⚠️ runtime exiting (TEXT)…")
        save_state()
    except Exception as exc:
        # Best effort: a failed save must not mask the exit itself.
        print(f"  [WARN] checkpoint on exit failed: {type(exc).__name__} → {exc}")
    if args:
        signum = args[0]
        if signum == signal.SIGINT:
            raise KeyboardInterrupt
        raise SystemExit(128 + int(signum))


atexit.register(_on_exit)
signal.signal(signal.SIGTERM, _on_exit)
signal.signal(signal.SIGINT, _on_exit)

# ====== PARAMETERS ======
# Date range to crawl; the effective end is clamped so it never lies in the
# future.
START = dt.date(2025, 1, 6)
END   = dt.date(2025, 10, 6)              # alternatively: dt.date.today()
TODAY = dt.date.today()
SAFE_END = END if END <= TODAY else TODAY  # never go past today

WINDOW_DAYS       = 3
PER_WINDOW_LIMIT  = 160
TAB               = "TOP"
OUTDIR            = "text_chunks"
MERGED            = "mbg_text_merged.csv"
ARCH_EMPTY        = "empty_windows"

# Output folders: per-window CSV chunks and the archive for empty results.
for _folder in (OUTDIR, ARCH_EMPTY):
    os.makedirs(_folder, exist_ok=True)

BASE_QUERY = r'("makan bergizi gratis" OR "program makan bergizi" OR MBG OR "keracunan mbg") lang:id -is:retweet'

# Directories searched for the CSV produced by the harvesting command.
_CWD_TWEETS = os.path.join(os.getcwd(), "tweets-data")
POSSIBLE_DIRS = [
    "/content/tweets-data",
    _CWD_TWEETS,
    "/content/drive/MyDrive/MBG_Scrape/tweets-data",
]

# ====== HELPERS ======
def windows(start, end, step):
    """Split the date range *start*..*end* into consecutive windows.

    Each yielded pair ``(since, until)`` is half-open: ``since`` is included,
    ``until`` is excluded, and ``until`` of one window is ``since`` of the
    next. *end* is treated as inclusive, so the last ``until`` is at most
    ``end + 1 day``; it is additionally capped at tomorrow so no window
    reaches past the current day.

    Args:
        start: First day (``datetime.date``).
        end: Last day, inclusive (``datetime.date``).
        step: Window length in whole days; must be an integer >= 1.

    Returns:
        An iterator of ``(since, until)`` date pairs. It is empty when
        *start* lies after the (capped) end.

    Raises:
        ValueError: If *step* is not a positive integer. A zero, negative or
            fractional step would never advance the cursor and loop forever.
    """
    if not isinstance(step, int) or isinstance(step, bool) or step < 1:
        raise ValueError(f"step must be an integer >= 1, got {step!r}")

    # Today is read here rather than from a module-level constant so the
    # function is self-contained.
    hard_until = min(end, dt.date.today()) + dt.timedelta(days=1)
    stride = dt.timedelta(days=step)

    def _generate():
        d = start
        while d < hard_until:
            e = min(d + stride, hard_until)
            yield d, e
            d = e

    return _generate()

def _find_csv_after_run(base, retries=3, sleep_s=5):
    """Locate the CSV written for *base*, retrying while nothing is found.

    Every attempt looks for ``<base>.csv`` and ``<base>*.csv`` in each
    directory of ``POSSIBLE_DIRS`` and for ``<base>.csv`` relative to the
    current directory. When several files match, the most recently modified
    one is returned. When none match, the function sleeps *sleep_s* seconds
    and tries again, up to *retries* extra attempts (the retries are there
    to wait out Drive synchronisation).

    Returns:
        The path of the newest match, or ``None`` after the last attempt.
    """
    attempts_left = retries
    while True:
        found = []
        for folder in POSSIBLE_DIRS:
            found.extend(glob.glob(os.path.join(folder, f"{base}.csv")))
            found.extend(glob.glob(os.path.join(folder, f"{base}*.csv")))
        found.extend(glob.glob(f"{base}.csv"))

        if found:
            # Newest file wins; on equal mtimes the earliest match is kept.
            return max(found, key=os.path.getmtime)
        if attempts_left <= 0:
            return None

        time.sleep(sleep_s)
        attempts_left -= 1

def is_empty_csv(path: str) -> bool:
    """Return True when *path* holds no usable CSV data rows.

    A file counts as empty when it does not exist, is smaller than 32 bytes,
    cannot be read or parsed as CSV, or parses to a header with zero data
    rows. Only the first data row is read, so the check stays cheap for
    large files.

    Args:
        path: Path of the CSV file to inspect.

    Returns:
        True if the file is missing, tiny, unreadable or has no data rows;
        False if at least one data row could be parsed.
    """
    try:
        if (not os.path.exists(path)) or os.path.getsize(path) < 32:
            return True
        # A header-only file parses without error into an empty frame, so
        # the parsed result itself has to be checked.
        return bool(pd.read_csv(path, nrows=1).empty)
    except Exception:
        # Deliberately broad: anything that prevents reading the file
        # (EmptyDataError, ParserError, decoding or OS errors, ...) means
        # there is nothing usable in it.
        return True

def run_window(s, e, idx, base):
    """Harvest one date window into ``OUTDIR/<base>.csv``.

    Args:
        s: First day of the window (used for ``since:``).
        e: Exclusive end day of the window (used for ``until:``).
        idx: 1-based window number, used only in log lines.
        base: Output file name without extension.

    Returns:
        The output path when the window was already harvested (file exists
        or *base* is listed in ``state["done_files"]``) or when a non-empty
        CSV was produced; ``None`` when the command failed, no CSV could be
        located, or the CSV was empty (it is then moved to ``ARCH_EMPTY``).
    """
    out = f"{OUTDIR}/{base}.csv"
    if os.path.exists(out) or base in state.get("done_files", []):
        print(f"[T{idx:03d}] SKIP {out}")
        return out

    q = f'{BASE_QUERY} since:{s:%Y-%m-%d} until:{e:%Y-%m-%d}'
    # Argument list + no shell: the query and the token are passed verbatim,
    # so nothing in them can be interpreted by a shell.
    cmd = [
        "npx", "-y", "tweet-harvest@2.6.1",
        "-o", base,
        "-s", q,
        "--tab", TAB,
        "-l", str(PER_WINDOW_LIMIT),
        "--token", os.environ["TWITTER_AUTH_TOKEN"],
    ]
    print(f"\n[T{idx:03d}] {s}→{e}\n{q}")

    try:
        p = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError as exc:
        # Without a shell a missing executable raises instead of exiting 127.
        print(f"  [ERROR] cannot start npx: {exc}")
        return None

    if p.returncode != 0:
        # Show why the harvester failed instead of reporting a missing CSV.
        print(f"  [ERROR] tweet-harvest exit code {p.returncode}")
        for line in (p.stderr or "").strip().splitlines()[-5:]:
            print("    ", line)
        return None

    # Try to read the "saved to: ..." path from stdout/stderr; otherwise
    # fall back to searching the known output locations.
    m = re.search(r'saved to:\s+"?(.+?\.csv)"?', (p.stdout or "") + (p.stderr or ""), re.I)
    src = m.group(1) if m and os.path.exists(m.group(1)) else _find_csv_after_run(base)
    if src and os.path.exists(src):
        shutil.move(src, out)
        # Validate the content before accepting the file.
        if is_empty_csv(out):
            print("  [WARN] kosong →", out, " (arsipkan)")
            shutil.move(out, os.path.join(ARCH_EMPTY, os.path.basename(out)))
            return None
        print("  [OK] ", out)
        return out

    print("  [WARN] CSV belum ketemu. Cek kemungkinan lokasi:")
    for d in POSSIBLE_DIRS:
        print("   -", d)
    return None

# ====== MAIN LOOP ======
# Walk over every date window, harvest the ones not done yet, and record
# each newly completed window (name + row count) in the persistent state.
files = sorted(glob.glob(f"{OUTDIR}/text_*.csv"))
for i, (s, e) in enumerate(windows(START, SAFE_END, WINDOW_DAYS), 1):
    base = f"text_{s:%Y%m%d}_{(e - dt.timedelta(days=1)):%Y%m%d}"
    done_files = state.setdefault("done_files", [])
    # Same condition run_window uses to skip a window: nothing is fetched
    # for it, so there is no need for the politeness delay below.
    already_fetched = os.path.exists(f"{OUTDIR}/{base}.csv") or base in done_files

    out = run_window(s, e, i, base)
    if not already_fetched:
        time.sleep(random.randint(8, 15))
    if not (out and os.path.exists(out)):
        continue

    try:
        if is_empty_csv(out):
            print("  [SKIP] CSV kosong:", out)
            shutil.move(out, os.path.join(ARCH_EMPTY, os.path.basename(out)))
            continue
        if base in done_files:
            # Rows of this window were added to total_rows when it was first
            # completed; adding them again on every resume inflates the total.
            continue
        df = pd.read_csv(out)
        rows = len(df)
        # update state
        done_files.append(base)
        state["total_rows"] = int(state.get("total_rows", 0) + rows)
        save_state()
        if out not in files:
            files.append(out)
        print(f"  [+] {rows} rows | total={state['total_rows']}")
    except EmptyDataError:
        print(f"  [WARN] EmptyDataError reading file: {out}")
        shutil.move(out, os.path.join(ARCH_EMPTY, os.path.basename(out)))
    except Exception as err:  # not `e`: that name is the window end date
        print(f"  [ERROR] {out}: {type(err).__name__} → {err}")

# ====== SAFE MERGE ======
# Combine every non-empty chunk CSV into one file, dropping duplicate tweets
# by an id column when one exists, otherwise by the tweet text column.
parts = sorted(p for p in glob.glob(f"{OUTDIR}/text_*.csv") if not is_empty_csv(p))
if not parts:
    print("⚠️ Tidak ada CSV valid untuk digabung.")
else:
    dfs = []
    for pth in parts:
        try:
            df = pd.read_csv(pth)
        except Exception as e:
            # An unreadable chunk is reported and left out of the merge.
            print("[SKIP]", os.path.basename(pth), "→", type(e).__name__)
            continue
        if not df.empty:
            dfs.append(df)

    if not dfs:
        print("⚠️ Tidak ada DataFrame valid untuk merge.")
    else:
        big = pd.concat(dfs, ignore_index=True)
        # First id-like column present, in order of preference.
        id_col = next(
            filter(lambda name: name in big.columns,
                   ["id_str", "tweet_id", "id", "status_id"]),
            None,
        )
        if id_col:
            big = big.drop_duplicates(subset=[id_col])
        else:
            if "full_text" in big.columns:
                txt = "full_text"
            elif "text" in big.columns:
                txt = "text"
            else:
                txt = None
            if txt:
                big = big.drop_duplicates(subset=[txt])
        big.to_csv(MERGED, index=False, encoding="utf-8")
        print(f"✅ MERGED → {MERGED} | rows={len(big)}")


▶️ resume: state_text.json | windows=96 | rows=44879
[T001] SKIP text_chunks/text_20250106_20250108.csv
💾 checkpoint → state_text.json | windows=96 | rows=45018
  [+] 139 rows | total=45018
[T002] SKIP text_chunks/text_20250109_20250111.csv
💾 checkpoint → state_text.json | windows=96 | rows=45157
  [+] 139 rows | total=45157
[T003] SKIP text_chunks/text_20250112_20250114.csv
💾 checkpoint → state_text.json | windows=96 | rows=45310
  [+] 153 rows | total=45310
[T004] SKIP text_chunks/text_20250115_20250117.csv
💾 checkpoint → state_text.json | windows=96 | rows=44879
💾 checkpoint → state_text.json | windows=96 | rows=45389
  [+] 79 rows | total=45389
[T005] SKIP text_chunks/text_20250118_20250120.csv
💾 checkpoint → state_text.json | windows=96 | rows=45531
  [+] 142 rows | total=45531
[T006] SKIP text_chunks/text_20250121_20250123.csv
💾 checkpoint → state_text.json | windows=96 | rows=45681
  [+] 150 rows | total=45681
[T007] SKIP text_chunks/text_20250124_20250126.csv
💾 checkpoint → sta