# In [4]:
import os
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import re
import time
import math

# Load variables from a local .env file into the process environment so the
# API key lookup below can find them.
load_dotenv()

# Scraped posts to parse. Built with os.path.join so the path resolves on any
# OS; a hard-coded backslash separator only works on Windows.
INPUT_TXT_FILE = os.path.join('txt-Scraping', 'facebook_page_raws.txt')
# Destination CSV for the extracted contact records.
OUTPUT_CSV_FILE = 'facebook_contacts_pages.csv'
# Number of posts sent to the model in a single request.
BATCH_SIZE = 5

# OpenAI-compatible client pointed at the OpenTyphoon endpoint; the key is
# read from the OPENAI_API_KEY environment variable.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://api.opentyphoon.ai/v1"
)

def extract_contacts_from_batch(post_batch):
    """Ask the chat model to extract contact records from a batch of posts.

    Args:
        post_batch: Sequence of raw post texts. Each one is sent to the model
            under a numbered ``--- POST i ---`` header.

    Returns:
        A list of dicts taken from the JSON array found in the model's reply
        (the span from the first ``[`` to the last ``]``). An empty list is
        returned when the batch is empty, the reply has no text, no bracketed
        span is present, or the span is not valid JSON.

    Raises:
        Any exception raised by ``client.chat.completions.create`` is
        propagated unchanged.
    """
    if not post_batch:
        # Nothing to parse; avoid spending an API call on an empty prompt.
        return []

    system_prompt = """Extract contact info into JSON list with keys: name, contact_person, phone_number, line_id, email, location, summary, source_post_url. Output must be valid JSON list."""

    formatted_posts = "\n".join(f"--- POST {i} ---\n{post}\n" for i, post in enumerate(post_batch, 1))

    response = client.chat.completions.create(
        model="typhoon-v2.1-12b-instruct",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Parse these {len(post_batch)} posts for contacts:\n{formatted_posts}"}
        ],
        max_tokens=4096,
        temperature=0.1
    )

    # The reply text may be absent (no choices, or content set to None);
    # re.search would raise TypeError on None, so bail out early.
    choices = response.choices
    reply = choices[0].message.content if choices else None
    if not reply:
        return []

    # Greedy match from the first '[' to the last ']' so surrounding prose or
    # code fences around the JSON array are ignored.
    json_match = re.search(r'\[.*\]', reply, re.DOTALL)
    if not json_match:
        return []

    try:
        parsed = json.loads(json_match.group())
    except json.JSONDecodeError as err:
        # A truncated or malformed reply should cost one batch, not the run.
        print(f"Skipping batch: model reply was not valid JSON ({err})")
        return []

    # Keep only object entries; stray strings/numbers are not contact records.
    return [record for record in parsed if isinstance(record, dict)]

def main():
    """Read scraped posts, extract contacts batch by batch, and write a CSV.

    Reads ``INPUT_TXT_FILE``, splits it on the
    ``=============== POST #<n> ===============`` separator lines, sends the
    non-blank chunks to ``extract_contacts_from_batch`` in groups of
    ``BATCH_SIZE``, and writes the collected records to ``OUTPUT_CSV_FILE``.

    The CSV always has the same eight columns in a fixed order; a field that
    a record does not provide is left empty. Nothing is written when the
    input file is missing, contains no posts, or yields no records.

    Returns:
        None.
    """
    if not os.path.exists(INPUT_TXT_FILE):
        print(f"Input file not found: {INPUT_TXT_FILE}")
        return

    with open(INPUT_TXT_FILE, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    posts = [chunk for chunk in re.split(r'=============== POST #\d+ ===============\n', raw_text) if chunk.strip()]

    if not posts:
        print(f"No posts found in {INPUT_TXT_FILE}")
        return

    structured_data = []

    for offset in range(0, len(posts), BATCH_SIZE):
        batch = posts[offset:offset + BATCH_SIZE]
        try:
            parsed = extract_contacts_from_batch(batch)
        except Exception as err:
            # Script-level boundary: one failed request or unparsable reply
            # must not discard the records already collected from earlier
            # batches, so report it and move on.
            print(f"Batch starting at post {offset + 1} failed: {err}")
            continue
        if not parsed:
            continue
        # Only dict records can be mapped onto CSV columns.
        structured_data.extend(record for record in parsed if isinstance(record, dict))

    if not structured_data:
        print("No contacts extracted; nothing written.")
        return

    cols = ["name", "contact_person", "phone_number", "line_id", "email", "location", "summary", "source_post_url"]
    # reindex fixes the column set and order regardless of which keys the
    # first record happens to contain; missing fields become empty cells and
    # unexpected keys are dropped.
    frame = pd.DataFrame(structured_data).reindex(columns=cols)
    # utf-8-sig writes a BOM at the start of the file.
    frame.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8-sig')

if __name__ == "__main__":
    # Script entry point: run the pipeline and report how long it took.
    # perf_counter is a monotonic clock meant for measuring intervals, so the
    # reported duration is not skewed by system clock adjustments.
    start = time.perf_counter()
    main()
    print(f"Execution time: {time.perf_counter()-start:.2f}s")

# Output: Execution time: 329.93s
