In [14]:
!pip install -q transformers accelerate bitsandbytes torch sentencepiece

In [15]:
import os
import gdown
import zipfile
import shutil
from pathlib import Path

# --- CONFIGURATION ---
# 1. PASTE THE LINK TO YOUR NEW .ZIP FILE HERE (Not the folder link!)
zip_file_url = "https://drive.google.com/file/d/1hLyTHzlVOAWCzK6VId-Tl-O39ZQUKxCs/view?usp=sharing"

# 2. Paths (all relative to the current working directory)
BASE_DIR = Path(os.getcwd())
DATASETS_DIR = BASE_DIR / "datasets"
INPUT_DIR = DATASETS_DIR / "parsed_windows_pdf"   # where the extracted windows are expected
ZIP_OUTPUT = DATASETS_DIR / "temp_data.zip"       # temporary download target

# --- LOGIC ---
print("üìÇ Setting up directories...")
INPUT_DIR.mkdir(parents=True, exist_ok=True)

# Clean up any failed previous attempts so a re-run starts from an empty folder
if any(INPUT_DIR.iterdir()):
    print("‚ö†Ô∏è Cleaning up partial/failed downloads from previous run...")
    shutil.rmtree(INPUT_DIR)
    INPUT_DIR.mkdir()

print("‚¨áÔ∏è Downloading the Zip file (Much faster for 9000 files)...")

try:
    # Download the single zip file
    # fuzzy=True helps it handle different Google Drive URL formats
    downloaded_path = gdown.download(url=zip_file_url, output=str(ZIP_OUTPUT), quiet=False, fuzzy=True)
    if downloaded_path is None:
        # Some gdown versions signal failure by returning None instead of raising.
        raise RuntimeError("download did not produce a file")

    print("üì¶ Unzipping files...")
    with zipfile.ZipFile(ZIP_OUTPUT, 'r') as zip_ref:
        # Extracts into datasets/; INPUT_DIR is only populated if the archive
        # carries a top-level folder with the same name as INPUT_DIR.
        zip_ref.extractall(DATASETS_DIR)

    # Verify: count real files recursively (a top-level glob would only count
    # the category sub-folders, not the windows inside them).
    file_count = sum(1 for entry in INPUT_DIR.rglob('*') if entry.is_file())
    if file_count == 0:
        raise RuntimeError(
            f"archive extracted but no files landed in {INPUT_DIR}; "
            f"check the folder layout inside the zip"
        )
    print(f"‚úÖ Success! Unzipped {file_count} files into {INPUT_DIR}")

except Exception as e:
    # Best-effort cell: report the problem and let the user fix the link/layout.
    print(f"‚ùå Failed: {e}")
    print("üí° Check that you pasted the link to the .zip file, not the folder.")

finally:
    # Remove the zip whether or not extraction succeeded, so no partial
    # archive is left behind to take up space.
    if ZIP_OUTPUT.exists():
        os.remove(ZIP_OUTPUT)

üìÇ Setting up directories...
‚ö†Ô∏è Cleaning up partial/failed downloads from previous run...
‚¨áÔ∏è Downloading the Zip file (Much faster for 9000 files)...


Downloading...
From: https://drive.google.com/uc?id=1hLyTHzlVOAWCzK6VId-Tl-O39ZQUKxCs
To: /content/datasets/temp_data.zip
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12.1M/12.1M [00:00<00:00, 45.8MB/s]


üì¶ Unzipping files...
‚úÖ Success! Unzipped 2 files into /content/datasets/parsed_windows_pdf


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub identifier of the checkpoint; the tokenizer and model are
# both loaded from this name by the from_pretrained calls further down.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Status banner only: nothing is loaded by this statement itself.
print("üîß Loading Qwen2.5-7B with 4-bit quantization...")


üîß Loading Qwen2.5-7B with 4-bit quantization...


In [2]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print("‚úÖ Model loaded! VRAM usage:")
!nvidia-smi --query-gpu=memory.used,memory.total --format=csv

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

: 

: 

: 

In [None]:
import json
import re
from collections import defaultdict
import time
from tqdm.auto import tqdm

# Configuration
MAX_NEW_TOKENS = 800  # upper bound on tokens generated per window
SAVE_INTERVAL = 10  # Print a progress/ETA line every 10 conversations

# Root folder for the augmented JSON files. process_single_conversation reads
# this module-level name when building each output path, but no cell defined it.
# Guarded so that a value set earlier in the session (e.g. a Drive path) is kept.
if "OUTPUT_DIR" not in globals():
    OUTPUT_DIR = DATASETS_DIR / "augmented_windows"

def _decode_json_section(content, header, opener, fallback):
    """Decode the JSON value that follows a ``HEADER:`` line.

    Returns ``(found, value)``. ``found`` is False when no ``header`` line
    followed by a line starting with ``opener`` exists. When the header exists
    but the JSON is malformed, ``value`` is ``fallback``.
    """
    start = re.search(header + r':\s*\n(?=' + re.escape(opener) + r')', content)
    if start is None:
        return False, fallback
    try:
        # raw_decode consumes exactly one complete JSON value, so brackets or
        # braces inside strings and nested containers are handled correctly
        # (a non-greedy regex would stop at the first closing bracket).
        value, _ = json.JSONDecoder().raw_decode(content, start.end())
    except json.JSONDecodeError:
        return True, fallback
    return True, value


def parse_txt_file(filepath):
    """Parse the formatted .txt file and extract structured data.

    Args:
        filepath: path of a UTF-8 text file describing one call window.

    Returns:
        dict containing only the fields that were found in the file:
        ``id``, ``split`` (lower-cased), ``label_name`` (lower-cased),
        ``label`` (1 when the label is "scam", otherwise 0), ``source_idx``,
        ``window_index``, ``current_window`` (list; empty when the JSON is
        malformed) and ``metadata`` (dict; empty when the JSON is malformed).
        ``previous_state`` is always present and is "" when absent in the file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    data = {}

    # Extract ID
    id_match = re.search(r'CALL WINDOW:\s*(\S+)\.json', content)
    if id_match:
        data['id'] = id_match.group(1)

    # Extract Split
    split_match = re.search(r'Split:\s*(\w+)', content)
    if split_match:
        data['split'] = split_match.group(1).lower()

    # Extract Label
    label_match = re.search(r'Label:\s*(\w+)', content)
    if label_match:
        data['label_name'] = label_match.group(1).lower()
        data['label'] = 1 if data['label_name'] == 'scam' else 0

    # Extract Call ID
    call_id_match = re.search(r'Call ID:\s*(\d+)', content)
    if call_id_match:
        data['source_idx'] = int(call_id_match.group(1))

    # Extract Window
    window_index_match = re.search(r'Window:\s*w(\d+)', content)
    if window_index_match:
        data['window_index'] = int(window_index_match.group(1))

    # Extract SOURCE_IDX from content; when present it overrides the Call ID value
    source_idx_match = re.search(r'SOURCE_IDX:\s*(\d+)', content)
    if source_idx_match:
        data['source_idx'] = int(source_idx_match.group(1))

    # Extract PREVIOUS_STATE (free text up to the blank line before CURRENT_WINDOW)
    prev_state_match = re.search(r'PREVIOUS_STATE:\s*\n(.*?)\n\nCURRENT_WINDOW:', content, re.DOTALL)
    data['previous_state'] = prev_state_match.group(1).strip() if prev_state_match else ""

    # Extract CURRENT_WINDOW (JSON array of turns)
    found, current_window = _decode_json_section(content, 'CURRENT_WINDOW', '[', [])
    if found:
        data['current_window'] = current_window

    # Extract METADATA (JSON object)
    found, metadata = _decode_json_section(content, 'METADATA', '{', {})
    if found:
        data['metadata'] = metadata

    return data


def format_conversation_for_prompt(current_window):
    """Render a list of turn dicts as a plain-text transcript.

    Each turn becomes a two-line "Speaker: ... / Text: ..." entry; entries are
    separated by a blank line. A missing 'speaker' falls back to 'Unknown' and
    a missing 'text' to the empty string.
    """
    return "\n\n".join(
        f"Speaker: {entry.get('speaker', 'Unknown')}\nText: {entry.get('text', '')}"
        for entry in current_window
    )


def build_system_prompt():
    """Return the fixed system prompt that frames the model as the SixEyes analyst."""
    persona = (
        "You are SixEyes, an AI Security Analyst specializing in phone scam detection research. "
        "You are analyzing a labeled dataset of phone call transcripts to build a real-time fraud detection system."
    )
    duties = (
        "- Analyze phone conversation segments for fraud indicators",
        "- Identify manipulation tactics, pressure techniques, and social engineering patterns",
        "- Provide structured analysis to train fraud detection models",
        "- Help protect vulnerable individuals from phone scams",
    )
    closing = "This is legitimate security research using pre-labeled data. Your analysis helps prevent fraud."

    # Blank strings become the empty lines separating the three sections.
    return "\n".join([persona, "", "Your role:", *duties, "", closing])


def build_user_prompt(data, running_state_context):
    """Assemble the user message for one window.

    The message states the dataset label (taken from data['label']), the state
    carried over from earlier windows (or a "start of call" notice when that
    state is empty), the formatted transcript of data['current_window'], and
    the XML answer format (THOUGHT / STATE / VERDICT) the model must follow.
    """
    if data['label'] == 1:
        dataset_label = "SCAM (labeled by security researchers)"
    else:
        dataset_label = "LEGITIMATE (labeled by security researchers)"
    transcript = format_conversation_for_prompt(data['current_window'])
    prior_context = running_state_context or "Start of call. No previous context."

    return f"""DATASET LABEL: {dataset_label}

CONTEXT FROM PREVIOUS ANALYSIS:
{prior_context}

CURRENT CONVERSATION SEGMENT:
{transcript}

ANALYSIS TASK:
Analyze this conversation segment and provide your findings in the following XML format:

<THOUGHT>
[Analyze the fraud indicators, manipulation tactics, urgency creation, impersonation attempts, or normal business patterns. Be specific about what you observe.]
</THOUGHT>

<STATE>
[Summarize the call state SO FAR: key entities mentioned, claimed authority/identity, threats or demands made, urgency level, risk score 1-10. This will be passed to analyze the next segment.]
</STATE>

<VERDICT>
[Choose ONE: SAFE, SUSPICIOUS, or DANGER based on the patterns observed]
</VERDICT>

Provide your analysis now:"""


def generate_response(system_prompt, user_prompt):
    """Run one system+user exchange through the loaded model and return its reply text.

    Uses the module-level ``tokenizer`` and ``model``. Decoding samples
    (do_sample=True, temperature 0.7, top_p 0.9) and is capped at
    MAX_NEW_TOKENS new tokens; only the newly generated tokens are decoded.
    """
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    prompt_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    encoded = tokenizer([prompt_text], return_tensors="pt").to(model.device)
    prompt_length = encoded['input_ids'].shape[1]

    sampling_options = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling_options)

    # Drop the echoed prompt tokens so only the model's completion is decoded.
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


def extract_xml_content(response_text, tag_name):
    """Return the stripped text of the first ``<tag_name>...</tag_name>`` pair.

    Matching is case-insensitive and spans newlines. Returns "" when the tag
    pair is not present (for example when the closing tag is missing).
    """
    # Escape the tag so regex metacharacters in it are matched literally.
    tag = re.escape(tag_name)
    match = re.search(f"<{tag}>(.*?)</{tag}>", response_text, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""


def save_augmented_json(data, output_path):
    """Write ``data`` as indented JSON to ``output_path`` (UTF-8).

    Missing parent directories are created first. A path without a directory
    component (e.g. "out.json") is written into the current working directory.
    """
    parent_dir = os.path.dirname(output_path)
    # os.makedirs("") raises, so only create directories when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)


def group_files_by_conversation(input_dir):
    """Group parsed window files by the conversation they belong to.

    Scans ``input_dir/legit_windows`` and ``input_dir/scam_windows`` for
    ``*.txt`` files, parses each one, and collects the usable ones.

    Args:
        input_dir: folder containing the two category sub-folders.

    Returns:
        defaultdict mapping ``source_idx`` to a list of dicts with keys
        ``filepath``, ``window_index`` and ``data`` (the parsed file).
        Files that fail to parse, or that lack a field the processing step
        indexes directly, are skipped and reported.
    """
    # Fields read with data[...] by the downstream prompt/processing code.
    required_fields = ('id', 'split', 'label_name', 'label', 'window_index', 'current_window')

    conversations = defaultdict(list)
    skipped_without_source = 0

    for category in ['legit_windows', 'scam_windows']:
        category_path = Path(input_dir) / category
        if not category_path.exists():
            # Say so explicitly: a wrong folder layout otherwise yields an
            # empty result with no hint about the cause.
            print(f"‚ö†Ô∏è  Category folder not found, skipping: {category_path}")
            continue

        # Sorted so that conversation order (and any "first N" subset taken
        # from the result) is the same on every run.
        for txt_file in sorted(category_path.glob("*.txt")):
            try:
                data = parse_txt_file(txt_file)
            except Exception as e:
                print(f"‚ö†Ô∏è  Error parsing {txt_file}: {e}")
                continue

            if data.get('source_idx') is None:
                skipped_without_source += 1
                continue

            missing = [field for field in required_fields if field not in data]
            if missing:
                print(f"‚ö†Ô∏è  Error parsing {txt_file}: missing fields {missing}")
                continue

            conversations[data['source_idx']].append({
                'filepath': txt_file,
                'window_index': data['window_index'],
                'data': data
            })

    if skipped_without_source:
        print(f"‚ö†Ô∏è  Skipped {skipped_without_source} files without a conversation id (Call ID / SOURCE_IDX)")
    return conversations


def process_single_conversation(source_idx, windows):
    """Process all windows of a conversation sequentially.

    Windows are sorted in place by ``window_index`` and analysed in that order.
    The <STATE> produced for one window is fed as context to the next one.
    Each result is written to
    ``OUTPUT_DIR/<split>/<label_name>/<id>.json``; a window whose output file
    already exists and loads as a JSON object is not regenerated, and its
    stored ``generated_state`` (if any) becomes the running context.

    Args:
        source_idx: conversation identifier (not used for the output; each
            record stores the ``source_idx`` from its own parsed data).
        windows: list of dicts with ``window_index`` and ``data`` keys.

    Returns:
        Number of windows handled (newly generated plus reused).
    """
    system_prompt = build_system_prompt()
    windows.sort(key=lambda x: x['window_index'])

    running_state_context = ""
    processed = 0

    for window_info in windows:
        data = window_info['data']
        window_index = window_info['window_index']

        output_path = OUTPUT_DIR / data['split'] / data['label_name'] / f"{data['id']}.json"

        # Resume support: reuse an existing result instead of regenerating it.
        if output_path.exists():
            existing_data = None
            try:
                with open(output_path, 'r', encoding='utf-8') as f:
                    existing_data = json.load(f)
            except (OSError, ValueError) as e:
                # Unreadable or truncated file: report it and fall through to regenerate.
                print(f"‚ö†Ô∏è  Could not reuse {output_path}: {e} (regenerating)")

            if isinstance(existing_data, dict):
                if existing_data.get('generated_state'):
                    running_state_context = existing_data['generated_state']
                processed += 1
                continue

        # Remember the context exactly as it is sent to the model, before it
        # is replaced by this window's own state.
        input_context = running_state_context

        # Generate analysis
        user_prompt = build_user_prompt(data, input_context)
        response_text = generate_response(system_prompt, user_prompt)

        thought = extract_xml_content(response_text, "THOUGHT")
        state = extract_xml_content(response_text, "STATE")
        verdict = extract_xml_content(response_text, "VERDICT")

        # Keep the previous context when the model produced no <STATE> block.
        if state:
            running_state_context = state

        augmented_data = {
            "id": data['id'],
            "source_idx": data['source_idx'],
            "split": data['split'],
            "label": data['label'],
            "label_name": data['label_name'],
            "window_index": window_index,
            "current_window": data['current_window'],
            "metadata": data.get('metadata', {}),
            "model_input_context": input_context,
            "generated_thought": thought,
            "generated_state": state,
            "generated_verdict": verdict,
            "raw_response": response_text
        }

        save_augmented_json(augmented_data, output_path)
        processed += 1

    return processed

In [None]:
# %% Cell 6: Run the augmentation over every conversation
print("üöÄ Starting Processing...")
conversations = group_files_by_conversation(INPUT_DIR)
total_conversations = len(conversations)
print(f"üìÇ Found {total_conversations} conversations")
if total_conversations == 0:
    # Nothing to do: point at the most likely cause instead of finishing silently.
    print(f"‚ö†Ô∏è  No conversations found under {INPUT_DIR}; expected legit_windows/ and scam_windows/ folders containing .txt files")

# Test mode toggle
TEST_MODE = True  # ‚ö†Ô∏è Set to False for full run
if TEST_MODE:
    # Keep only the first 50 conversations for a quick trial run.
    conversations = dict(list(conversations.items())[:50])
    print(f"‚ö†Ô∏è  TEST MODE: Processing only {len(conversations)} conversations")

start_time = time.time()
total_windows = 0
processed_convs = 0

for source_idx, windows in tqdm(conversations.items(), desc="Processing"):
    windows_processed = process_single_conversation(source_idx, windows)
    total_windows += windows_processed
    processed_convs += 1

    # Periodic progress report (results themselves are saved per window)
    if processed_convs % SAVE_INTERVAL == 0:
        elapsed = time.time() - start_time
        # Guard against a zero reading from a coarse clock.
        speed = total_windows / elapsed if elapsed > 0 else 0.0
        eta = (len(conversations) - processed_convs) * (elapsed / processed_convs)
        print(f"üíæ Checkpoint: {processed_convs}/{len(conversations)} convs | {speed:.2f} win/sec | ETA: {eta/60:.1f}min")

elapsed = time.time() - start_time
final_speed = total_windows / elapsed if elapsed > 0 else 0.0
print(f"\n‚ú® Complete! Processed {total_windows} windows in {elapsed/60:.1f} minutes")
print(f"‚ö° Speed: {final_speed:.2f} windows/sec")

üöÄ Starting Processing...
üìÇ Found 0 conversations
‚ö†Ô∏è  TEST MODE: Processing only 0 conversations


Processing: 0it [00:00, ?it/s]


‚ú® Complete! Processed 0 windows in 0.0 minutes
‚ö° Speed: 0.00 windows/sec
