#DOWNLOAD DEPENDENCIES


In [1]:
# CELL 1: Install Dependencies
!pip install -q transformers torch accelerate bitsandbytes
!pip install -q networkx matplotlib pyvis
!pip install -q sentencepiece protobuf
!pip install -q compressed-tensors

/Users/mohak/.zshenv:1: bad assignment

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
/Users/mohak/.zshenv:1: bad assignment

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
/Users/mohak/.zshenv:1: bad assignment

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
/Users/mohak/.zshenv:1: bad assignment

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m

In [2]:
# CELL 2: Validate Imports
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

print("✓ All imports successful")
print(f"✓ CUDA available: {torch.cuda.is_available()}")
print(f"✓ Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

  from .autonotebook import tqdm as notebook_tqdm


✓ All imports successful
✓ CUDA available: False
✓ Device: CPU


In [7]:
root = ''

# BIOMISTRAL SETUP

In [None]:
# CELL: Matching Diagnostic Test Format
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

torch.cuda.empty_cache()

# Model setup - SWITCHED TO BIOMISTRAL
model_name = "biomistral/BioMistral-7B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"✓ Model loaded: {model_name}")

# Load data
with open('MAIN_DATA.json', 'r') as f:
    papers_data = json.load(f)

def prepare_papers(papers_data):
    papers = []
    for paper_id, paper_info in papers_data.items():
        title = paper_info.get('name', 'Untitled')
        chunks = paper_info.get('chunks', [])
        full_text = ' '.join(chunks)
        papers.append((paper_id, title, full_text))
    return papers

papers_list = prepare_papers(papers_data)
print(f"✓ Loaded {len(papers_list)} papers")

# FIXED inference - no system role, just user message
def extract_relationships_batch(papers_list, max_papers=None):
    if max_papers:
        papers_list = papers_list[:max_papers]
    
    results = []
    
    for idx, (paper_id, title, paper_text) in enumerate(papers_list):
        print(f"\n{'='*60}")
        print(f"Processing {idx+1}/{len(papers_list)} - Paper ID: {paper_id}")
        print(f"{'='*60}")
        
        try:
            # Truncate paper
            truncated_text = paper_text[:6000]
            
            # Build complete user content (system instructions + task)
            user_content = f"{EXTRACTION_TASK}{truncated_text}\n\nJSON output:"
            
            # FIXED: Only user role, no system role
            messages = [
                {
                    "role": "user",
                    "content": user_content
                }
            ]
            
            # Apply chat template
            inputs = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(model.device)
            
            input_len = inputs["input_ids"].shape[-1]
            print(f"Input tokens: {input_len}")
            
            # Generate
            with torch.inference_mode():
                generation = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False
                )
            
            # Extract only generated tokens
            generation = generation[0][input_len:]
            decoded = tokenizer.decode(generation, skip_special_tokens=True)
            
            result = {
                'paper_id': paper_id,
                'paper_title': title,
                'raw_output': decoded,
                'success': True
            }
            results.append(result)
            
            print(f"✓ Generated {len(decoded)} chars")
            print(f"Preview: {decoded[:300]}...")
            
        except Exception as e:
            print(f"❌ ERROR: {str(e)}")
            results.append({
                'paper_id': paper_id,
                'raw_output': None,
                'error': str(e),
                'success': False
            })
    
    return results

# Test
print("\n" + "="*60)
print("TESTING WITH 3 PAPERS")
print("="*60)
raw_results = extract_relationships_batch(papers_list, max_papers=3)

# Save results to file
output_file = 'extraction_data_biomistral.json'
with open(output_file, 'w') as f:
    json.dump(raw_results, f, indent=2)

print(f"\n✓ Results saved to {output_file}")

# Show results
print("\n" + "="*60)
print("RESULTS:")
print("="*60)
for i, result in enumerate(raw_results):
    print(f"\n--- Paper {i+1}: {result['paper_id']} ---")
    print(f"Success: {result['success']}")
    if result['success']:
        print(f"Output length: {len(result['raw_output'])}")
        print(f"Output:\n{result['raw_output'][:500]}...")

✓ Model loaded: biomistral/BioMistral-7B


FileNotFoundError: [Errno 2] No such file or directory: 'MAIN_DATA.json'

===========================================================================================================
#KIMI K2 TURBO VIA LAVA ROUTER
===========================================================================================================

In [None]:
import requests
import json
import base64
from google.colab import userdata
import time

# =============================================================================
# CONFIGURATION
# =============================================================================
LAVA_API_KEY = userdata.get('LAVA_API_KEY')
CONNECTION_SECRET = userdata.get('CONNECTION_SECRET')
PRODUCT_SECRET = userdata.get('PRODUCT_SECRET')

LAVA_FORWARD_URL = "https://api.lavapayments.com/v1/forward"
PROVIDER_API_URL = "https://api.moonshot.ai/v1/chat/completions"
MODEL_NAME = "kimi-k2-turbo-preview"

INPUT_FILE = root + "MAIN_DATA.json"  # Assumes file is in current directory
OUTPUT_FILE = "extraction_results.json"


# =============================================================================
# IMPLEMENTATION
# =============================================================================

def call_llm(user_message, max_tokens=2048, temperature=0.3):
    lava_url = f"{LAVA_FORWARD_URL}/{PROVIDER_API_URL}"

    lava_token = {"secret_key": LAVA_API_KEY}
    if CONNECTION_SECRET:
        lava_token["connection_secret"] = CONNECTION_SECRET
    if PRODUCT_SECRET:
        lava_token["product_secret"] = PRODUCT_SECRET

    encoded_token = base64.b64encode(
        json.dumps(lava_token).encode('utf-8')
    ).decode('utf-8')

    headers = {
        'Authorization': f'Bearer {encoded_token}',
        'Content-Type': 'application/json'
    }

    payload = {
        "model": MODEL_NAME,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": user_message}]
    }

    response = requests.post(lava_url, headers=headers, json=payload)
    response.raise_for_status()
    return response.json()

def extract_relationships(paper_text, paper_id):
    prompt = EXTRACTION_TASK + paper_text

    try:
        response = call_llm(prompt, max_tokens=4096, temperature=0.3)
        result = response['choices'][0]['message']['content']
        return {
            "paper_id": paper_id,
            "success": True,
            "extraction": result,
            "tokens_used": response.get('usage', {})
        }
    except Exception as e:
        return {
            "paper_id": paper_id,
            "success": False,
            "error": str(e)
        }

# =============================================================================
# PROCESS FILES
# =============================================================================

print("Loading data...")
with open(INPUT_FILE, 'r') as f:
    data = json.load(f)

# Debug: Check data structure
print(f"Data type: {type(data)}")
if isinstance(data, dict):
    print(f"Keys: {list(data.keys())[:5]}")
    # Convert dict to list of papers
    papers_list = list(data.values())
elif isinstance(data, list):
    papers_list = data
else:
    raise ValueError(f"Unexpected data type: {type(data)}")

print(f"Total papers available: {len(papers_list)}")

# Process first 3 papers as test
papers_to_process = papers_list[:3]
results = []

print(f"\nProcessing {len(papers_to_process)} papers...\n")

for i, paper in enumerate(papers_to_process, 1):
    print(f"Processing paper {i}/{len(papers_to_process)}...")

    # Get paper text (adjust based on your data structure)
    if 'chunks' in paper:
        paper_text = ' '.join(paper['chunks'])
    elif 'text' in paper:
        paper_text = paper['text']
    else:
        paper_text = str(paper)

    paper_id = paper.get('id', f'paper_{i}')

    # Extract relationships
    result = extract_relationships(paper_text, paper_id)
    results.append(result)

    print(f"  Status: {'Success' if result['success'] else 'Failed'}")
    if result['success']:
        print(f"  Tokens: {result['tokens_used']}")
    print()

    # Rate limiting
    if i < len(papers_to_process):
        time.sleep(1)

# Save results
print(f"Saving results to {OUTPUT_FILE}...")
with open(OUTPUT_FILE, 'w') as f:
    json.dump(results, indent=2, fp=f)

print("\n" + "="*50)
print("SUMMARY")
print("="*50)
print(f"Total papers processed: {len(results)}")
print(f"Successful: {sum(1 for r in results if r['success'])}")
print(f"Failed: {sum(1 for r in results if not r['success'])}")
print(f"Results saved to: {OUTPUT_FILE}")

# Display first result
if results and results[0]['success']:
    print("\n" + "="*50)
    print("SAMPLE EXTRACTION (Paper 1)")
    print("="*50)
    print(results[0]['extraction'])

Loading data...
Data type: <class 'dict'>
Keys: ['1', '2', '3', '4', '5']
Total papers available: 2026

Processing 3 papers...

Processing paper 1/3...
  Status: Success
  Tokens: {'prompt_tokens': 14367, 'completion_tokens': 213, 'total_tokens': 14580, 'cached_tokens': 14367}

Processing paper 2/3...
  Status: Success
  Tokens: {'prompt_tokens': 3495, 'completion_tokens': 41, 'total_tokens': 3536, 'cached_tokens': 3495}

Processing paper 3/3...
  Status: Success
  Tokens: {'prompt_tokens': 351, 'completion_tokens': 78, 'total_tokens': 429, 'cached_tokens': 351}

Saving results to extraction_results.json...

SUMMARY
Total papers processed: 3
Successful: 3
Failed: 0
Results saved to: extraction_results.json

SAMPLE EXTRACTION (Paper 1)
{"disease": "oral squamous cell carcinoma", "bacteria_relationships": [{"bacteria": "Fusobacterium nucleatum", "direction": "increased"}, {"bacteria": "Prevotella intermedia", "direction": "increased"}, {"bacteria": "Aggregatibacter segnis", "direction": 