In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import re
import pandas as pd
import numpy as np
from typing import Dict, Any, Optional, List
import random

# Initialize model and tokenizer
# Both are loaded with `from_pretrained` using the identifier in `model_id`;
# the model is requested in float32 and mapped to the CPU.
model_id = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="cpu")

# Set pad token if not set
# When the tokenizer defines no pad token, the end-of-sequence token is reused
# for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def load_ndk_datasets(image_path: str = '../data/image-index-NDK.final.csv',
                      text_path: str = '../data/out-NDK-final.csv'):
    """Load the two NDK review CSV files and tag each row with its origin.

    Args:
        image_path: Path of the CSV holding reviews that have images.
            Defaults to the location used by the original notebook.
        text_path: Path of the CSV holding the text-only reviews.
            Defaults to the location used by the original notebook.

    Returns:
        A ``(image_reviews, text_reviews)`` tuple of DataFrames. Rows whose
        ``text`` value is missing are dropped, and a ``source`` column is
        added with the value ``'with_image'`` or ``'text_only'``.

    Raises:
        KeyError: If either file has no ``text`` column.
    """
    # Load both datasets
    image_reviews = pd.read_csv(image_path)
    text_reviews = pd.read_csv(text_path)

    # The reported counts are the raw row counts, before rows without text are
    # removed.
    print(f"Loaded {len(image_reviews)} reviews with images")
    print(f"Loaded {len(text_reviews)} text-only reviews")

    # Drop rows without text and add the origin column in one chained step.
    # `assign` returns a new frame, so no column is written onto the result of
    # `dropna` in place.
    image_reviews = image_reviews.dropna(subset=['text']).assign(source='with_image')
    text_reviews = text_reviews.dropna(subset=['text']).assign(source='text_only')

    return image_reviews, text_reviews

def create_classification_prompt(place_description: str, review_text: str) -> str:
    """
    Assemble the full text prompt sent to the model for one review.

    The prompt has three parts, separated by blank lines: the fixed classifier
    instructions (categories, criteria and the required JSON output format),
    a fixed set of worked examples, and a final request that embeds
    ``place_description`` and ``review_text``.
    """

    # Part 1: fixed instructions and the JSON schema the model must follow.
    instructions = """You are an expert review classifier. Your task is to classify reviews about places based on three important standards:

1. **Advertisement**: Reviews that promote unrelated products/services, contain promotional links, or are clearly marketing content
2. **Irrelevant**: Reviews about wrong locations, general complaints not specific to the establishment, or content unrelated to the place's business  
3. **Rant without visit**: Reviews showing no evidence of actual visit, generic complaints, or purely emotional outbursts without specific place details
4. **Legitimate**: Reviews that appear to be genuine experiences at the place

CLASSIFICATION CRITERIA:
- Advertisement: Contains promotional content, URLs, unrelated product mentions, marketing language
- Irrelevant: Wrong location, off-topic content, not about the place's services/products
- Rant without visit: No specific details, generic complaints, no evidence of actual experience
- Legitimate: Specific details about the place, genuine experience indicators, relevant content

OUTPUT FORMAT: Respond with ONLY valid JSON in this exact format:
{
  "classification": "advertisement|irrelevant|rant_without_visit|legitimate",
  "confidence": 0.0-1.0,
  "reasoning": "brief explanation",
  "indicators": ["list", "of", "key", "indicators", "found"]
}"""

    # Part 2: one worked example per category.
    examples = """
EXAMPLES:

Place: Restaurant | Review: "Cheap pizza! Click www.pizzapromotion.com for discount!"
{"classification": "advertisement", "confidence": 0.95, "reasoning": "Contains promotional URL and marketing language unrelated to restaurant", "indicators": ["promotional_url", "marketing_language", "unrelated_product"]}

Place: Hospital | Review: "The food was terrible and service was slow"  
{"classification": "irrelevant", "confidence": 0.85, "reasoning": "Discusses restaurant qualities at a hospital", "indicators": ["wrong_service_type", "mismatched_expectations"]}

Place: Hotel | Review: "This place sucks! Worst ever! Never going back!"
{"classification": "rant_without_visit", "confidence": 0.80, "reasoning": "Generic negative language without specific hotel experience details", "indicators": ["generic_complaints", "no_specific_details", "emotional_language"]}

Place: Restaurant | Review: "Had the salmon special yesterday. Great taste but portion was small for $25. Server was friendly."
{"classification": "legitimate", "confidence": 0.90, "reasoning": "Specific menu item, price, service details indicate genuine visit", "indicators": ["specific_menu_item", "price_mention", "service_details", "recent_visit"]}"""

    # Part 3: the review to classify, built line by line.
    request = (
        "\nNow classify this review:\n\n"
        f"Place Description: {place_description}\n"
        f"Review Text: {review_text}\n\n"
        "Respond with JSON only:"
    )

    return "\n\n".join((instructions, examples, request))

def extract_json_from_response(response: str) -> Optional[Dict[str, Any]]:
    """Return the first JSON object embedded in a model response.

    The response is scanned from left to right. At every ``{`` an attempt is
    made to decode one complete JSON value starting there, and the first one
    that decodes to a dict is returned. Text before or after the object
    (including further JSON objects or stray braces) is ignored, so a response
    such as ``'{"a": 1}\\nPlace: ... {"b": 2}'`` yields ``{"a": 1}``.

    Args:
        response: Raw text produced by the model.

    Returns:
        The decoded dict, or ``None`` when ``response`` is not a non-empty
        string or contains no decodable JSON object.
    """
    if not isinstance(response, str) or not response:
        return None

    decoder = json.JSONDecoder()
    start = response.find('{')
    while start != -1:
        try:
            # raw_decode parses exactly one JSON value beginning at `start`
            # and tolerates trailing text, unlike json.loads.
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This brace did not open a valid object; try the next one.
        start = response.find('{', start + 1)
    return None

def classify_review(place_description: str, review_text: str, max_retries: int = 3) -> Dict[str, Any]:
    """
    Classify a review with the language model, retrying on unusable output.

    The prompt is built once and tokenized once. Each attempt samples a fresh
    generation, extracts a JSON object from it and validates it. The first
    attempt that yields a valid verdict is returned.

    Args:
        place_description: Description of the place the review is about.
        review_text: The review text to classify.
        max_retries: Maximum number of generation attempts.

    Returns:
        On success, a dict with the keys ``classification`` (one of
        ``advertisement``, ``irrelevant``, ``rant_without_visit``,
        ``legitimate``), ``confidence`` (float clamped to [0, 1]),
        ``reasoning``, ``indicators``, ``raw_response`` and ``attempt``
        (1-based index of the successful attempt).

        When no attempt succeeds, a fallback dict with ``classification`` set
        to ``'legitimate'``, ``confidence`` 0.3 and ``error`` set to ``True``.
        ``raw_response`` then holds the most recent text the model produced
        (empty if none). The function does not raise for generation or
        parsing failures.
    """
    valid_classifications = ('advertisement', 'irrelevant', 'rant_without_visit', 'legitimate')
    required_keys = ('classification', 'confidence', 'reasoning')

    prompt = create_classification_prompt(place_description, review_text)

    inputs = None        # tokenized prompt, created lazily inside the guarded section
    last_response = ''   # most recent decoded model output, kept for diagnostics
    last_error = None    # exception raised by the most recent attempt, if any

    for attempt in range(max_retries):
        last_error = None
        try:
            # The prompt does not change between attempts, so tokenize it only once.
            if inputs is None:
                inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

            # Generate with sampling; retries can therefore produce different outputs.
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=200,  # Reduced for focused output
                    temperature=0.3,     # Lower for more deterministic classification
                    top_p=0.8,          # Focused sampling
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    repetition_penalty=1.1  # Prevent repetition
                )

            # Decode only the newly generated tokens (skip the echoed prompt).
            last_response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
            result = extract_json_from_response(last_response)
        except Exception as e:
            # Best-effort boundary: remember the failure and try again.
            last_error = e
            continue

        if not isinstance(result, dict) or not all(key in result for key in required_keys):
            continue

        # Accept labels regardless of case or surrounding whitespace.
        label = str(result['classification']).strip().lower()
        if label not in valid_classifications:
            continue

        try:
            confidence = float(result['confidence'])
        except (TypeError, ValueError):
            # A non-numeric confidence makes the verdict unusable; retry.
            continue

        return {
            'classification': label,
            'confidence': min(1.0, max(0.0, confidence)),
            'reasoning': result.get('reasoning', ''),
            'indicators': result.get('indicators', []),
            'raw_response': last_response,
            'attempt': attempt + 1
        }

    if last_error is not None:
        # The final attempt raised: report the error in the fallback result.
        return {
            'classification': 'legitimate',  # Conservative fallback
            'confidence': 0.3,
            'reasoning': f'Classification failed after {max_retries} attempts: {str(last_error)}',
            'indicators': ['classification_error'],
            'raw_response': last_response,
            'attempt': max_retries,
            'error': True
        }

    # Every attempt ran but none produced a valid verdict.
    return {
        'classification': 'legitimate',
        'confidence': 0.3,
        'reasoning': 'Classification failed - defaulting to legitimate',
        'indicators': ['fallback_classification'],
        'raw_response': last_response,
        'attempt': max_retries,
        'error': True
    }

def _place_description_from_row(row) -> str:
    """Turn a row's raw ``category`` value into a plain place description."""
    if 'category' not in row:
        return "Business"
    raw_category = row['category']
    # A missing value shows up as float NaN; fall back to the generic label
    # instead of sending the literal text "nan" to the model.
    if isinstance(raw_category, float) and np.isnan(raw_category):
        return "Business"
    return str(raw_category).strip("[]'").replace("'", "")


def test_with_real_ndk_data(sample_size: int = 20):
    """Classify a sample of the text-only NDK reviews and print a report.

    Args:
        sample_size: Maximum number of reviews to classify. When the dataset
            holds more rows, a fixed-seed random sample of this size is used;
            otherwise every row is used.

    Returns:
        A ``(results, classification_counts)`` tuple. ``results`` is a list of
        the dicts returned by ``classify_review``, each extended with
        ``original_rating``, ``place_type``, ``review_length`` and
        ``review_text``. ``classification_counts`` maps each category (plus
        ``'error'``) to the number of reviews assigned to it.
    """

    # Load the datasets; only the text-only reviews are classified here.
    _, text_reviews = load_ndk_datasets()

    # Sample diverse reviews for testing
    if len(text_reviews) > sample_size:
        test_sample = text_reviews.sample(n=sample_size, random_state=42)
    else:
        test_sample = text_reviews

    print(f"\n=== Testing LLM Classification on {len(test_sample)} Real NDK Reviews ===\n")

    results = []
    classification_counts = {'legitimate': 0, 'advertisement': 0, 'irrelevant': 0, 'rant_without_visit': 0, 'error': 0}

    for _, row in test_sample.iterrows():
        place_description = _place_description_from_row(row)
        review_text = str(row['text'])
        rating = row['rating'] if 'rating' in row else 'N/A'

        print(f"--- Review {len(results) + 1} ---")
        print(f"Place Type: {place_description}")
        print(f"Rating: {rating}")
        print(f"Text: {review_text[:100]}{'...' if len(review_text) > 100 else ''}")

        # Classify the review
        result = classify_review(place_description, review_text)

        # Add metadata
        result.update({
            'original_rating': rating,
            'place_type': place_description,
            'review_length': len(review_text),
            'review_text': review_text
        })
        results.append(result)

        # Failed classifications are counted separately from real verdicts.
        bucket = 'error' if result.get('error') else result['classification']
        classification_counts[bucket] = classification_counts.get(bucket, 0) + 1

        print(f"Classification: {result['classification']} (confidence: {result['confidence']:.2f})")
        print(f"Reasoning: {result['reasoning']}")
        print(f"Indicators: {result['indicators']}")
        print("-" * 80)

    # Summary statistics
    print("\n=== Classification Summary ===")
    total_tested = len(results)
    if total_tested == 0:
        # Nothing was classified; the percentages below would divide by zero.
        print("No reviews available to classify.")
        return results, classification_counts

    for classification, count in classification_counts.items():
        percentage = (count / total_tested) * 100
        print(f"{classification.replace('_', ' ').title()}: {count}/{total_tested} ({percentage:.1f}%)")

    # Analyze patterns (only results that are not error fallbacks)
    print("\n=== Pattern Analysis ===")
    successful = [r for r in results if not r.get('error', False)]
    if successful:
        avg_confidence = np.mean([r['confidence'] for r in successful])
        print(f"Average Confidence: {avg_confidence:.3f}")
    else:
        # Avoid taking the mean of an empty list when every classification failed.
        print("Average Confidence: N/A (no successful classifications)")

    # High confidence classifications
    high_confidence = [r for r in successful if r['confidence'] > 0.8]
    print(f"High Confidence (>0.8): {len(high_confidence)}/{total_tested} ({len(high_confidence)/total_tested*100:.1f}%)")

    # Low confidence classifications needing review
    low_confidence = [r for r in successful if r['confidence'] < 0.6]
    if low_confidence:
        print("\nLow Confidence Reviews (<0.6) - May need human review:")
        for r in low_confidence:
            text = r['review_text']
            # Only mark the text as truncated when it actually was.
            print(f"- {r['classification']} ({r['confidence']:.2f}): {text[:80]}{'...' if len(text) > 80 else ''}")

    return results, classification_counts

def analyze_text_linking():
    """Print sample texts from both datasets and count the review texts they share.

    Texts are compared after lower-casing and stripping surrounding
    whitespace. The number of shared texts is printed, followed by up to
    three of them.
    """

    image_reviews, text_reviews = load_ndk_datasets()

    print("=== Dataset Linking Analysis ===")

    # Show the first three texts of each dataset under its own heading.
    sample_sections = (
        ("Image reviews sample texts:", image_reviews),
        ("\nText-only reviews sample texts:", text_reviews),
    )
    for heading, frame in sample_sections:
        print(heading)
        for position, sample in enumerate(frame['text'].head(3), start=1):
            print(f"{position}. {sample}")

    # Normalise each text column into a set, then intersect the two sets.
    normalized_image = set(image_reviews['text'].str.lower().str.strip())
    normalized_text = set(text_reviews['text'].str.lower().str.strip())
    shared_texts = normalized_image & normalized_text

    print(f"\nText overlaps between datasets: {len(shared_texts)}")
    if not shared_texts:
        return

    # Show first 3 overlaps
    for shared in list(shared_texts)[:3]:
        print(f"- {shared}")

# Run the analysis and testing
# First print the dataset comparison, then classify a sample of 15 reviews.
# `results` receives the per-review result list and `counts` the per-category
# tallies returned by test_with_real_ndk_data.
analyze_text_linking()
results, counts = test_with_real_ndk_data(sample_size=15)

Loaded 21747 reviews with images
Loaded 287975 text-only reviews
=== Dataset Linking Analysis ===
Image reviews sample texts:
1. Highly recommend! These guys did a great job and were very courteous. The home warranty we have on our new home had us contact True Comfort for a furnace replacement and they did a fantastic job. The furnace arrived within a week and they were over the next day to replace our old one. Now we are ready for winter!
2. Love love will be back again pedicure was amazing
3. This gas station has the beat breakfast pizza of all time. Sausage and gravy, what.

Text-only reviews sample texts:
1. Fast quick service. I go here almost all the time for my oil changes. The staff odds polite and friendly.
2. Mike and crew do a great job.  Dropped off vehicle and picked up before close.  Great pricing also.  Highly recommend
3. Amazing love this place!

Text overlaps between datasets: 8078
- i really love my new lincoln mkc. at first, they didn't have the color i wanted so th