In [1]:
# Celda 1: Setup
import mlflow
from pathlib import Path
import sys
import json
import pandas as pd

# Resolve the project root once and reuse it for both the import path and the
# MLflow store. Assumes the notebook's working directory sits directly under
# the project root — TODO confirm.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# NOTE(review): the recorded output of this cell shows
# "ImportError: cannot import name 'cached_download' from 'huggingface_hub'".
# Presumably a dependency version mismatch in the environment — verify the
# installed package versions before relying on the cells below.
from src.property_analysis.text_analyzer import PropertyTextAnalyzer
from src.property_analysis.schemas import QueryRequirement

# MLflow: local file-based tracking store under <project_root>/mlruns.
# Path.as_uri() produces a well-formed, percent-encoded file URI on every
# platform, unlike manual "file://" string concatenation.
mlflow.set_tracking_uri((project_root / "mlruns").as_uri())
mlflow.set_experiment("flexible_text_analysis_v2")

print("‚úÖ Setup complete")

ImportError: cannot import name 'cached_download' from 'huggingface_hub' (/opt/anaconda3/envs/realestate-ai/lib/python3.10/site-packages/huggingface_hub/__init__.py)

In [None]:
# Cell 2: Load data
# The path is relative to the notebook's working directory. The file is read
# explicitly as UTF-8 so decoding does not depend on the platform's default
# locale encoding.
with open('../data/raw/fotocasa_20251123_154917.json', encoding='utf-8') as f:
    data = json.load(f)

# The listings are taken from the top-level "properties" key of the JSON document.
properties = data['properties']
print(f"üìä Loaded {len(properties)} properties")

In [None]:
# Cell 3: Full analysis (features + embeddings)
analyzer = PropertyTextAnalyzer()

print("üîç Analyzing properties with embeddings...\n")

with mlflow.start_run(run_name="v2.0_with_embeddings"):
    results = analyzer.analyze_batch(
        properties[:3],  # First 3 only, for testing
        generate_embeddings=True
    )

    # Log run parameters and metrics
    mlflow.log_param("analyzer_version", "v2.0")
    # NOTE(review): the model name is hard-coded here rather than read from the
    # analyzer — confirm it matches the model the analyzer actually loads.
    mlflow.log_param("embedding_model", "paraphrase-multilingual-MiniLM-L12-v2")

    analyzed_count = len(results)
    mlflow.log_metric("properties_analyzed", analyzed_count)

    # Guard the average against an empty batch so the run still records a
    # metric instead of aborting with ZeroDivisionError.
    total_features = sum(len(r.detected_features) for r in results)
    avg_features = total_features / analyzed_count if analyzed_count else 0.0
    mlflow.log_metric("avg_features_per_property", avg_features)

print("‚úÖ Analysis complete")

In [None]:
# Cell 4: Display the detected features
separator = "=" * 70

for position, result in enumerate(results, start=1):
    # Results are paired with the source listings by position
    # (1-based counter -> 0-based index into `properties`).
    prop = properties[position - 1]

    print(f"\n{separator}")
    print(f"üè† Property {position}: {prop['id']}")
    print(f"üìç {prop['location']} | {prop['price']:,}‚Ç¨")
    print(f"\nüìù Description (first 200 chars):")
    print(f"   {prop['description'][:200]}...")

    print(f"\n‚ú® Detected Features ({len(result.detected_features)} total):")

    # Highest-confidence features first
    ranked = list(result.detected_features)
    ranked.sort(key=lambda feat: feat.confidence, reverse=True)

    for feat in ranked[:10]:  # show at most the top 10
        # One bar character per 0.1 of confidence (truncated by int()).
        bar = "‚ñà" * int(feat.confidence * 10)
        if feat.value:
            value_str = f" = {feat.value}"
        else:
            value_str = ""
        print(f"   {feat.name:.<35} {feat.confidence:.2f} {bar}{value_str}")

    print(f"\nüìä Quality Score: {result.overall_quality_score:.2f}")

    # A falsy embedding is reported as 0 dimensions.
    if result.text_embedding:
        embedding_dims = len(result.text_embedding)
    else:
        embedding_dims = 0
    print(f"üî¢ Embedding dims: {embedding_dims}")

In [None]:
# Cell 5: Semantic matching test
query_text = "Local con cocina equipada, entrada independiente y terraza"

# One requirement per feature named in the query. Only the first is passed
# required=True; the other two rely on the constructor's default for `required`.
requirements = [
    QueryRequirement(feature_name="cocina_equipada", importance=1.0, required=True),
    QueryRequirement(feature_name="entrada_independiente", importance=0.9),
    QueryRequirement(feature_name="terraza", importance=0.7),
]

print(f"üîç Query: {query_text}\n")
print("Required features:")
for requirement in requirements:
    print(f"   - {requirement.feature_name} (importance: {requirement.importance})")

separator = "=" * 70
print(f"\n{separator}")
print("MATCHING RESULTS\n")

# Score every analyzed property against the query and print a short report.
for result in results:
    match = analyzer.match_against_query(result, query_text, requirements)

    print(f"Property: {match.property_id}")
    print(f"   Final Score: {match.final_score:.2f}")
    print(f"   Feature Match: {match.feature_match_score:.2f}")
    print(f"   Semantic Similarity: {match.semantic_similarity_score:.2f}")
    print(f"   Matched: {len(match.matched_features)} features")

    if match.missing_requirements:
        missing_text = ', '.join(match.missing_requirements)
    else:
        missing_text = 'none'
    print(f"   Missing: {missing_text}")

    if match.is_good_match():
        verdict = '‚úÖ YES'
    else:
        verdict = '‚ùå NO'
    print(f"   Good Match: {verdict}")
    print()