In [1]:
import numpy as np
data = np.load('env_info.npz')
data.files

['e']

In [2]:
e = data['e']
e.shape

(49, 168, 17)

In [3]:
x = e[0,:,:]


In [4]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import json
import time
import os

class POITextGenerator:
    def __init__(self):
        self.poi_categories = [
            'Medical care', 'Hotel', 'Business affairs', 'Life service', 
            'Transportation hub', 'Culture', 'Sports', 'Residence', 
            'Entertainment and leisure', 'Scenic spot', 'Government', 
            'Factory', 'Shopping', 'Restaurant', 'Education', 'Landmark', 'Other'
        ]
        
        self.poi_descriptions = {
            'Medical care': ['hospital', 'clinic', 'pharmacy', 'medical center', 'healthcare facility'],
            'Hotel': ['hotel', 'accommodation', 'lodge', 'inn', 'resort'],
            'Business affairs': ['office building', 'business center', 'corporate area', 'commercial district'],
            'Life service': ['service center', 'utility office', 'community service', 'public service'],
            'Transportation hub': ['station', 'terminal', 'transport hub', 'transit center', 'airport'],
            'Culture': ['museum', 'gallery', 'cultural center', 'art venue', 'library'],
            'Sports': ['gym', 'sports center', 'stadium', 'fitness facility', 'athletic venue'],
            'Residence': ['residential area', 'housing complex', 'apartment building', 'neighborhood'],
            'Entertainment and leisure': ['entertainment venue', 'leisure center', 'recreation area', 'amusement'],
            'Scenic spot': ['tourist attraction', 'scenic area', 'landmark', 'viewpoint', 'park'],
            'Government': ['government building', 'public office', 'administrative center', 'city hall'],
            'Factory': ['industrial area', 'manufacturing plant', 'factory', 'production facility'],
            'Shopping': ['shopping mall', 'retail store', 'market', 'shopping center', 'commercial area'],
            'Restaurant': ['restaurant', 'dining area', 'food court', 'cafe', 'eatery'],
            'Education': ['school', 'university', 'educational institution', 'campus', 'learning center'],
            'Landmark': ['landmark', 'monument', 'historic site', 'notable building', 'famous location'],
            'Other': ['mixed area', 'general facility', 'unspecified location', 'other venue']
        }

    def generate_text_description(self, poi_vector, sampling_point=None, user_id=None):
        """为单个采样点的POI向量生成文本描述"""
        top_indices = np.argsort(poi_vector)[-5:][::-1]
        top_values = poi_vector[top_indices]
        
        significant_pois = [(idx, val) for idx, val in zip(top_indices, top_values) if val > 0.1]
        
        if not significant_pois:
            return "This location has minimal POI activity."
        
        descriptions = []
        for poi_idx, intensity in significant_pois[:3]:
            category = self.poi_categories[poi_idx]
            desc_words = self.poi_descriptions[category]
            
            if intensity > 0.8:
                prefix = "heavily concentrated with"
            elif intensity > 0.5:
                prefix = "moderately populated with"
            else:
                prefix = "has some"
            
            desc = f"{prefix} {desc_words[0]} facilities"
            descriptions.append(desc)
        
        if len(descriptions) == 1:
            text = f"This area {descriptions[0]}."
        elif len(descriptions) == 2:
            text = f"This area {descriptions[0]} and {descriptions[1]}."
        else:
            text = f"This area {', '.join(descriptions[:-1])}, and {descriptions[-1]}."
            
        return text

    def generate_all_descriptions(self, data):
        """为所有用户的所有采样点生成文本描述"""
        descriptions = []
        total_points = data.shape[0] * data.shape[1]
        
        # 创建总进度条
        with tqdm(total=total_points, desc="🔤 Generating text descriptions", 
                  unit="points", colour="green") as pbar:
            
            for user_id in range(data.shape[0]):
                user_descriptions = []
                
                # 用户级别的进度信息
                pbar.set_postfix({
                    'User': f"{user_id + 1}/{data.shape[0]}",
                    'Current': f"Processing user {user_id + 1}"
                })
                
                for sampling_point in range(data.shape[1]):
                    poi_vector = data[user_id, sampling_point, :]
                    text = self.generate_text_description(poi_vector, sampling_point, user_id)
                    user_descriptions.append(text)
                    
                    # 更新进度条
                    pbar.update(1)
                    
                    # 每50个点显示一次详细信息
                    if (sampling_point + 1) % 50 == 0:
                        pbar.set_postfix({
                            'User': f"{user_id + 1}/{data.shape[0]}",
                            'Point': f"{sampling_point + 1}/{data.shape[1]}"
                        })
                
                descriptions.append(user_descriptions)
                
        return descriptions

class POITextEncoder(nn.Module):
    def __init__(self, model_name='bert-base-uncased', hidden_dim=256, output_dim=128):
        super(POITextEncoder, self).__init__()
        
        print("🤖 Initializing BERT model...")
        with tqdm(total=2, desc="Loading model components", colour="blue") as pbar:
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
            pbar.update(1)
            pbar.set_postfix({'Component': 'Tokenizer loaded'})
            
            self.bert = BertModel.from_pretrained(model_name)
            pbar.update(1)
            pbar.set_postfix({'Component': 'BERT model loaded'})
        
        self.projection = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, output_dim)
        )
        
        # 冻结BERT参数
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def forward(self, texts):
        """编码文本列表为embeddings"""
        if isinstance(texts, str):
            texts = [texts]
        
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        
        with torch.no_grad():
            outputs = self.bert(**encoded)
            pooled_output = outputs.pooler_output
        
        embeddings = self.projection(pooled_output)
        
        return embeddings
    
    def encode_batch(self, text_batch, batch_size=32):
        """批量编码文本"""
        all_embeddings = []
        
        # 创建批处理进度条
        total_batches = (len(text_batch) + batch_size - 1) // batch_size
        
        with tqdm(total=total_batches, desc="🧠 Encoding text batches", 
                  unit="batch", colour="yellow") as pbar:
            
            for i in range(0, len(text_batch), batch_size):
                batch_texts = text_batch[i:i+batch_size]
                
                # 显示当前批次信息
                pbar.set_postfix({
                    'Batch size': len(batch_texts),
                    'Progress': f"{i + len(batch_texts)}/{len(text_batch)} texts"
                })
                
                embeddings = self.forward(batch_texts)
                all_embeddings.append(embeddings.detach())
                
                pbar.update(1)
                
                # 模拟一些处理时间（实际使用时可以删除）
                time.sleep(0.01)
        
        return torch.cat(all_embeddings, dim=0)

def analyze_poi_distribution(poi_data):
    """分析POI数据分布"""
    print("\n" + "="*50)
    print("📊 POI Data Analysis")
    print("="*50)
    
    poi_categories = [
        'Medical care', 'Hotel', 'Business affairs', 'Life service', 
        'Transportation hub', 'Culture', 'Sports', 'Residence', 
        'Entertainment and leisure', 'Scenic spot', 'Government', 
        'Factory', 'Shopping', 'Restaurant', 'Education', 'Landmark', 'Other'
    ]
    
    print(f"Shape: {poi_data.shape}")
    print(f"Mean: {poi_data.mean():.4f}")
    print(f"Std: {poi_data.std():.4f}")
    print(f"Min: {poi_data.min():.4f}")
    print(f"Max: {poi_data.max():.4f}")
    
    print("\nPer-category statistics:")
    with tqdm(poi_categories, desc="Analyzing categories", colour="cyan") as pbar:
        for i, category in enumerate(pbar):
            category_data = poi_data[:, :, i]
            pbar.set_postfix({
                'Category': category[:20] + "..." if len(category) > 20 else category
            })
            print(f"  {category}: mean={category_data.mean():.4f}, std={category_data.std():.4f}")
            time.sleep(0.1)  # 显示效果，实际使用时可以删除

def main():
    """主函数"""
    print("🚀 Starting POI Text Generation and Encoding Pipeline")
    print("="*60)
    
    # 总体进度跟踪
    total_steps = 6
    overall_progress = tqdm(total=total_steps, desc="📋 Overall Progress", 
                          colour="magenta", position=0)
    
    # 步骤1: 加载数据
    overall_progress.set_description("📁 Loading NPZ file...")
    try:
        data = np.load('env_info.npz')  # 替换为你的文件路径
        poi_data = data['e']
        print(f"✅ Data loaded successfully! Shape: {poi_data.shape}")
    except FileNotFoundError:
        # 如果文件不存在，创建示例数据
        print("⚠️  File not found, creating sample data...")
        poi_data = np.random.rand(49, 168, 17)
        print(f"✅ Sample data created! Shape: {poi_data.shape}")
    
    overall_progress.update(1)
    
    # 步骤2: 分析数据分布（可选）
    overall_progress.set_description("📊 Analyzing data distribution...")
    analyze_poi_distribution(poi_data)
    overall_progress.update(1)
    
    # 步骤3: 创建文本生成器
    overall_progress.set_description("🔧 Initializing text generator...")
    text_generator = POITextGenerator()
    print("✅ Text generator initialized!")
    overall_progress.update(1)
    
    # 步骤4: 生成文本描述
    overall_progress.set_description("🔤 Generating text descriptions...")
    all_descriptions = text_generator.generate_all_descriptions(poi_data)
    print(f"✅ Generated descriptions for {len(all_descriptions)} users!")
    overall_progress.update(1)
    
    # 显示样本
    print("\n" + "="*50)
    print("📝 Sample Descriptions")
    print("="*50)
    for i in range(min(2, len(all_descriptions))):
        print(f"\n👤 User {i + 1} (first 3 points):")
        for j in range(min(3, len(all_descriptions[i]))):
            print(f"  📍 Point {j + 1}: {all_descriptions[i][j]}")
    
    # 步骤5: 初始化编码器
    overall_progress.set_description("🤖 Initializing text encoder...")
    text_encoder = POITextEncoder(output_dim=128)
    overall_progress.update(1)
    
    # 步骤6: 编码文本
    overall_progress.set_description("🧠 Encoding texts...")
    
    # 准备数据
    flat_descriptions = []
    for user_desc in all_descriptions:
        flat_descriptions.extend(user_desc)
    
    print(f"\n📊 Total descriptions to encode: {len(flat_descriptions)}")
    
    # 批量编码
    with torch.no_grad():
        embeddings = text_encoder.encode_batch(flat_descriptions, batch_size=16)
    
    # 重新reshape
    embeddings = embeddings.view(poi_data.shape[0], poi_data.shape[1], -1)
    print(f"✅ Final embeddings shape: {embeddings.shape}")
    overall_progress.update(1)
    
    # 保存结果
    print("\n" + "="*50)
    print("💾 Saving Results")
    print("="*50)
    
    save_tasks = ["NPZ file", "JSON file"]
    with tqdm(save_tasks, desc="Saving files", colour="green") as save_pbar:
        # 保存NPZ
        save_pbar.set_postfix({'File': 'poi_text_embeddings.npz'})
        np.savez('poi_text_embeddings.npz', 
                 descriptions=np.array(all_descriptions, dtype=object),
                 embeddings=embeddings.numpy(),
                 categories=np.array(text_generator.poi_categories))
        save_pbar.update(1)
        
        # 保存JSON
        save_pbar.set_postfix({'File': 'poi_descriptions.json'})
        with open('poi_descriptions.json', 'w', encoding='utf-8') as f:
            json.dump({
                'descriptions': all_descriptions,
                'categories': text_generator.poi_categories,
                'metadata': {
                    'total_users': poi_data.shape[0],
                    'total_sampling_points': poi_data.shape[1],
                    'poi_dimensions': poi_data.shape[2],
                    'embedding_dimension': embeddings.shape[2]
                }
            }, f, indent=2, ensure_ascii=False)
        save_pbar.update(1)
    
    overall_progress.close()
    
    print("\n🎉 Pipeline completed successfully!")
    print("="*60)
    print("📁 Generated files:")
    print("  • poi_text_embeddings.npz - Contains embeddings and metadata")
    print("  • poi_descriptions.json - Contains text descriptions")
    print(f"📊 Summary:")
    print(f"  • Users processed: {poi_data.shape[0]}")
    print(f"  • Sampling points per user: {poi_data.shape[1]}")
    print(f"  • Total descriptions generated: {len(flat_descriptions)}")
    print(f"  • Embedding dimension: {embeddings.shape[2]}")
    
    return text_generator, text_encoder, all_descriptions, embeddings

if __name__ == "__main__":
    # 设置随机种子以保证可重现性
    np.random.seed(42)
    torch.manual_seed(42)
    
    # 运行主程序
    try:
        text_generator, text_encoder, descriptions, embeddings = main()
        print("\n✅ All tasks completed successfully!")
    except KeyboardInterrupt:
        print("\n⚠️  Process interrupted by user")
    except Exception as e:
        print(f"\n❌ Error occurred: {str(e)}")
        raise


🚀 Starting POI Text Generation and Encoding Pipeline


📊 Analyzing data distribution...:  17%|[35m█▋        [0m| 1/6 [00:00<00:00, 300.39it/s]

✅ Data loaded successfully! Shape: (49, 168, 17)

📊 POI Data Analysis
Shape: (49, 168, 17)
Mean: 0.0000
Std: 0.0000
Min: 0.0000
Max: 0.0000

Per-category statistics:




  Medical care: mean=0.0000, std=0.0000
  Hotel: mean=0.0000, std=0.0000




  Business affairs: mean=0.0000, std=0.0000
  Life service: mean=0.0000, std=0.0000




  Transportation hub: mean=0.0000, std=0.0000
  Culture: mean=0.0000, std=0.0000




  Sports: mean=0.0000, std=0.0000
  Residence: mean=0.0000, std=0.0000





  Entertainment and leisure: mean=0.0000, std=0.0000
  Scenic spot: mean=0.0000, std=0.0000


Analyzing categories:  59%|[36m█████▉    [0m| 10/17 [00:01<00:00,  9.75it/s, Category=Scenic spot][A

  Government: mean=0.0000, std=0.0000
  Factory: mean=0.0000, std=0.0000





  Shopping: mean=0.0000, std=0.0000
  Restaurant: mean=0.0000, std=0.0000


Analyzing categories:  82%|[36m████████▏ [0m| 14/17 [00:01<00:00,  9.73it/s, Category=Restaurant][A

  Education: mean=0.0000, std=0.0000
  Landmark: mean=0.0000, std=0.0000


Analyzing categories: 100%|[36m██████████[0m| 17/17 [00:01<00:00,  9.72it/s, Category=Other]
🔤 Generating text descriptions...:  50%|[35m█████     [0m| 3/6 [00:01<00:02,  1.14it/s]

  Other: mean=0.0000, std=0.0000
✅ Text generator initialized!


🔤 Generating text descriptions: 100%|[32m██████████[0m| 8232/8232 [00:00<00:00, 27862.16points/s, User=49/49, Point=150/168]
🤖 Initializing text encoder...:  67%|[35m██████▋   [0m| 4/6 [00:02<00:00,  2.23it/s]   

✅ Generated descriptions for 49 users!

📝 Sample Descriptions

👤 User 1 (first 3 points):
  📍 Point 1: This location has minimal POI activity.
  📍 Point 2: This location has minimal POI activity.
  📍 Point 3: This location has minimal POI activity.

👤 User 2 (first 3 points):
  📍 Point 1: This location has minimal POI activity.
  📍 Point 2: This location has minimal POI activity.
  📍 Point 3: This location has minimal POI activity.
🤖 Initializing BERT model...




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading model components: 100%|[34m██████████[0m| 2/2 [01:36<00:00, 48.16s/it, Component=BERT model loaded]
🧠 Encoding texts...:  83%|[35m████████▎ [0m| 5/6 [01:38<00:28, 28.81s/it]           


📊 Total descriptions to encode: 8232


🧠 Encoding text batches: 100%|[33m██████████[0m| 515/515 [00:35<00:00, 14.47batch/s, Batch size=8, Progress=8232/8232 texts]
🧠 Encoding texts...: 100%|[35m██████████[0m| 6/6 [02:13<00:00, 30.83s/it]

✅ Final embeddings shape: torch.Size([49, 168, 128])

💾 Saving Results


Saving files: 100%|[32m██████████[0m| 2/2 [00:00<00:00, 119.24it/s, File=poi_descriptions.json]
🧠 Encoding texts...: 100%|[35m██████████[0m| 6/6 [02:13<00:00, 22.33s/it]


🎉 Pipeline completed successfully!
📁 Generated files:
  • poi_text_embeddings.npz - Contains embeddings and metadata
  • poi_descriptions.json - Contains text descriptions
📊 Summary:
  • Users processed: 49
  • Sampling points per user: 168
  • Total descriptions generated: 8232
  • Embedding dimension: 128

✅ All tasks completed successfully!



