In [None]:
# Mount Google Drive at /content/drive; the preprocessing cell further down reads the
# GloVe file, the relation list and the trained weights from /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.20.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [None]:
%%writefile preprocess.py
import os
import torch
import numpy as np
import json
import argparse
from tqdm import tqdm

def parse_args():
    """Parse the command line of the deployment preprocessing script.

    Returns:
        argparse.Namespace with ``embedding_path``, ``relation_path``,
        ``model_path`` and ``output_dir`` (all required strings) and
        ``word_dim`` (int, defaults to 100).
    """
    cli = argparse.ArgumentParser(description='准备模型部署所需文件')

    # The four path options share one shape: required string flags.
    required_paths = (
        ('--embedding_path', 'GloVe词嵌入文件路径'),
        ('--relation_path', '关系映射文件路径'),
        ('--model_path', '训练好的模型权重文件路径'),
        ('--output_dir', '输出目录'),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    cli.add_argument('--word_dim', type=int, default=100, help='词嵌入维度')
    return cli.parse_args()

def prepare_embeddings(embedding_filepath, embedding_dimension, output_dir):
    """Read a GloVe-style text file and save the vocabulary and embedding matrix.

    Six special tokens (PAD, UNK and the four entity markers) occupy ids 0-5. Every
    accepted word from the file gets the next free id, so row ``i`` of the saved
    matrix is always the vector of the token whose id is ``i``.

    Args:
        embedding_filepath: Path to a UTF-8 text file with one ``word v1 ... vN``
            entry per line.
        embedding_dimension: Expected number of vector components; lines with a
            different number of whitespace-separated fields are skipped.
        output_dir: Existing directory that receives ``word_embeddings.pt`` and
            ``word_to_id.json``.

    Returns:
        Tuple ``(word_to_id, word_embeddings)``: the token-to-id dict and a float32
        tensor with ``len(word_to_id)`` rows and ``embedding_dimension`` columns.

    Raises:
        ValueError: If no line of the file yields a usable vector, or a vector
            component cannot be parsed as a float.
    """
    print(f"正在处理词嵌入: {embedding_filepath}")

    word_to_id = {'PAD': 0, 'UNK': 1, '<e1>': 2, '<e2>': 3, '</e1>': 4, '</e2>': 5}
    special_token_count = len(word_to_id)
    word_vectors = []

    # Read the GloVe vectors
    with open(embedding_filepath, 'r', encoding='UTF-8') as file:
        for line in tqdm(file, desc="读取词嵌入"):
            elements = line.strip().split()
            if len(elements) != embedding_dimension + 1:
                continue
            word = elements[0]
            # A repeated word (or one spelled like a special token) must not be
            # re-registered: overwriting its id while still appending its vector
            # would shift every later id away from its row in the matrix.
            if word in word_to_id:
                continue
            vector = np.array(elements[1:], dtype=np.float32)
            word_to_id[word] = len(word_to_id)
            word_vectors.append(vector)

    if not word_vectors:
        raise ValueError(
            f"未能从 {embedding_filepath} 读取到任何维度为 {embedding_dimension} 的词向量"
        )

    # Special-token rows are drawn from a normal distribution matching the
    # pretrained vectors' overall mean/std; the PAD row is forced to zero.
    word_vectors = np.stack(word_vectors)
    vector_mean, vector_std = word_vectors.mean(), word_vectors.std()
    special_vectors = np.random.normal(
        vector_mean, vector_std, (special_token_count, embedding_dimension)
    )
    special_vectors[0] = 0
    word_vectors = np.concatenate((special_vectors, word_vectors), axis=0)

    # Convert to a PyTorch tensor and save
    word_embeddings = torch.from_numpy(word_vectors.astype(np.float32))
    torch.save(word_embeddings, os.path.join(output_dir, 'word_embeddings.pt'))

    # Save the word_to_id mapping
    with open(os.path.join(output_dir, 'word_to_id.json'), 'w', encoding='utf-8') as f:
        json.dump(word_to_id, f, ensure_ascii=False, indent=2)

    print(f"词嵌入已保存，词汇量: {len(word_to_id)}")
    return word_to_id, word_embeddings

def prepare_relation_mapping(relation_filepath, output_dir):
    """Parse the ``<relation> <id>`` file and save both lookup directions as JSON.

    Blank lines are ignored, so a trailing newline or spacer line does not abort
    the run.

    Args:
        relation_filepath: UTF-8 text file with one whitespace-separated
            ``relation id`` pair per line.
        output_dir: Existing directory that receives ``relation_maps.json``.

    Returns:
        Tuple ``(relation_to_id, id_to_relation)``. In the JSON file the keys of
        ``id_to_relation`` are stored as strings because JSON object keys are
        always strings.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or its
            id is not an integer.
        OSError: If either file cannot be opened. Every error is printed and
            then re-raised.
    """
    print(f"正在处理关系映射: {relation_filepath}")

    relation_to_id, id_to_relation = {}, {}

    try:
        with open(relation_filepath, 'r', encoding='UTF-8') as file:
            for line_number, line in enumerate(file, start=1):
                fields = line.split()
                if not fields:
                    # Blank or whitespace-only line: nothing to record.
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        f"第 {line_number} 行格式错误，应为 '<关系> <id>': {line.rstrip()!r}"
                    )
                relation, id_str = fields
                relation_id = int(id_str)
                relation_to_id[relation] = relation_id
                id_to_relation[relation_id] = relation

        # Save the relation mappings
        relation_maps = {
            'relation_to_id': relation_to_id,
            'id_to_relation': id_to_relation
        }

        with open(os.path.join(output_dir, 'relation_maps.json'), 'w', encoding='utf-8') as f:
            json.dump(relation_maps, f, ensure_ascii=False, indent=2)

        print(f"关系映射已保存，关系类型数: {len(relation_to_id)}")
        return relation_to_id, id_to_relation
    except Exception as e:
        print(f"处理关系映射时出错: {e}")
        raise

def copy_model(model_path, output_dir):
    """Re-save a trained checkpoint into ``output_dir`` as ``final_model.pkl``.

    The checkpoint is deserialized on CPU and serialized again rather than
    byte-copied, so a file that torch cannot read fails here instead of at
    deployment time. Any error is printed and then re-raised.

    Args:
        model_path: Path of the checkpoint to read.
        output_dir: Existing directory that receives ``final_model.pkl``.
    """
    print(f"正在复制模型权重: {model_path}")

    try:
        # NOTE(review): torch.load unpickles the file - only use trusted checkpoints.
        checkpoint = torch.load(model_path, map_location='cpu')
        destination = os.path.join(output_dir, 'final_model.pkl')
        torch.save(checkpoint, destination)
        print("模型权重已复制")
    except Exception as error:
        print(f"复制模型权重时出错: {error}")
        raise

def main():
    """Command-line entry point: export everything the deployed app needs.

    Parses the arguments, creates the output directory, then writes the
    vocabulary and embedding matrix, the relation maps and the model weights
    into it, in that order.
    """
    cli_args = parse_args()
    target_dir = cli_args.output_dir

    # The destination must exist before any artifact is written.
    os.makedirs(target_dir, exist_ok=True)

    # 1) vocabulary + embeddings, 2) relation label maps, 3) model weights.
    prepare_embeddings(cli_args.embedding_path, cli_args.word_dim, target_dir)
    prepare_relation_mapping(cli_args.relation_path, target_dir)
    copy_model(cli_args.model_path, target_dir)

    print(f"预处理完成! 所有文件已保存到: {target_dir}")


if __name__ == "__main__":
    main()

Writing preprocess.py


In [None]:
# Create the directory that will hold the deployment artifacts
!mkdir -p /content/hf-deployment/model

# Run the preprocessing script: it writes the vocabulary, the embedding matrix,
# the relation maps and the model weights into --output_dir
!python preprocess.py \
  --embedding_path /content/drive/MyDrive/nlp/textmining_CW/embedding/glove.6B.100d.txt \
  --relation_path /content/drive/MyDrive/nlp/textmining_CW/data/relation_with_id.txt \
  --model_path /content/drive/MyDrive/nlp/textmining_CW/output/Att_LSTM/final_model.pkl \
  --output_dir /content/hf-deployment/model \
  --word_dim 100

正在处理词嵌入: /content/drive/MyDrive/nlp/textmining_CW/embedding/glove.6B.100d.txt
读取词嵌入: 400000it [00:19, 20415.95it/s]
词嵌入已保存，词汇量: 400006
正在处理关系映射: /content/drive/MyDrive/nlp/textmining_CW/data/relation_with_id.txt
关系映射已保存，关系类型数: 19
正在复制模型权重: /content/drive/MyDrive/nlp/textmining_CW/output/Att_LSTM/final_model.pkl
  model_weights = torch.load(model_path, map_location='cpu')
模型权重已复制
预处理完成! 所有文件已保存到: /content/hf-deployment/model


In [None]:
%%writefile /content/hf-deployment/requirements.txt
torch==2.0.1
numpy==1.24.3
gradio==3.50.0

Writing /content/hf-deployment/requirements.txt


In [None]:
%%writefile /content/hf-deployment/README.md
---
title: LSTM关系抽取模型
emoji: 🔍
colorFrom: blue
colorTo: indigo
sdk: gradio
sdk_version: 3.50.0
app_file: app.py
pinned: false
license: mit
---

# LSTM关系抽取模型

这个应用使用LSTM+多头注意力机制来预测句子中两个实体之间的关系。

## 使用说明

输入包含两个实体标记的句子，模型将预测它们之间的关系。

### 格式要求
- 使用 `<e1>` 和 `</e1>` 标记第一个实体
- 使用 `<e2>` 和 `</e2>` 标记第二个实体

## 示例

- `<e1>John</e1> works at <e2>Google</e2>.`
- `<e1>Aspirin</e1> is used to treat <e2>headaches</e2>.`
- `<e1>John</e1> is the father of <e2>Alice</e2>.`

## 模型架构

该模型基于LSTM结合多头注意力机制，使用预训练的GloVe词向量进行初始化。

Writing /content/hf-deployment/README.md


In [None]:
%%writefile /content/hf-deployment/app.py
import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import os
import numpy as np
import json

# Runtime configuration
class Configuration:
    """Settings of the deployed model, each overridable through an environment variable.

    Every attribute falls back to a built-in default when its variable is unset.
    ``device`` is not configurable: it is "cuda" when torch reports a CUDA device
    and "cpu" otherwise.
    """
    def __init__(self):
        env = os.environ.get

        # Directory holding the exported artifacts
        self.model_dir = env('MODEL_DIR', './model')

        # Integer hyper-parameters
        self.word_dim = int(env('WORD_DIM', '100'))
        self.max_len = int(env('MAX_LEN', '100'))
        self.hidden_size = int(env('HIDDEN_SIZE', '100'))
        self.num_heads = int(env('NUM_HEADS', '4'))
        self.layers_num = int(env('LAYERS_NUM', '1'))

        # Dropout probabilities
        self.emb_dropout = float(env('EMB_DROPOUT', '0.3'))
        self.lstm_dropout = float(env('LSTM_DROPOUT', '0.3'))
        self.linear_dropout = float(env('LINEAR_DROPOUT', '0.5'))

        # Pick the compute device
        cuda_ready = torch.cuda.is_available()
        self.device = "cuda" if cuda_ready else "cpu"

# Multi-head attention module
class EnhancedAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned input/output projections.

    Args:
        model_dim: Size of the last dimension of the query/key/value inputs.
        head_num: Number of heads; each head works on ``model_dim // head_num`` features.
        Dropout_Rate: Dropout probability applied to the attention weights.
    """
    def __init__(self, model_dim, head_num, Dropout_Rate=0.1):
        super().__init__()
        self.head_num = head_num
        self.key_dim = model_dim // head_num
        # These attribute names are the state_dict keys, so they must stay as they are.
        self.query_transform = nn.Linear(model_dim, model_dim)
        self.key_transform = nn.Linear(model_dim, model_dim)
        self.value_transform = nn.Linear(model_dim, model_dim)
        self.output_transform = nn.Linear(model_dim, model_dim)
        self.dropout = nn.Dropout(Dropout_Rate)

    def _calculate_attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; scores where ``mask == 0`` are set to -1e9."""
        scale = math.sqrt(self.key_dim)
        scores = torch.matmul(query, key.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(weights, value)

    def _split_heads(self, tensor):
        """Reshape (batch, seq, model_dim) into (batch, head_num, seq, key_dim)."""
        batch_size, seq_len, _ = tensor.size()
        per_head = tensor.view(batch_size, seq_len, self.head_num, self.key_dim)
        return per_head.transpose(1, 2)

    def _merge_heads(self, tensor):
        """Inverse of ``_split_heads``: back to (batch, seq, head_num * key_dim)."""
        batch_size, _, seq_len, key_dim = tensor.size()
        token_major = tensor.transpose(1, 2).contiguous()
        return token_major.view(batch_size, seq_len, self.head_num * key_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        ``mask`` is optional and 2-D (batch, key positions); entries equal to zero
        mark key positions that every query ignores. The result has the shape of
        ``query``.
        """
        projected = (
            self.query_transform(query),
            self.key_transform(key),
            self.value_transform(value),
        )
        q_heads, k_heads, v_heads = (self._split_heads(part) for part in projected)

        if mask is not None:
            # (batch, seq) -> (batch, head_num, 1, seq): shared by all heads and,
            # through broadcasting, by all query positions.
            mask = mask.unsqueeze(1).unsqueeze(1).expand(-1, self.head_num, -1, -1).bool()

        attended = self._calculate_attention(q_heads, k_heads, v_heads, mask)
        return self.output_transform(self._merge_heads(attended))

# LSTM encoder followed by multi-head self-attention
class AttentiveLSTM(nn.Module):
    """Relation classifier: embedding -> unidirectional LSTM -> self-attention -> mean -> linear.

    Args:
        word_embeddings: Pretrained embedding matrix used to initialise the
            (trainable) embedding layer.
        num_classes: Number of output logits.
        settings: Object providing ``max_len``, ``word_dim``, ``hidden_size``,
            ``num_heads``, ``layers_num`` and the three dropout rates
            ``emb_dropout``, ``lstm_dropout``, ``linear_dropout``.
    """
    def __init__(self, word_embeddings, num_classes, settings):
        super().__init__()
        self.word_embeddings = word_embeddings
        self.num_classes = num_classes
        self.max_sequence_length = settings.max_len
        self.embedding_dim = settings.word_dim
        self.lstm_hidden_size = settings.hidden_size

        # Submodule attribute names below are the state_dict keys of the trained
        # checkpoint; keep both the names and their creation order.
        self.embedding_dropout = nn.Dropout(settings.emb_dropout)
        self.lstm_dropout = nn.Dropout(settings.lstm_dropout)
        self.linear_dropout = nn.Dropout(settings.linear_dropout)
        self.attention_heads = settings.num_heads
        self.embedding_layer = nn.Embedding.from_pretrained(
            embeddings=self.word_embeddings,
            freeze=False,
        )
        self.lstm_layer = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_hidden_size,
            batch_first=True,
            num_layers=settings.layers_num,
            bidirectional=False,
        )
        self.attention_mechanism = EnhancedAttention(self.lstm_hidden_size, self.attention_heads)
        self.output_layer = nn.Linear(
            in_features=self.lstm_hidden_size,
            out_features=self.num_classes,
        )

        # Classifier initialisation: Xavier-normal weights, zero bias
        nn.init.xavier_normal_(self.output_layer.weight)
        nn.init.constant_(self.output_layer.bias, 0.)

    def _process_with_lstm(self, input_data, mask):
        """Run the LSTM over the unpadded prefix of every sequence.

        The length of each sequence is the count of mask entries greater than zero.
        The output is padded with zeros back to ``max_sequence_length`` time steps.
        """
        lengths = mask.gt(0).sum(dim=-1).cpu().type(torch.int64)
        packed_input = nn.utils.rnn.pack_padded_sequence(
            input_data, lengths, batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.lstm_layer(packed_input)
        padded_output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output,
            batch_first=True,
            padding_value=0.0,
            total_length=self.max_sequence_length,
        )
        return padded_output

    def forward(self, input_data):
        """Return class logits for a batch.

        ``input_data[:, 0, :]`` holds the token ids and ``input_data[:, 1, :]`` the
        mask (values greater than zero mark real tokens). The attended states are
        averaged over all ``max_sequence_length`` positions, padding included.
        """
        # NOTE(review): squeeze(1) is a no-op here unless the sequence length is 1.
        token_ids = input_data[:, 0, :].squeeze(1)
        mask = input_data[:, 1, :].squeeze(1)

        embedded = self.embedding_layer(token_ids)
        embedded = self.embedding_dropout(embedded)

        states = self._process_with_lstm(embedded, mask)
        states = self.lstm_dropout(states)

        attended = self.attention_mechanism(states, states, states, mask)
        pooled = attended.mean(dim=1)
        return self.output_layer(self.linear_dropout(pooled))

# Module-level state: assigned by load_model() and read by predict_relation().
# Everything stays None until a load has been attempted.
model = None
word_to_id = None
id_to_relation = None
config = None

# Load the model and its lookup tables
def load_model():
    """Load embeddings, vocabulary, relation maps and trained weights from ``config.model_dir``.

    The globals ``model``, ``word_to_id`` and ``id_to_relation`` are assigned only
    after every step has succeeded. A failed load therefore never leaves a
    half-initialised network (random weights, training mode) visible to
    ``predict_relation``. ``config`` is always refreshed.

    Returns:
        True when the model is ready for inference, False when any step failed
        (the error message is printed).
    """
    global model, word_to_id, id_to_relation, config

    print("正在加载模型...")
    config = Configuration()

    try:
        # Load the embedding matrix
        # NOTE(review): torch.load unpickles its input - only ship trusted files.
        embedding_cache_path = os.path.join(config.model_dir, 'word_embeddings.pt')
        word_embeddings = torch.load(embedding_cache_path, map_location='cpu')

        # Load the vocabulary
        with open(os.path.join(config.model_dir, 'word_to_id.json'), 'r', encoding='utf-8') as f:
            loaded_word_to_id = json.load(f)

        # Load the relation mappings
        relation_cache_path = os.path.join(config.model_dir, 'relation_maps.json')
        with open(relation_cache_path, 'r', encoding='utf-8') as f:
            relation_data = json.load(f)

        # JSON object keys are strings; turn the ids back into integers
        loaded_id_to_relation = {int(k): v for k, v in relation_data['id_to_relation'].items()}
        relation_to_id = relation_data['relation_to_id']

        # Build the network, then load the trained weights into it
        candidate = AttentiveLSTM(
            word_embeddings=word_embeddings,
            num_classes=len(relation_to_id),
            settings=config
        ).to(config.device)

        model_path = os.path.join(config.model_dir, 'final_model.pkl')
        candidate.load_state_dict(torch.load(model_path, map_location=config.device))
        candidate.eval()
    except Exception as e:
        print(f"模型加载失败: {str(e)}")
        return False

    # Publish the fully initialised objects in one step.
    model = candidate
    word_to_id = loaded_word_to_id
    id_to_relation = loaded_id_to_relation

    print(f"模型已成功加载到 {config.device} 设备")
    return True

# Convert a raw sentence into the model's input tensor
def prepare_input_sentence(sentence, word_to_id, max_length):
    """Tokenise ``sentence`` and build the (token ids, mask) tensor the model consumes.

    The entity markers ``<e1>``, ``</e1>``, ``<e2>`` and ``</e2>`` are separated
    from adjacent text before splitting on whitespace. Without this, a sentence in
    the documented format such as ``<e1>John</e1> works`` yields the single
    unknown token ``<e1>john</e1>``, and neither the markers nor the entity words
    are looked up.
    NOTE(review): assumes the training data had the markers as standalone tokens -
    confirm against the training preprocessing.

    Args:
        sentence: Input text containing the entity markers.
        word_to_id: Token-to-id mapping; must contain 'UNK' and 'PAD'.
        max_length: Fixed sequence length; longer inputs are truncated, shorter
            ones padded with the PAD id.

    Returns:
        torch.long tensor of shape (1, 2, max_length): row 0 holds the token ids,
        row 1 the mask (1 for real tokens, 0 for padding).
    """
    spaced = sentence
    for marker in ('<e1>', '</e1>', '<e2>', '</e2>'):
        spaced = spaced.replace(marker, f' {marker} ')
    tokens = spaced.split()

    unk_id, pad_id = word_to_id['UNK'], word_to_id['PAD']
    # Lookup is lower-cased; truncate to the fixed length.
    token_ids = [word_to_id.get(token.lower(), unk_id) for token in tokens][:max_length]

    # Pad to the fixed length and mark which positions are real tokens
    valid_count = len(token_ids)
    padding_count = max_length - valid_count
    token_ids_padded = token_ids + [pad_id] * padding_count
    mask = [1] * valid_count + [0] * padding_count

    # Convert to tensors
    tokens_tensor = torch.tensor([token_ids_padded], dtype=torch.long)
    mask_tensor = torch.tensor([mask], dtype=torch.long)

    # Stack into the model's input layout
    data = torch.stack([tokens_tensor, mask_tensor], dim=1)
    return data

# Gradio callback: predict the relation expressed in one sentence
def predict_relation(sentence):
    """Classify the relation between the two marked entities of ``sentence``.

    Args:
        sentence: Text in which the first entity is wrapped in ``<e1>...</e1>``
            and the second in ``<e2>...</e2>``.

    Returns:
        A human-readable string: the predicted relation, its confidence and the
        probability of every known relation, or an error message. The function
        does not raise: input that is not a string or lacks any of the four
        markers is rejected with the format hint, and failures during inference
        are reported in the returned text.
    """
    global model, word_to_id, id_to_relation, config

    # Lazily load the model if the import-time load did not succeed
    if model is None:
        success = load_model()
        if not success:
            return "错误：模型加载失败，请检查模型文件"

    # Validate the input: it must be text and contain all four entity markers
    required_markers = ("<e1>", "</e1>", "<e2>", "</e2>")
    if not isinstance(sentence, str) or any(marker not in sentence for marker in required_markers):
        return "错误：句子必须包含实体标记，例如: '<e1>John</e1> works at <e2>Google</e2>.'"

    try:
        # Build the model input
        data = prepare_input_sentence(sentence, word_to_id, config.max_len).to(config.device)

        # Inference
        with torch.no_grad():
            logits = model(data)
            probabilities = F.softmax(logits, dim=1)
            max_prob, prediction = torch.max(probabilities, dim=1)
            pred_idx = prediction.cpu().item()
            confidence = max_prob.cpu().item()

        relation_type = id_to_relation[pred_idx]
        confidence_percent = confidence * 100

        # Headline result followed by the full distribution
        lines = [
            f"预测关系: {relation_type}",
            f"置信度: {confidence_percent:.2f}%",
            "",
            "所有关系的概率分布:",
        ]
        probs = probabilities[0].cpu().numpy()
        for idx, prob in enumerate(probs):
            # Class indices without a name in the mapping are left out.
            if idx in id_to_relation:
                rel_name = id_to_relation[idx]
                lines.append(f"- {rel_name}: {prob*100:.2f}%")

        return "\n".join(lines) + "\n"
    except Exception as e:
        return f"预测过程中出错: {str(e)}"

# Try to load the model at import time; the return value is not checked here
# because predict_relation() retries the load while `model` is still None.
load_model()

# Build the Gradio interface: one text box in, one text box out, served by predict_relation
demo = gr.Interface(
    fn=predict_relation,
    inputs=gr.Textbox(
        placeholder="例如：<e1>John</e1> works at <e2>Google</e2>.",
        label="输入句子"
    ),
    outputs=gr.Textbox(label="预测结果"),
    title="LSTM关系抽取模型",
    description="""
    ## 使用说明
    这个应用使用LSTM+注意力机制来预测句子中两个实体之间的关系。

    请输入包含两个实体标记的句子，模型将预测它们之间的关系。

    **格式要求**：
    - 使用 `<e1>` 和 `</e1>` 标记第一个实体
    - 使用 `<e2>` 和 `</e2>` 标记第二个实体
    """,
    # Clickable sample inputs, all written in the required marker format
    examples=[
        ["<e1>John</e1> works at <e2>Google</e2>."],
        ["<e1>Aspirin</e1> is used to treat <e2>headaches</e2>."],
        ["<e1>John</e1> is the father of <e2>Alice</e2>."],
        ["<e1>The book</e1> is on <e2>the table</e2>."]
    ],
    theme=gr.themes.Soft()
)

# Start the web server only when the file is run as a script;
# share=True additionally requests a public share link.
if __name__ == "__main__":
    demo.launch(share=True)

Overwriting /content/hf-deployment/app.py


In [None]:
%%writefile /content/hf-deployment/.gitattributes
*.pt filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text

Overwriting /content/hf-deployment/.gitattributes


In [None]:
# Smoke-test the app from inside the deployment directory: app.py looks for its
# artifacts under ./model (unless MODEL_DIR is set), i.e. relative to the working directory.
%cd /content/hf-deployment
!python app.py

/content/hf-deployment
正在加载模型...
  word_embeddings = torch.load(embedding_cache_path, map_location='cpu')
  model.load_state_dict(torch.load(model_path, map_location=config.device))
模型已成功加载到 cpu 设备
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://f776f54a6af9214194.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f776f54a6af9214194.gradio.live


In [None]:
!cd /content && zip -r hf-deployment.zip hf-deployment/
from google.colab import files
files.download('/content/hf-deployment.zip')

  adding: hf-deployment/ (stored 0%)
  adding: hf-deployment/.gitattributes (deflated 47%)
  adding: hf-deployment/README.md (deflated 34%)
  adding: hf-deployment/model/ (stored 0%)
  adding: hf-deployment/model/word_to_id.json (deflated 64%)
  adding: hf-deployment/model/relation_maps.json (deflated 74%)
  adding: hf-deployment/model/final_model.pkl (deflated 8%)
  adding: hf-deployment/model/word_embeddings.pt (deflated 8%)
  adding: hf-deployment/requirements.txt (stored 0%)
  adding: hf-deployment/.gradio/ (stored 0%)
  adding: hf-deployment/.gradio/certificate.pem (deflated 24%)
  adding: hf-deployment/app.py (deflated 67%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>