In [None]:
!pip install pyngrok


In [2]:
import tensorflow as tf
from transformers import BertTokenizer
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/bertmodel/new')
from bert import minBert
from bert import EncoderLayer, SelfAttention, FeedForward, BaseAttention
from positional_embedding import PositionalEmbedding

In [None]:
# Tokenizer for the "google-bert/bert-base-uncased" checkpoint; used by tokenize() below.
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [None]:
# Restore the trained model from Drive. The custom layer classes are passed by
# name through custom_objects so load_model can resolve them.
min_bert_layer = tf.keras.models.load_model(
    '/content/drive/MyDrive/Colab Notebooks/bertmodel/min_bert_layer_after_training_qqp_triplet_v3.keras',
    custom_objects=dict(
        minBert=minBert,
        EncoderLayer=EncoderLayer,
        SelfAttention=SelfAttention,
        FeedForward=FeedForward,
        BaseAttention=BaseAttention,
        PositionalEmbedding=PositionalEmbedding,
    ),
)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
def tokenize(sentence):
    """Encode text with the module-level BERT tokenizer and return its input IDs.

    Args:
        sentence: Text handed straight to the tokenizer. Callers in this
            notebook pass either a single string or a list of strings.

    Returns:
        The ``input_ids`` entry of the tokenizer output, requested as
        TensorFlow tensors and padded/truncated to 256 tokens.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="tf",
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=256,
    )
    return encoded['input_ids']

def model_embedding(model, sentence):
    """Embed text with ``model`` and return the result as a NumPy array.

    Args:
        model: Callable model invoked as ``model(token_ids, training=False)``.
        sentence: Text to embed; forwarded unchanged to ``tokenize``.

    Returns:
        The model output sliced at index 0 of its second axis, converted with
        ``.numpy()``. Presumably this is the [CLS] position, since the
        tokenizer adds special tokens -- TODO confirm against the model.
    """
    token_ids = tokenize(sentence)
    model_output = model(token_ids, training=False)
    first_position = model_output[:, 0, :]
    return first_position.numpy()

In [6]:
def compute_sentence_similarity(sentence_1, sentence_2):
    """Return the cosine similarity of two sentences as a whole-number percentage.

    Both sentences are embedded with the module-level ``min_bert_layer`` model.

    Args:
        sentence_1: First sentence.
        sentence_2: Second sentence.

    Returns:
        The cosine similarity scaled to percent and rounded to the nearest
        integer, returned as a float (e.g. ``57.0``).
    """
    embedded_sentence_1 = model_embedding(min_bert_layer, sentence_1)
    embedded_sentence_2 = model_embedding(min_bert_layer, sentence_2)

    similarity = cosine_similarity(embedded_sentence_1, embedded_sentence_2).item()

    # Scale first, then round. The previous round(x, 2) * 100 leaked binary
    # floating-point noise into the result (0.57 * 100 == 56.99999999999999).
    return float(round(similarity * 100))

In [25]:
def find_most_similar(query, dataset):
    """Return the entry of ``dataset`` whose embedding is closest to ``query``.

    Args:
        query: Query sentence.
        dataset: Sequence of candidate sentences. Entries that are not strings
            or that are blank/whitespace-only (for example empty lines coming
            from a text area) are ignored.

    Returns:
        The candidate with the highest cosine similarity to ``query``, exactly
        as it appears in ``dataset``. An empty string is returned when there
        is no usable candidate.
    """
    # Blank lines carry no meaning but would still get an embedding and could
    # win the argmax, so drop them before doing any model work.
    candidates = [
        sentence for sentence in dataset
        if isinstance(sentence, str) and sentence.strip()
    ]
    if not candidates:
        return ""

    # Embed the query and every remaining candidate
    query_embedding = model_embedding(min_bert_layer, query)
    candidate_embeddings = model_embedding(min_bert_layer, candidates)

    # Force the query into a single row so it is compared against each candidate row
    query_embedding = np.array(query_embedding).reshape(1, -1)
    candidate_embeddings = np.array(candidate_embeddings)

    # similarities has one row (the query) and one column per candidate
    similarities = cosine_similarity(query_embedding, candidate_embeddings)

    # With a single row, the flat argmax is the index of the best candidate
    best_index = int(np.argmax(similarities))
    return candidates[best_index]

In [8]:
import numpy as np
import pickle


def get_top_k_similarities(sentence, embeddings, sentences, k=10):
    """Return the ``k`` stored sentences most similar to ``sentence``.

    Args:
        sentence: Query sentence; embedded with the module-level
            ``min_bert_layer`` model.
        embeddings: Precomputed embeddings compared against the query with
            cosine similarity. Row ``i`` is assumed to belong to
            ``sentences[i]`` -- verify against how the files were produced.
        sentences: Sentences indexed in the same order as ``embeddings``.
        k: Maximum number of sentences to return (default 10).

    Returns:
        A list of at most ``k`` sentences ordered from most to least similar.
        An empty list is returned when ``k`` is zero or negative.
    """
    # Guard explicitly: with k == 0 the slice [-k:] is [0:] and would return
    # the entire corpus instead of nothing (negative k mis-slices similarly).
    if k <= 0:
        return []

    embedded_sentence = model_embedding(min_bert_layer, sentence)
    similarities = np.ravel(cosine_similarity(embedded_sentence, embeddings))

    # argsort is ascending: take the last k indices and reverse them so the
    # most similar sentence comes first.
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    return [sentences[i] for i in top_k_indices]

# Example usage: get_top_k_similarities("How can I learn Python?", embeddings, sentences, k=5)



In [10]:
def load_data_from_files(sentences_filename='sentences.txt', embeddings_filename='embeddings.pkl'):
    """Load the sentence list and its embeddings from disk.

    Args:
        sentences_filename: Path of a text file holding one sentence per line.
        embeddings_filename: Path of a pickle file holding the embeddings.

    Returns:
        A ``(sentences, embeddings)`` tuple: the list of lines with surrounding
        whitespace stripped, and the unpickled embeddings object.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(sentences_filename, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f]

    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only point this at files you produced yourself or otherwise trust.
    with open(embeddings_filename, 'rb') as file:
        embeddings = pickle.load(file)

    return sentences, embeddings

# Load the sentence list and embeddings that the /getsimilarsentences endpoint searches.
sentences, embeddings = load_data_from_files('/content/drive/MyDrive/Colab Notebooks/bertmodel/quora_sentences.txt', '/content/drive/MyDrive/Colab Notebooks/bertmodel/qoura_embeddings.pkl')


In [31]:
# Single-page front end returned by the Flask "/" route. The embedded script
# posts JSON to /getprediction, /getmostsimilarsentence and /getsimilarsentences.
# Note: this is a normal (non-raw) string, so '\\n' below reaches the browser as '\n'.
html_content = '''
        <!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta name="description"
    content="Tool to compute sentence similarity, find most similar sentence, and search similar sentences">
  <title>Sentence Similarity Tool</title>

  <!-- Bootstrap CSS -->
  <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">

  <style>
    body {
      background-color: #f8f9fa;
      font-family: Arial, sans-serif;
    }

    h1 {
      margin-bottom: 20px;
    }

    .card {
      margin-bottom: 20px;
    }

    .btn-primary {
      width: 100%;
    }

    .result-container {
      padding: 15px;
      background: #ffffff;
      border: 1px solid #ced4da;
      border-radius: 5px;
      color: #495057;
      margin-top: 10px;
    }

    #loading {
      display: none;
      color: #007bff;
      font-weight: bold;
    }
  </style>
</head>

<body>
  <div class="container mt-5">
    <header class="text-center mb-5">
      <h1 class="text-primary">Simple Sentence Similarity Tool</h1>
      <p class="text-muted">Trần Văn Quang, Lê Tiến Thực, Nguyễn Việt Cường</p>
    </header>

    <!-- Compute Sentence Similarity -->
    <div class="card shadow-sm">
      <div class="card-body">
        <h2 class="h5">Compute Sentence Similarity</h2>
        <div class="mb-3">
          <label for="sentence_1" class="form-label">Sentence 1</label>
          <input type="text" id="sentence_1" class="form-control" placeholder="Enter the first sentence" required>
        </div>
        <div class="mb-3">
          <label for="sentence_2" class="form-label">Sentence 2</label>
          <input type="text" id="sentence_2" class="form-control" placeholder="Enter the second sentence" required>
        </div>
        <button type="button" class="btn btn-primary" onclick="getPrediction()">Get Prediction</button>
        <div id="predictionResult" class="result-container mt-3 d-none"></div>
      </div>
    </div>

    <!-- Compute Most Similar Sentence -->
    <div class="card shadow-sm">
      <div class="card-body">
        <h2 class="h5">Compute Most Similar Sentence</h2>
        <div class="mb-3">
          <label for="sentence_query" class="form-label">Query Sentence</label>
          <input type="text" id="sentence_query" class="form-control" placeholder="Enter a query sentence" required>
        </div>
        <div class="mb-3">
          <label for="sentence_dataset" class="form-label">Dataset (One sentence per line)</label>
          <textarea id="sentence_dataset" class="form-control" rows="5"
            placeholder="Enter sentences separated by new lines"></textarea>
        </div>
        <button type="button" class="btn btn-primary" onclick="getMostSimilarSentence()">Get Most Similar
          Sentence</button>
        <div id="mostSimilarityResult" class="result-container mt-3 d-none"></div>
      </div>
    </div>

    <!-- Search Similar Sentences -->
    <div class="card shadow-sm">
      <div class="card-body">
        <h2 class="h5">Search Similar Sentences</h2>
        <div class="mb-3">
          <label for="required_sentence" class="form-label">Sentence to Search</label>
          <input type="text" id="required_sentence" class="form-control"
            placeholder="Enter a sentence to find similar ones" required>
        </div>
        <button type="button" class="btn btn-primary" onclick="getSimilarSentences()">Get Similar Sentences</button>
        <div id="predictionResultForSearch" class="result-container mt-3 d-none"></div>
      </div>
    </div>
  </div>

  <!-- Bootstrap Bundle with Popper -->
  <!-- <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script> -->

  <script>
    // POST a JSON payload and return the parsed JSON reply; throws on HTTP errors.
    async function postJson(url, payload) {
      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
      });
      if (!response.ok) {
        throw new Error(`Request failed with status ${response.status}`);
      }
      return response.json();
    }

    // Reveal the result box with a placeholder while the request is in flight.
    function showLoading(container) {
      container.classList.remove('d-none');
      container.textContent = "Loading...";
    }

    // Reveal the result box with a readable error instead of failing silently.
    function showError(container, error) {
      container.classList.remove('d-none');
      container.textContent = `Error: ${error.message}`;
    }

    async function getPrediction() {
      const sentence_1 = document.getElementById('sentence_1').value;
      const sentence_2 = document.getElementById('sentence_2').value;

      const resultDiv = document.getElementById('predictionResult');
      showLoading(resultDiv);

      try {
        const data = await postJson('/getprediction', { sentence_1, sentence_2 });
        resultDiv.innerText = `Similarity: ${data.prediction}%`;
      } catch (error) {
        showError(resultDiv, error);
      }
    }

    async function getMostSimilarSentence() {
      const sentence_query = document.getElementById('sentence_query').value;
      // One candidate per line; blank lines are not sentences, so drop them.
      const sentence_dataset = document.getElementById('sentence_dataset').value
        .split('\\n')
        .map(line => line.trim())
        .filter(line => line.length > 0);

      const resultDiv = document.getElementById('mostSimilarityResult');
      showLoading(resultDiv);

      try {
        const data = await postJson('/getmostsimilarsentence', { sentence_query, sentence_dataset });
        resultDiv.innerText = `Most Similar Sentence: ${data.most_similar_sentence}`;
      } catch (error) {
        showError(resultDiv, error);
      }
    }

    async function getSimilarSentences() {
      const required_sentence = document.getElementById('required_sentence').value;

      const resultContainer = document.getElementById('predictionResultForSearch');
      showLoading(resultContainer);

      try {
        const data = await postJson('/getsimilarsentences', { required_sentence });
        resultContainer.innerHTML = '';

        const ol = document.createElement('ol');
        data.similar_sentences.forEach(sentence => {
          const li = document.createElement('li');
          li.textContent = sentence;
          ol.appendChild(li);
        });
        resultContainer.appendChild(ol);
      } catch (error) {
        showError(resultContainer, error);
      }
    }
  </script>
</body>

</html>
                  '''

In [32]:
from flask import Flask, request, jsonify
from pyngrok import ngrok

import os
from getpass import getpass

# Create the Flask application
app = Flask(__name__)

# SECURITY: an ngrok auth token used to be hard-coded in this cell. Treat that
# token as compromised and revoke it in the ngrok dashboard. The token is now
# read from the NGROK_AUTH_TOKEN environment variable, falling back to an
# interactive prompt so it never has to be stored in the notebook.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)

# Open an ngrok tunnel to local port 5000 and print the resulting tunnel URL
public_url = ngrok.connect(5000)
print(f"Ngrok tunnel URL: {public_url}")

@app.route('/')
def home():
    """Serve the single-page UI stored in the module-level ``html_content`` string."""
    return html_content

@app.route('/getprediction', methods=['POST'])
def get_prediction():
    """Score the similarity of two sentences sent as JSON.

    Expects a JSON object with string fields ``sentence_1`` and ``sentence_2``.

    Returns:
        ``{"prediction": <similarity percentage>}`` on success, or
        ``{"error": <message>}`` with HTTP 400 when the body is not a JSON
        object or either field is missing or not a string.
    """
    # silent=True returns None for a missing or unparsable JSON body, so every
    # malformed request is answered with the same JSON error shape below
    # instead of an unhandled exception on the key lookup.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    sentence_1 = data.get('sentence_1')
    sentence_2 = data.get('sentence_2')
    if not isinstance(sentence_1, str) or not isinstance(sentence_2, str):
        return jsonify({'error': "'sentence_1' and 'sentence_2' must be strings."}), 400

    similarity = compute_sentence_similarity(sentence_1, sentence_2)

    return jsonify({'prediction': similarity})

@app.route('/getsimilarsentences', methods=['POST'])
def get_similar_sentences():
    """Return the stored sentences most similar to the posted sentence.

    Expects a JSON object with a string field ``required_sentence``.

    Returns:
        ``{"similar_sentences": [...]}`` on success, or ``{"error": <message>}``
        with HTTP 400 when the body is not a JSON object or the field is
        missing or not a string.
    """
    # silent=True returns None for a missing or unparsable JSON body so the
    # request can be rejected with a JSON 400 instead of failing on the lookup.
    data = request.get_json(silent=True)
    sentence = data.get('required_sentence') if isinstance(data, dict) else None
    if not isinstance(sentence, str):
        return jsonify({'error': "'required_sentence' must be a string."}), 400

    similarity_list = get_top_k_similarities(sentence, embeddings, sentences)
    return jsonify({'similar_sentences': similarity_list})

@app.route('/getmostsimilarsentence', methods=['POST'])
def get_most_similar_sentence():
    """Pick the posted candidate sentence that is most similar to the query.

    Expects a JSON object with a string field ``sentence_query`` and a
    non-empty list of strings in ``sentence_dataset``.

    Returns:
        ``{"most_similar_sentence": <sentence>}`` on success, or
        ``{"error": <message>}`` with HTTP 400 when the body does not match
        the expected shape.
    """
    # silent=True returns None for a missing or unparsable JSON body so the
    # request can be rejected with a JSON 400 instead of failing on the lookup.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    sentence_query = data.get('sentence_query')
    sentence_dataset = data.get('sentence_dataset')
    dataset_is_valid = (
        isinstance(sentence_dataset, list)
        and len(sentence_dataset) > 0
        and all(isinstance(candidate, str) for candidate in sentence_dataset)
    )
    if not isinstance(sentence_query, str) or not dataset_is_valid:
        return jsonify({
            'error': "'sentence_query' must be a string and 'sentence_dataset' "
                     "a non-empty list of strings."
        }), 400

    most_similar_sentence = find_most_similar(sentence_query, sentence_dataset)
    return jsonify({'most_similar_sentence': most_similar_sentence})

if __name__ == "__main__":
    # Run Flask on port 5000, the same port passed to ngrok.connect above
    app.run(port=5000)


Ngrok tunnel URL: NgrokTunnel: "https://0e9c-34-106-123-145.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [29/Nov/2024 08:35:52] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Nov/2024 08:35:53] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


In [12]:
import pickle
import numpy as np

def save_embeddings_with_pickle_batch(sentences, model, batch_size=64, sentences_filename='sentences.txt', embeddings_filename='embeddings.pkl'):
    """Embed ``sentences`` in batches, then save the sentences and embeddings.

    Sentences are written one per line to a text file and the embeddings are
    pickled as a single NumPy array, so line ``i`` of the text file lines up
    with row ``i`` of the array.

    Args:
        sentences: Sequence of sentences to embed.
        model: Model used to compute embeddings; called as
            ``model(token_ids, training=False)``.
        batch_size: Number of sentences embedded per model call.
        sentences_filename: Path of the text file to write the sentences to.
        embeddings_filename: Path of the pickle file to write the embeddings to.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``sentences`` is empty.
    """
    # Fail early with a clear message; previously these cases only surfaced
    # as an opaque ValueError from range() or np.concatenate().
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one sentence")

    total_batches = (len(sentences) - 1) // batch_size + 1
    embeddings = []

    # Embed the sentences one batch at a time
    for batch_number, start in enumerate(range(0, len(sentences), batch_size), start=1):
        batch_sentences = sentences[start:start + batch_size]
        batch_tokenized = tokenize(batch_sentences)
        batch_embeddings = model(batch_tokenized, training=False)[:, 0, :].numpy()
        embeddings.append(batch_embeddings)

        print(f"Processed batch {batch_number}/{total_batches}")

    # Stack the per-batch arrays into one array
    embeddings_array = np.concatenate(embeddings, axis=0)

    # Write with an explicit encoding so the file does not depend on the
    # platform's default locale encoding.
    with open(sentences_filename, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            # Keep exactly one line per sentence: an embedded line break would
            # shift every following line relative to its embedding row.
            single_line = sentence.replace('\r', ' ').replace('\n', ' ')
            f.write(single_line + '\n')

    # Save the embeddings array as a pickle file
    with open(embeddings_filename, 'wb') as file:
        pickle.dump(embeddings_array, file)

    print(f"Sentences saved to {sentences_filename}")
    print(f"Embeddings saved to {embeddings_filename}")
