<a href="https://colab.research.google.com/github/NobShen/llama/blob/main/raovat3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

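Start of the notebook: encode text scraped from a web page with the Universal Sentence Encoder, then search it by semantic similarity to a query.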

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# TF Hub handle of the Universal Sentence Encoder (version 4) used for all embeddings
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Load the encoder once; later cells call `embed(list_of_strings)` to get embeddings
embed = hub.load(USE_MODEL_URL)

# Function to encode text in batches
def batch_encode_text(texts, batch_size=32, encoder=None):
    """Encode a sequence of strings in fixed-size batches.

    Args:
        texts: Sliceable sequence of strings (e.g. a list) to encode.
        batch_size: Maximum number of strings passed to the encoder per call.
            Must be a positive integer.
        encoder: Optional callable that takes a batch of strings and returns
            an object exposing ``.numpy()``. Defaults to the module-level
            ``embed`` model.

    Returns:
        A NumPy array with one row per input text, in input order. For empty
        ``texts`` the encoder is never called and an empty array of shape
        ``(0,)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if encoder is None:
        encoder = embed

    encoded_vectors = []
    # Stepping by batch_size yields exactly ceil(len / batch_size) batches, so
    # no empty trailing batch is sent to the encoder when len(texts) is an
    # exact multiple of batch_size (the old `len // batch_size + 1` did that).
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Encode the batch and collect its row vectors
        batch_vectors = encoder(batch)
        encoded_vectors.extend(batch_vectors.numpy())

    return np.array(encoded_vectors)

Now load the data from a web page

In [None]:

# URL of the web page containing the text data (replace with your URL)
web_page_url = "https://mydshbrd.com/LICENSE.txt"

# Browser-like User-Agent sent with the request
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20"}

# Fetch the page; the timeout keeps the cell from hanging on an unresponsive server
response = requests.get(web_page_url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()

# Parse from the raw bytes rather than response.text. The search output recorded
# in this notebook shows UTF-8 Vietnamese text mis-decoded as Latin-1, which is
# what response.text produces when the server does not declare a charset.
# Given bytes, BeautifulSoup detects the encoding itself.
soup = BeautifulSoup(response.content, "html.parser")

# Extract the text of every <p> element (adjust the tag to your page structure)
text_data = [paragraph.get_text() for paragraph in soup.find_all("p")]

In [None]:
# Name of the column that holds the text to encode
text_column_name = "Title"

# One row per scraped text snippet, stored under that column
df = pd.DataFrame({text_column_name: text_data})

# Row count, used to drive the batching loop
num_rows = df.shape[0]

# Empty (0, 512) accumulator for the encoded vectors
# (assumes the USE model returns vectors of size 512)
encoded_data = np.empty(shape=(0, 512))

In [None]:
# Set the batch size based on available memory and performance considerations
batch_size = 99

# Encode the dataset batch by batch, collecting each batch's vectors in a list.
# Building the result from scratch here (rather than appending to an array
# created in another cell) keeps this cell idempotent: re-running it does not
# duplicate rows.
encoded_batches = []
for start in range(0, num_rows, batch_size):
    end = min(start + batch_size, num_rows)
    batch_texts = df[text_column_name][start:end].tolist()

    # Encode the batch of text
    encoded_batches.append(batch_encode_text(batch_texts, batch_size=batch_size))

# Stack once at the end instead of re-copying the whole array on every iteration.
# The leading empty (0, 512) float array keeps the result 2-D even when there
# are no rows (assumes the USE model returns vectors of size 512).
encoded_data = np.vstack([np.empty((0, 512)), *encoded_batches])
print(encoded_data)

# Now, encoded_data contains the encoded vectors for the text data obtained from the web page


[[-0.01408162 -0.06514768 -0.0631865  ... -0.06899547  0.02465326
  -0.06966335]
 [-0.03210635 -0.0626885  -0.00278336 ... -0.03495861  0.03281365
  -0.0749908 ]
 [ 0.00387092 -0.05773959 -0.00181855 ... -0.01471864 -0.04654043
  -0.08436348]
 ...
 [-0.05163694 -0.07826692 -0.00638289 ... -0.06691641  0.07477357
  -0.08397678]
 [-0.06667349  0.00766938 -0.02188736 ... -0.04404028 -0.0505691
  -0.0318731 ]
 [-0.06949051 -0.06613951  0.03548685 ... -0.00521331 -0.01603429
  -0.08428316]]


In [None]:
# Wrap the matrix of encoded vectors in a DataFrame (one row per text)
encoded_df = pd.DataFrame(data=encoded_data)

# Persist the vectors to CSV, keeping the row index as the first column
encoded_df.to_csv(path_or_buf="encoded_data.csv", index=True)

In [None]:
# Search query, wrapped in a one-element list before being passed to the encoder
querry = [
    "Tôi cần nhà ở Anaheim, chấp nhận housing.",
]

# Embed the query with the same model used for the scraped text
querry_vectors = embed(querry)
# print(querry_vectors)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Minimum cosine similarity for a text to be reported as a match
SIMILARITY_THRESHOLD = 0.35

# Score every encoded text against the first query vector in a single
# vectorized call instead of one cosine_similarity call per row.
# The length guard keeps an empty corpus a no-op, as the old per-row loop was.
if len(encoded_data):
    similarities = cosine_similarity(querry_vectors.numpy()[:1], encoded_data)[0]

    # Print the index and original text of each sufficiently similar entry
    for i, score in enumerate(similarities):
        if score > SIMILARITY_THRESHOLD:
            print(i, text_data[i])

0  NhÃ  Anaheim gáº§n chÃ¹a DÆ°á»£c SÆ° 3PN/2PT. 1,386sqft má»i Remodeled. Garage 2 xe vá»i Driveway rá»ng. Housing ok.  Khu an ninh sáº¡ch, central AC, Heat. Housing OK. $3,400 thÃ¡ng. 714-726-2429
23  NHÃ HOUSE BOLSA/ MAGNOLIA. Má»i remodeled, gáº§n PhÆ°á»c Lá»c Thá», chá»£ ABC, khu yÃªn tÄ©nh. 4PN/2PT W/Central AC. Dá»n vÃ o liá»n. $3,900/thÃ¡ng (Nháº­n Housing). L/L: TRANG 714-397-2722
28  NhÃ  Anaheim 4PN/2PT, garage 2 xe, Brookhurst, remodel toÃ n diá»n, $3,700/thÃ¡ng + deposit. Check income, credit. Nháº­n housing. Dá»n vÃ o ngay.L/L: Harry 714-315-4789, My Le 714-582-1885
49  ANAHEIM GÃ³c Euclid/ Orangewood. FOURPLEX 2PN/2PT, Patio. HÃ²an tÃ²an tÃ¢n trang, cÃ³ chá» Äá» mÃ¡y giáº·t sáº¥y. NhÃ  lÃ³t gáº¡ch, gá», cÃ³ garage. Nháº­n Housing. L/L: ALLEN 714-335-2222
62  REDLANDS, cÃ¡ch Downtowwn 1 mile, cÃ¡ch San Bernardino 14 miles. Only Housing Section 8, 4-Plex, 3PN/1 or 2PT $2,750/mo, 2PN/2PT $2,350/mo. Bao nÆ°á»c rÃ¡c, L/L: TÃ i 909-362-0610.
