| Month | Region | Product | Revenue | Cost  | Profit |
| ----- | ------ | ------- | ------- | ----- | ------ |
| Jan   | US     | A       | 120000  | 90000 | 30000  |
| Jan   | EU     | A       | 95000   | 70000 | 25000  |
| Jan   | US     | B       | 60000   | 50000 | 10000  |
| Feb   | US     | A       | 130000  | 92000 | 38000  |
| Feb   | EU     | B       | 70000   | 68000 | 2000   |


In [1]:
import pandas as pd

# The sales table is written row-wise (one tuple per table line) so it reads
# like the markdown table, then pivoted into the column-oriented dict that
# pandas receives.
_columns = ("Month", "Region", "Product", "Revenue", "Cost", "Profit")
_records = [
    ("Jan", "US", "A", 120000, 90000, 30000),
    ("Jan", "EU", "A", 95000, 70000, 25000),
    ("Jan", "US", "B", 60000, 50000, 10000),
    ("Feb", "US", "A", 130000, 92000, 38000),
    ("Feb", "EU", "B", 70000, 68000, 2000),
]

# zip(*_records) transposes rows into columns; each column becomes a list
# keyed by its header, in header order.
data = {
    header: list(values)
    for header, values in zip(_columns, zip(*_records))
}

df = pd.DataFrame(data)
df


Unnamed: 0,Month,Region,Product,Revenue,Cost,Profit
0,Jan,US,A,120000,90000,30000
1,Jan,EU,A,95000,70000,25000
2,Jan,US,B,60000,50000,10000
3,Feb,US,A,130000,92000,38000
4,Feb,EU,B,70000,68000,2000


In [2]:
# Each table row is serialised into one sentence so the embedding model sees
# the column names next to their values. All sentences share one template.
_ROW_TEMPLATE = (
    "Month {}, Region {}, Product {}, "
    "Revenue {} USD, Cost {} USD, Profit {} USD"
)

rows = [
    _ROW_TEMPLATE.format(*record)
    for record in (
        ("Jan", "US", "A", 120000, 90000, 30000),
        ("Jan", "EU", "A", 95000, 70000, 25000),
        ("Jan", "US", "B", 60000, 50000, 10000),
        ("Feb", "US", "A", 130000, 92000, 38000),
        ("Feb", "EU", "B", 70000, 68000, 2000),
    )
]

Embedding model used below: `BAAI/bge-large-en-v1.5`, published by BAAI (Beijing Academy of Artificial Intelligence).

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model identified by "BAAI/bge-large-en-v1.5".
# NOTE(review): the BGE model card suggests prefixing retrieval *queries* with
# an instruction string; none is used in this notebook — confirm whether that
# matters for these experiments.
model = SentenceTransformer("BAAI/bge-large-en-v1.5")

# Encode every row sentence in one call. normalize_embeddings=True requests
# unit-length vectors, which is what lets a plain dot product between two
# embeddings be read as their cosine similarity.
row_embeddings = model.encode(rows, normalize_embeddings=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sanity check: one embedding vector per entry in `rows`.
row_embeddings.shape

(5, 1024)

In [5]:
# Question that combines a region filter (US) with a "highest profit" lookup.
query = "Which product in US had the high profit?"
# Encoded with the same normalize_embeddings=True setting as the rows so the
# query and row vectors are comparable.
query_embedding = model.encode(query, normalize_embeddings=True)

In [6]:
# A single string is encoded to a single 1-D vector (no batch dimension).
query_embedding.shape

(1024,)

In [7]:
def cosine_sim(a, b):
    """Return the cosine similarity between ``a`` and the vector ``b``.

    The dot product is divided by the product of the Euclidean norms, so the
    result is a true cosine similarity whether or not the inputs were
    normalised beforehand. For unit-length inputs this reduces to the plain
    dot product.

    Args:
        a: A 1-D vector, or a 2-D array holding one vector per row.
        b: A 1-D vector with the same length as the last axis of ``a``.

    Returns:
        A NumPy scalar when ``a`` is 1-D, or a 1-D array with one similarity
        per row when ``a`` is 2-D. Any pairing that involves a zero-length
        vector yields 0.0 instead of NaN.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    dots = np.dot(a, b)
    # Norm along the last axis gives one norm per row when `a` is a matrix.
    denom = np.linalg.norm(a, axis=-1) * np.linalg.norm(b)

    # 0/0 is expected for zero vectors; silence the warning and replace the
    # resulting NaN with 0.0 below.
    with np.errstate(divide="ignore", invalid="ignore"):
        sims = dots / denom

    # np.where returns a 0-d array for scalar input; indexing with () unwraps
    # it to a NumPy scalar and leaves n-d arrays unchanged.
    return np.where(denom > 0, sims, 0.0)[()]

In [8]:
def get_semantic_similarity_results(query_embedding):
    """Print every row ranked by similarity to the query, best match first.

    Reads the notebook-level globals ``rows`` (the sentences) and
    ``row_embeddings`` (their vectors, in the same order) at call time, so it
    always ranks whichever table was encoded most recently.

    Args:
        query_embedding: Embedding vector of the query, scored against each
            row embedding with ``cosine_sim``.

    Returns:
        None. One line per row is printed: the score rounded to three
        decimals, followed by the row sentence.
    """
    ranked = sorted(
        [
            (rows[position], cosine_sim(query_embedding, vector))
            for position, vector in enumerate(row_embeddings)
        ],
        key=lambda pair: pair[1],
        reverse=True,
    )

    for sentence, similarity in ranked:
        print(round(similarity, 3), sentence)

In [9]:
# Rank all rows against the query encoded above and print score + sentence.
get_semantic_similarity_results(query_embedding)

0.649 Month Feb, Region US, Product A, Revenue 130000 USD, Cost 92000 USD, Profit 38000 USD
0.648 Month Jan, Region US, Product A, Revenue 120000 USD, Cost 90000 USD, Profit 30000 USD
0.638 Month Jan, Region US, Product B, Revenue 60000 USD, Cost 50000 USD, Profit 10000 USD
0.611 Month Jan, Region EU, Product A, Revenue 95000 USD, Cost 70000 USD, Profit 25000 USD
0.597 Month Feb, Region EU, Product B, Revenue 70000 USD, Cost 68000 USD, Profit 2000 USD


In [10]:
# Same question shape as the US query, now with the region switched to EU.
query = "Which product in EU had the high profit?"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.705 Month Jan, Region EU, Product A, Revenue 95000 USD, Cost 70000 USD, Profit 25000 USD
0.7 Month Feb, Region EU, Product B, Revenue 70000 USD, Cost 68000 USD, Profit 2000 USD
0.629 Month Feb, Region US, Product A, Revenue 130000 USD, Cost 92000 USD, Profit 38000 USD
0.625 Month Jan, Region US, Product A, Revenue 120000 USD, Cost 90000 USD, Profit 30000 USD
0.623 Month Jan, Region US, Product B, Revenue 60000 USD, Cost 50000 USD, Profit 10000 USD


In [11]:
# Aggregation question: the answer is a sum over the EU rows, a value that no
# single row sentence contains.
query = "Total profit in EU?"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.724 Month Jan, Region EU, Product A, Revenue 95000 USD, Cost 70000 USD, Profit 25000 USD
0.723 Month Feb, Region EU, Product B, Revenue 70000 USD, Cost 68000 USD, Profit 2000 USD
0.665 Month Jan, Region US, Product A, Revenue 120000 USD, Cost 90000 USD, Profit 30000 USD
0.664 Month Feb, Region US, Product A, Revenue 130000 USD, Cost 92000 USD, Profit 38000 USD
0.663 Month Jan, Region US, Product B, Revenue 60000 USD, Cost 50000 USD, Profit 10000 USD


In [12]:
# Aggregation question again, grouped by month instead of region.
query = "Total profit in Feb"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.717 Month Feb, Region US, Product A, Revenue 130000 USD, Cost 92000 USD, Profit 38000 USD
0.716 Month Jan, Region US, Product B, Revenue 60000 USD, Cost 50000 USD, Profit 10000 USD
0.689 Month Feb, Region EU, Product B, Revenue 70000 USD, Cost 68000 USD, Profit 2000 USD
0.687 Month Jan, Region EU, Product A, Revenue 95000 USD, Cost 70000 USD, Profit 25000 USD
0.652 Month Jan, Region US, Product A, Revenue 120000 USD, Cost 90000 USD, Profit 30000 USD


In [13]:
# Needs a month filter (Jan) plus a minimum over the Revenue column.
query = "Min revenue product in Jan"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.769 Month Jan, Region US, Product B, Revenue 60000 USD, Cost 50000 USD, Profit 10000 USD
0.759 Month Jan, Region US, Product A, Revenue 120000 USD, Cost 90000 USD, Profit 30000 USD
0.748 Month Jan, Region EU, Product A, Revenue 95000 USD, Cost 70000 USD, Profit 25000 USD
0.697 Month Feb, Region US, Product A, Revenue 130000 USD, Cost 92000 USD, Profit 38000 USD
0.687 Month Feb, Region EU, Product B, Revenue 70000 USD, Cost 68000 USD, Profit 2000 USD


In [None]:
# Numeric range filter, with the threshold written in shorthand ("25K") while
# the row sentences spell amounts out in full digits.
query = "Product with profit < 25K USD"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.764 Month Jan, Region EU, Product A, Revenue 95000 USD, Cost 70000 USD, Profit 25000 USD
0.754 Month Feb, Region US, Product A, Revenue 130000 USD, Cost 92000 USD, Profit 38000 USD
0.746 Month Jan, Region US, Product B, Revenue 60000 USD, Cost 50000 USD, Profit 10000 USD
0.734 Month Feb, Region EU, Product B, Revenue 70000 USD, Cost 68000 USD, Profit 2000 USD
0.73 Month Jan, Region US, Product A, Revenue 120000 USD, Cost 90000 USD, Profit 30000 USD


In [16]:
# Same range filter with the threshold changed from "25K" to "25" —
# presumably to see whether the ranking reacts to the number's magnitude.
query = "Product with profit < 25 USD"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.76 Month Jan, Region EU, Product A, Revenue 95000 USD, Cost 70000 USD, Profit 25000 USD
0.751 Month Feb, Region US, Product A, Revenue 130000 USD, Cost 92000 USD, Profit 38000 USD
0.745 Month Jan, Region US, Product B, Revenue 60000 USD, Cost 50000 USD, Profit 10000 USD
0.737 Month Feb, Region EU, Product B, Revenue 70000 USD, Cost 68000 USD, Profit 2000 USD
0.725 Month Jan, Region US, Product A, Revenue 120000 USD, Cost 90000 USD, Profit 30000 USD


### Text Data in Table

| Ticket_ID | Product    | Issue_Type     | Description                                     |
| --------- | ---------- | -------------- | ----------------------------------------------- |
| T001      | Mobile App | Login Issue    | User cannot log in after resetting password     |
| T002      | Web App    | Performance    | Dashboard loads very slowly during peak hours   |
| T003      | Mobile App | Crash          | App crashes when uploading large images         |
| T004      | Web App    | UI Bug         | Submit button is not visible on Firefox browser |
| T005      | API        | Authentication | API returns 401 error for valid API keys        |


In [17]:
# Support tickets rendered as short natural-language sentences. Rebinding
# `rows` replaces the financial sentences used earlier in the notebook.
_TICKET_TEMPLATE = "Ticket {} about {} {}. {}."

rows = [
    _TICKET_TEMPLATE.format(ticket_id, product, issue, description)
    for ticket_id, product, issue, description in (
        ("T001", "Mobile App", "login issue",
         "User cannot log in after resetting password"),
        ("T002", "Web App", "performance issue",
         "Dashboard loads very slowly during peak hours"),
        ("T003", "Mobile App", "crash issue",
         "App crashes when uploading large images"),
        ("T004", "Web App", "UI bug",
         "Submit button is not visible on Firefox browser"),
        ("T005", "API", "authentication issue",
         "API returns 401 error for valid API keys"),
    )
]


In [18]:
# Re-encode because `rows` now holds the ticket sentences. This overwrites the
# global `row_embeddings` that get_semantic_similarity_results reads, keeping
# it aligned with the new `rows`.
row_embeddings = model.encode(rows, normalize_embeddings=True)

In [19]:
# Sanity check after re-encoding: one embedding vector per ticket sentence.
row_embeddings.shape

(5, 1024)

In [20]:
# Category lookup: the matching tickets are the ones whose product is "Web App".
query = "All tickets related to web app"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.637 Ticket T005 about API authentication issue. API returns 401 error for valid API keys.
0.636 Ticket T002 about Web App performance issue. Dashboard loads very slowly during peak hours.
0.613 Ticket T004 about Web App UI bug. Submit button is not visible on Firefox browser.
0.588 Ticket T003 about Mobile App crash issue. App crashes when uploading large images.
0.563 Ticket T001 about Mobile App login issue. User cannot log in after resetting password.


In [21]:
# Category lookup using only the word "mobile" (the rows say "Mobile App").
query = "All tickets for mobile"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.607 Ticket T003 about Mobile App crash issue. App crashes when uploading large images.
0.595 Ticket T001 about Mobile App login issue. User cannot log in after resetting password.
0.554 Ticket T005 about API authentication issue. API returns 401 error for valid API keys.
0.525 Ticket T002 about Web App performance issue. Dashboard loads very slowly during peak hours.
0.496 Ticket T004 about Web App UI bug. Submit button is not visible on Firefox browser.


In [22]:
# Keyword-style question: "loading" itself is not in any row; the closest
# wording is "loads" (T002) and "uploading" (T003).
query = "which ticket relate to loading?"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.673 Ticket T002 about Web App performance issue. Dashboard loads very slowly during peak hours.
0.64 Ticket T005 about API authentication issue. API returns 401 error for valid API keys.
0.618 Ticket T003 about Mobile App crash issue. App crashes when uploading large images.
0.581 Ticket T001 about Mobile App login issue. User cannot log in after resetting password.
0.536 Ticket T004 about Web App UI bug. Submit button is not visible on Firefox browser.


In [23]:
# Paraphrase test: "signin" never appears in the rows; T001 says "login" /
# "log in" instead.
query = "User can't signin"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.674 Ticket T001 about Mobile App login issue. User cannot log in after resetting password.
0.568 Ticket T005 about API authentication issue. API returns 401 error for valid API keys.
0.485 Ticket T003 about Mobile App crash issue. App crashes when uploading large images.
0.463 Ticket T004 about Web App UI bug. Submit button is not visible on Firefox browser.
0.454 Ticket T002 about Web App performance issue. Dashboard loads very slowly during peak hours.


In [24]:
# Synonym test: "latency" is not in any row; T002 describes the dashboard
# loading "very slowly".
query = "All tickets about application latency"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.687 Ticket T002 about Web App performance issue. Dashboard loads very slowly during peak hours.
0.622 Ticket T005 about API authentication issue. API returns 401 error for valid API keys.
0.608 Ticket T003 about Mobile App crash issue. App crashes when uploading large images.
0.575 Ticket T004 about Web App UI bug. Submit button is not visible on Firefox browser.
0.553 Ticket T001 about Mobile App login issue. User cannot log in after resetting password.


In [25]:
# Inference test: no row mentions memory explicitly, so any match has to come
# from related wording in the ticket text.
query = "All tickets about memory issues"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.677 Ticket T003 about Mobile App crash issue. App crashes when uploading large images.
0.653 Ticket T002 about Web App performance issue. Dashboard loads very slowly during peak hours.
0.6 Ticket T005 about API authentication issue. API returns 401 error for valid API keys.
0.57 Ticket T001 about Mobile App login issue. User cannot log in after resetting password.
0.544 Ticket T004 about Web App UI bug. Submit button is not visible on Firefox browser.


In [26]:
# Numeric comparison (status > 400). Only T005 mentions a status code (401).
query = "Any ticket with >400 status error?"
query_embedding = model.encode(query, normalize_embeddings=True)

get_semantic_similarity_results(query_embedding)

0.739 Ticket T005 about API authentication issue. API returns 401 error for valid API keys.
0.676 Ticket T003 about Mobile App crash issue. App crashes when uploading large images.
0.657 Ticket T002 about Web App performance issue. Dashboard loads very slowly during peak hours.
0.646 Ticket T001 about Mobile App login issue. User cannot log in after resetting password.
0.585 Ticket T004 about Web App UI bug. Submit button is not visible on Firefox browser.
