In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [2]:
!pip install cohere

Collecting cohere
  Downloading cohere-5.19.0-py3-none-any.whl.metadata (3.4 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.12.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (5.8 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.4.20250913-py3-none-any.whl.metadata (2.0 kB)
Downloading cohere-5.19.0-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.0/303.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading fastavro-1.12.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_requests-

In [3]:
import os
from getpass import getpass

import cohere

# Read the API key from the CO_API_KEY environment variable and fall back to an
# interactive prompt, so the secret never lives in the notebook source.
# Get your free API key: https://dashboard.cohere.com/api-keys
# NOTE(review): a literal key was previously stored in this cell; treat it as
# leaked and revoke/rotate it in the Cohere dashboard.
co = cohere.ClientV2(os.environ.get("CO_API_KEY") or getpass("Cohere API key: "))

In [4]:
# Read the ATIS intents training file; it has no header row, so name the columns here
df_orig = pd.read_csv(
    'https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/atis_intents_train.csv',
    names=['intent','query'],
)

# Draw a reproducible 10% sample and keep only three intent classes for illustration
sample_classes = ['atis_airfare', 'atis_airline', 'atis_ground_service']
df = df_orig.sample(frac=0.1, random_state=30)
df = df.loc[df['intent'].isin(sample_classes)]

# Remove the sampled rows from the full frame (must happen before the index is reset)
df_orig = df_orig.drop(index=df.index)
df = df.reset_index(drop=True)

# Keep the labels aside for later use, then leave only the query text in `df`
intents = df['intent']
df = df.drop(columns=['intent'])

In [13]:
def get_embeddings(texts, model='embed-english-v3.0', input_type="search_document", batch_size=96):
    """Embed a collection of texts with the Cohere client `co`.

    The texts are sent in consecutive chunks of at most `batch_size` items and
    the per-chunk results are concatenated, so the output order matches the
    input order regardless of how many requests were needed.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the embedding model passed to `co.embed`.
        input_type: Input type passed to `co.embed` (the notebook uses
            "search_document", "search_query" and "clustering").
        batch_size: Maximum number of texts per request. Defaults to 96, the
            per-call limit documented for the Cohere Embed endpoint.

    Returns:
        A list with one float embedding per input text. An empty input yields
        an empty list without calling the API.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = co.embed(
            model=model,
            input_type=input_type,
            texts=texts[start:start + batch_size]
        )
        # The response groups embeddings by type; take the float vectors
        embeddings.extend(response.embeddings.float)
    return embeddings

In [14]:
# Embed every query (default input type) and store one vector per row
df['query_embeds'] = get_embeddings(df['query'].to_list())

In [18]:
# Collect the per-row embedding lists into a single NumPy array
embeds = np.array(df['query_embeds'].to_list())

In [15]:
# Quick sanity check of the embedding column
print(df.shape[0])                          # number of DataFrame rows
print(df['query_embeds'].shape[0])          # must match the number of rows
print(type(df['query_embeds'].iloc[0]))     # container type of one embedding
print(len(df['query_embeds'].iloc[0]))      # embedding dimension

91
91
<class 'list'>
1024


In [16]:
# The free-text question we want to match against the stored queries
new_query = "How can I find a taxi or a bus when the plane lands?"

# Embed it as a search query; a one-item list goes in, so take the first vector out
new_query_embeds = get_embeddings(texts=[new_query], input_type="search_query")[0]

In [22]:
# The similarity search is run in the next cell, right after `get_similarity`
# is defined. Calling it here raised NameError on a fresh kernel, and its
# result was overwritten by that later call anyway.

In [23]:
# Calculate cosine similarity between the search query and existing queries
def get_similarity(target, candidates):
    """Rank candidate vectors by cosine similarity to a target vector.

    Args:
        target: One embedding vector (list or 1-D array).
        candidates: Sequence of embedding vectors (list of lists or 2-D array).

    Returns:
        A list of `(index, score)` tuples sorted from most to least similar,
        where `index` is the position of the candidate in `candidates` and
        `score` is its cosine similarity to `target`. A list (rather than a
        one-shot iterator) is returned so the result can be looped over more
        than once. No candidates yields an empty list.
    """
    candidates = np.asarray(candidates)
    if candidates.size == 0:
        return []

    # cosine_similarity expects 2-D inputs, so make the target a single row
    target = np.asarray(target).reshape(1, -1)

    # Row 0 holds the target's score against every candidate. Indexing the row
    # (instead of squeezing) keeps a 1-D array even with a single candidate.
    scores = cosine_similarity(target, candidates)[0]

    # Ascending argsort reversed gives best match first
    order = np.argsort(scores)[::-1]
    return [(int(idx), float(scores[idx])) for idx in order]

# Number of leading rows of `embeds` to search over.
# NOTE(review): `sample` was never assigned anywhere in this notebook, so this
# cell failed on a fresh kernel. The value 10 is inferred from the ten results
# in the recorded output — confirm the intended value.
sample = 10

# Get the similarity between the search query and existing queries
similarity = get_similarity(new_query_embeds, embeds[:sample])

In [24]:
# Print the query, then every scored document in the order `similarity` yields them
print('Query:')
print(new_query,'\n')

print('Most Similar Documents:')
for row_pos, score in similarity:
    # `row_pos` is a positional index into `df`
    matched_text = df['query'].iloc[row_pos]
    print(f'Similarity: {score:.2f};', matched_text)

Query:
How can I find a taxi or a bus when the plane lands? 

Most Similar Documents:
Similarity: 0.37;  show me a list of ground transportation at boston airport
Similarity: 0.36;  what ground transportation is available in boston
Similarity: 0.33;  show me boston ground transportation
Similarity: 0.27;  show me the airlines that fly between toronto and denver
Similarity: 0.27;  which airlines fly between boston and pittsburgh
Similarity: 0.25;  which airlines fly from boston to washington dc via other cities
Similarity: 0.24;  of all airlines which airline has the most arrivals in atlanta
Similarity: 0.18;  i'd like the lowest fare from denver to pittsburgh
Similarity: 0.17;  show me round trip first class tickets from new york to miami
Similarity: 0.17;  i would like your rates between atlanta and boston on september third


CLUSTERING

In [25]:
# Re-embed the queries with the "clustering" input type and store them per row
df['clustering_embeds'] = get_embeddings(texts=df['query'].to_list(), input_type="clustering")

# `embeds` is rebound here: from this cell on it holds the clustering vectors
embeds = np.array(df['clustering_embeds'].to_list())

In [27]:
# Project the embeddings onto their first two principal components
pca = PCA(n_components=2)
embeds_pca = pca.fit_transform(embeds)

# Keep the two components as named columns for clustering and plotting
df_pc2 = pd.DataFrame({'pc1': embeds_pca[:, 0], 'pc2': embeds_pca[:, 1]})

In [28]:
# Number of groups K-means should look for
n_clusters = 2

# Fit K-means on the 2-D PCA projection and get one cluster id per row
kmeans_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
classes = kmeans_model.fit_predict(embeds_pca).tolist()

# Attach the ids as strings to a copy of the PCA frame
df_clust = df_pc2.assign(cluster=[str(label) for label in classes])

In [29]:
# Scatter plot of the two principal components, coloured by cluster id
chart = alt.Chart(df_clust).mark_circle(size=60)
chart = chart.encode(
    x='pc1',
    y='pc2',
    color='cluster',
    tooltip=['pc1', 'pc2', 'cluster'],
)
chart = chart.properties(title='Query Clusters (PCA reduced to 2 components)')

# Enable pan/zoom, then render
chart = chart.interactive()
chart.display()