In [None]:
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding

In [None]:
# Embedding model used for every request in this notebook.
embedding_model = "text-embedding-ada-002"

# Tokenizer encoding that matches text-embedding-ada-002.
embedding_encoding = "cl100k_base"

# Per-text token budget; kept slightly under the model's 8191-token maximum.
max_tokens = 8000

In [None]:
# Source CSV; only the "text" and "intent" columns are used by later cells.
input_datapath = r"Sheet1.csv"

# Read the sheet, keep the two columns of interest, and discard any row
# that has a missing value in either of them.
df = pd.read_csv(input_datapath)[["text", "intent"]].dropna()

In [None]:
# Tokenizer for the configured encoding; used only to count tokens per text.
encoding = tiktoken.get_encoding(embedding_encoding)


def _count_tokens(text):
    """Return the number of tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


# Record the token count of every text, then keep only the rows that fit
# within the `max_tokens` budget (longer texts are not embedded).
df["n_tokens"] = df["text"].apply(_count_tokens)
within_budget = df["n_tokens"] <= max_tokens
df = df[within_budget]

In [None]:
# Request one embedding per row of `df.text` from `embedding_model`.
# `assign` builds a new frame rather than writing a column into `df`, which at
# this point is the result of a boolean filter; writing into such a frame
# in place can trigger pandas' SettingWithCopyWarning.
df = df.assign(
    embedding=df.text.apply(lambda x: get_embedding(x, engine=embedding_model))
)

# Persist the texts together with their embeddings so later cells can reload
# them from disk without calling the embedding API again.
df.to_csv("embedded.csv")

In [None]:
from ast import literal_eval

import numpy as np
import pandas as pd

# load data
# `datafile_path` must be the path itself: the previous code stored the result
# of pd.read_csv here and then passed that DataFrame to pd.read_csv again,
# which fails because a DataFrame is not a valid path or buffer.
datafile_path = r"embedded.csv"

df = pd.read_csv(datafile_path)

# Each embedding comes back from the CSV as the text of a Python list.
# literal_eval parses list/number literals only, so unlike eval it cannot
# execute arbitrary code that might be present in the file.
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)

# Stack the per-row vectors into one 2-D array (one row per text).
matrix = np.vstack(df.embedding.values)
matrix.shape

In [None]:
from sklearn.cluster import KMeans

# Use one cluster per distinct value found in the "intent" column.
n_clusters = len(np.unique(df["intent"]))

# Fit k-means on the embedding matrix; the fixed random_state keeps the
# clustering repeatable between runs.
kmeans = KMeans(
    n_clusters=n_clusters,
    init="k-means++",
    random_state=42,
).fit(matrix)

# Keep the fitted results under short names and tag every row of `df` with
# the cluster it was assigned to.
centers = kmeans.cluster_centers_
labels = kmeans.labels_
df["Cluster"] = labels

In [None]:
# Number of components in each cluster centre (second axis of `centers`).
centers.shape[1]

In [None]:
import random
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Reduce the embedding matrix to two dimensions so it can be drawn.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="pca", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

plt.figure(figsize=(20,16))

# Column 0 / column 1 of the projection are the plot coordinates. Slicing once
# here avoids rebuilding an array from a Python list on every loop iteration.
x = vis_dims2[:, 0]
y = vis_dims2[:, 1]

# One random hex colour per cluster. A dedicated, seeded generator makes the
# colours identical on every run without touching the global `random` state.
color_rng = random.Random(42)
colors = [
    '#' + ''.join(color_rng.choice('0123456789ABCDEF') for _ in range(6))
    for _ in range(len(set(df.Cluster)))
]

for i, color in enumerate(colors):
    # Boolean mask selecting the points assigned to cluster `i`.
    in_cluster = (df.Cluster == i).to_numpy()
    xs = x[in_cluster]
    ys = y[in_cluster]
    plt.scatter(xs, ys, color=color, alpha=0.3)

    # Mark the mean position of this cluster. This call has to live inside the
    # loop: when it sat after the loop only the last cluster got a marker.
    avg_x = xs.mean()
    avg_y = ys.mean()
    plt.scatter(avg_x, avg_y, color=color, s=10)

plt.title("Clusters identified visualized in language 2d using t-SNE")
plt.show()
