### Part 1:
I chose Qwen3-Embedding-0.6B because Qwen3-Embedding-8B achieves state-of-the-art performance, and Qwen3-Embedding-0.6B shows outstanding results for a small embedding model.
https://youtu.be/hize6rD6Afk?si=kjCVotpzqbECXFfi - a video that supports this choice

In [None]:
from llama_cpp import Llama
import numpy as np
import pandas as pd

# Load the Q8_0 GGUF build of Qwen3-Embedding-0.6B from the given repo id.
# embedding=True is required because the model is used below through
# llm.create_embedding(...).
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen3-Embedding-0.6B-GGUF",
    filename="Qwen3-Embedding-0.6B-Q8_0.gguf",
    embedding=True,
)

### Part 2:
Selecting the required columns and drawing a balanced sample of reviews per score

In [None]:
# Keep only the review text and its star rating.
data = pd.read_csv('netflix_reviews.csv')
data = data[['content', 'score']]

# Draw 400 reviews for every score value (same seed for each group, so the
# selection is reproducible). The groups are sampled explicitly instead of via
# groupby(...).apply(...): apply() operating on the grouping column is
# deprecated in recent pandas, and when the grouping column is excluded the
# 'score' column would be lost after reset_index(drop=True).
# NOTE: sample(400) raises ValueError if any score has fewer than 400 reviews.
prep_data = pd.concat(
    [group.sample(400, random_state=42) for _, group in data.groupby("score")],
    ignore_index=True,
)

# Making sure that our data is uniformly distributed
print(prep_data['score'].value_counts())

### Part 3:
Adding a new column with the embeddings produced by Qwen3

In [None]:
def _first_embedding(text):
    """Embed ``text`` with ``llm`` and return the first entry of its 'embedding' field."""
    response = llm.create_embedding(text)
    # NOTE(review): the trailing [0] assumes 'embedding' is a list of vectors
    # (e.g. one per token). If the model returns a single pooled vector, this
    # yields one float instead of a vector -- TODO confirm against the
    # installed llama_cpp version.
    return response['data'][0]['embedding'][0]


# One embedding per review, stored next to the review text and score.
prep_data['embedders'] = prep_data['content'].apply(_first_embedding)

### Part 4:
Implementing the Minkowski distance

In [None]:
# I used the Minkowski distance because its parameter p generalizes other metrics:
# p = 2   - it becomes the Euclidean distance
# p = 1   - it becomes the Manhattan distance
# p = 1.5 - an intermediate value, used here as the "Minkowski" setting
 
def minkowski_dist(x: list, y: list, p: float = 1.5):
    """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

    The distance is ``(sum_i |x_i - y_i| ** p) ** (1 / p)``; ``p = 1`` gives
    the Manhattan distance and ``p = 2`` the Euclidean distance.

    Args:
        x: First vector (any sequence of numbers).
        y: Second vector, same length as ``x``.
        p: Order of the distance, must be at least 1.

    Returns:
        The distance as a NumPy floating-point scalar.

    Raises:
        ValueError: If ``p`` is smaller than 1 or the vectors differ in length.
    """
    if p < 1:
        raise ValueError(f"Parameter p must be greater than or equal to 1 (p={p})")
    if len(x) != len(y):
        raise ValueError(f"Parameters x and y must be same size (x={len(x)}, y={len(y)})")

    # Vectorised instead of a per-element Python loop. np.power is used rather
    # than np.pow because the latter only exists in NumPy 2.0 and newer.
    diff = np.abs(np.asarray(x, dtype=float) - np.asarray(y, dtype=float))
    return np.power(np.sum(np.power(diff, p)), 1 / p)

# # Checking the correctness of the function (should be 5, since 3-4-5 is the Egyptian triangle)
# print(minkowski_dist([0, 0], [3, 4], 2))

Implementing a function that performs k-means clustering with k-means++ initialization

In [None]:
def k_means(k: int, data: list, dist_p: float, max_iterations=100, random_state=None):
    """Cluster ``data`` into ``k`` groups using k-means with k-means++ seeding.

    Distances are measured with ``minkowski_dist`` of order ``dist_p``. Each
    center is updated to the arithmetic mean of the points assigned to it; a
    cluster that ends up empty keeps its previous center.

    Args:
        k: Number of clusters; must satisfy ``1 <= k <= len(data)``.
        data: Iterable of equally sized numeric vectors (for example a list or
            a pandas Series of lists).
        dist_p: Order of the Minkowski distance (1 = Manhattan, 2 = Euclidean).
        max_iterations: Upper bound on the number of center updates.
        random_state: Optional integer seed for reproducible seeding. When
            ``None`` the global ``np.random`` state is used.

    Returns:
        A ``(centers, groups)`` tuple: ``centers`` is a list of ``k`` vectors
        and ``groups[i]`` is the list of points closest to ``centers[i]``.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of points.
    """
    # Materialise once so positional indexing works for any iterable
    # (a pandas Series would otherwise be indexed by label).
    points = list(data)
    if not 1 <= k <= len(points):
        raise ValueError(
            f"Parameter k must be between 1 and the number of points "
            f"(k={k}, points={len(points)})"
        )

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def assign(current_centers):
        # Index of the closest center for every point, in data order.
        return [
            int(np.argmin([minkowski_dist(center, point, dist_p) for center in current_centers]))
            for point in points
        ]

    def split(labels):
        # Turn per-point labels into one list of points per cluster.
        grouped = [[] for _ in range(k)]
        for point, label in zip(points, labels):
            grouped[label].append(point)
        return grouped

    # initializing center points via k-means++
    centers = [list(points[rng.randint(len(points))])]
    nearest = np.full(len(points), np.inf)
    while len(centers) < k:
        # Only the newest center can shrink a point's nearest-center distance.
        newest = centers[-1]
        nearest = np.minimum(
            nearest, [minkowski_dist(point, newest, dist_p) for point in points]
        )
        # k-means++ samples the next center proportionally to the SQUARED distance.
        weights = nearest ** 2
        total = weights.sum()
        if total > 0:
            next_idx = rng.choice(len(points), p=weights / total)
        else:
            # Every point coincides with a center: fall back to a uniform pick
            # instead of dividing by zero.
            next_idx = rng.randint(len(points))
        centers.append(list(points[next_idx]))

    labels = assign(centers)
    groups = split(labels)

    for _ in range(max_iterations):
        # finding new center as the mean of all points assigned to its group
        new_centers = [
            np.mean(np.asarray(group, dtype=float), axis=0).tolist() if group else center
            for center, group in zip(centers, groups)
        ]
        new_labels = assign(new_centers)

        # Always keep the freshly computed centers so the returned centers are
        # the means of the returned groups, also when the loop stops here.
        centers = new_centers
        if new_labels == labels:
            print("Ended with no changes in groups")
            break

        labels, groups = new_labels, split(new_labels)

    return centers, groups

Retrieving the clustering results for different distance metrics

In [None]:
# Run the clustering once per distance order (2 = Euclidean, 1.5 = Minkowski,
# 1 = Manhattan) with 5 clusters, storing each (centers, groups) tuple under
# the metric's name.
results = {
    metric: tuple(k_means(5, prep_data['embedders'], order))
    for metric, order in (("euclidian", 2), ("minkowski", 1.5), ("manhattan", 1))
}

In [None]:
# Show the (centers, groups) tuple obtained for each distance metric.
for metric in ('euclidian', 'minkowski', 'manhattan'):
    print(results[metric])