In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [6]:
# Hugging Face Hub repository id handed to `from_pretrained` when loading the model.
model_id = "microsoft/phi-1_5"
# Commit hash passed as `revision=` so the model download is pinned to one snapshot.
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest commit as of Feb 1st (year not recorded -- TODO add)

In [4]:
# Load the tokenizer from the same repository and pinned commit as the model.
# `model_id` / `model_revision` are the notebook's configuration constants; the
# name `base_model_id` is not defined in any visible cell, so using it raises
# NameError on a fresh kernel. Pinning `revision` also matters because
# trust_remote_code=True allows code from the repository to run.
# Open question kept from the original cell: pass use_fast=True?
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)
# tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████| 237/237 [00:00<00:00, 232kB/s]
vocab.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 798k/798k [00:00<00:00, 3.10MB/s]
merges.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 5.89MB/s]
tokenizer.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 2.11M/2.11M [00:00<00:00, 9.90MB/s]
added_tokens.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1.08k/1.08k [00:00<00:00, 1.68MB/s]
special_tokens_map.json: 100%|████████████████████████████████████████████████████████████████████████████████████| 99.0/99.0 [00:00<00:00, 237kB/s]


In [12]:
# Fetch the tokenizer's vocabulary as a mapping; later cells index it by token string.
vocab = tokenizer.get_vocab()

In [13]:
# Number of entries in the vocabulary mapping returned by get_vocab().
len(vocab)

50295

In [26]:
# Look up the integer id the vocabulary assigns to the token string "def".
vocab["def"]

4299

In [7]:
# Load the causal-LM checkpoint named by `model_id` at the commit `model_revision`.
# trust_remote_code=True is required because the repository ships its own
# configuration/modeling Python files, which are executed on load.
# Optional arguments that are currently disabled:
#   torch_dtype=compute_dtype
#   quantization_config=bnb_config
model = AutoModelForCausalLM.from_pretrained(
    model_id, revision=model_revision, trust_remote_code=True
)

config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 864/864 [00:00<00:00, 1.40MB/s]
configuration_phi.py: 100%|████████████████████████████████████████████████████████████████████████████████████| 9.26k/9.26k [00:00<00:00, 16.1MB/s]
modeling_phi.py: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 62.7k/62.7k [00:00<00:00, 38.3MB/s]
pytorch_model.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████| 2.84G/2.84G [03:21<00:00, 14.0MB/s]
generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████| 74.0/74.0 [00:00<00:00, 265kB/s]


In [None]:
# Variant load that lets the checkpoint decide its dtype (torch_dtype="auto").
# It goes through the shared `model_id` / `model_revision` constants instead of a
# hard-coded repo string, so the remote code executed under
# trust_remote_code=True is pinned to a fixed commit.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    torch_dtype="auto",
    trust_remote_code=True,
)

In [None]:
# Load model (dtype chosen by the checkpoint) and tokenizer together.
# Both calls use the shared `model_id` / `model_revision` constants so that the
# weights, the tokenizer files and the remote code run under
# trust_remote_code=True all come from one pinned commit.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)

In [None]:
# Clustering function

from sklearn.cluster import KMeans
import numpy as np

def cluster(data_dict, k, option='raw'):
    """Fit k-means on the vectors in ``data_dict`` and print dispersion statistics.

    Args:
        data_dict: Mapping whose values are equal-length numeric vectors.
            Only the values are used; each value is one sample.
        k: Number of clusters, passed to ``KMeans(n_clusters=k)``.
        option: Currently unused; kept so existing call sites keep working.

    Returns:
        None. Three numbers are printed:
        the mean Euclidean distance of a sample to its assigned centre,
        the mean squared distance (``inertia_ / n``), and the square root of
        that mean squared distance.

    Raises:
        ValueError: If ``data_dict`` is empty.
    """
    if len(data_dict) == 0:
        raise ValueError("data_dict must contain at least one vector")

    vectors = np.array(list(data_dict.values()))
    # Sample count. ndarray.size is an attribute, not a method, and would count
    # elements rather than rows, so the row dimension is read from the shape.
    n = vectors.shape[0]

    # n_init is explicit so the result does not depend on the installed
    # scikit-learn version's default; random_state fixes the initialisation.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(vectors)

    labels = kmeans.labels_
    centers = kmeans.cluster_centers_

    avg_squared_distance = kmeans.inertia_ / n
    sqrt_of_avg_squared_distance = np.sqrt(avg_squared_distance) # Standard deviation in a way

    # Euclidean distance per sample: sum the squared components first, take the
    # square root afterwards, then average the n distances into one scalar.
    residuals = vectors - centers[labels]
    avg_distance = np.sqrt((residuals ** 2).sum(axis=1)).mean()

    print(f"Average distance to cluster: {avg_distance}")
    print(f"Average squared distance: {avg_squared_distance}")
    print(f"Sqrt of average squared distance: {sqrt_of_avg_squared_distance}")


In [2]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

def elbow(data_dict, k_range):
    """Plot the average squared distance to the assigned centre for each k.

    For every ``k`` in ``k_range`` a k-means model is fitted on the vectors in
    ``data_dict`` and ``inertia_ / n`` is recorded. ``inertia_`` is the sum of
    squared distances of the samples to their centre, so dividing by the sample
    count ``n`` gives the average squared distance; a single curve is drawn.

    Args:
        data_dict: Mapping whose values are equal-length numeric vectors.
            Only the values are used; each value is one sample.
        k_range: Iterable of cluster counts to try. It is materialised once,
            so a one-shot iterable is accepted.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: If ``data_dict`` is empty.
    """
    if len(data_dict) == 0:
        raise ValueError("data_dict must contain at least one vector")

    vectors = np.array(list(data_dict.values()))
    n = vectors.shape[0]  # loop-invariant, so computed once
    ks = list(k_range)

    inertias_normalized = []
    for k in ks:
        # One full fit per k (with 10 restarts each): cost grows with both
        # len(ks) and k, so large grids can take a long time.
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(vectors)
        inertias_normalized.append(kmeans.inertia_ / n)

    plt.figure(figsize=(10, 6))
    # x and y have one entry per k; plotting a series of a different length
    # makes matplotlib raise, so only the populated series is drawn.
    plt.plot(ks, inertias_normalized, '-o', label='Inertia / n (average squared distance)')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Inertia / Average Squared Distance')
    plt.title('Elbow Method For Optimal k')
    plt.legend()
    plt.xticks(ks)
    plt.grid(True)
    plt.show()

    

In [5]:
def stats(data_dict, k):
    """Fit k-means and return an R^2 score plus two per-dimension averages.

    Args:
        data_dict: Mapping whose values are equal-length numeric vectors.
            Only the values are used; each value is one sample.
        k: Number of clusters, passed to ``KMeans(n_clusters=k)``.

    Returns:
        A list ``[R2, average_importance_dist, average_ordered_importance_dist]``:

        * ``R2`` -- ``1 - inertia / total_ss`` where ``total_ss`` is the sum of
          squared distances of the samples to the global mean vector.
          ``nan`` when ``total_ss`` is zero (all samples identical).
        * ``average_importance_dist`` -- mean of the vectors over the samples,
          one value per dimension.
        * ``average_ordered_importance_dist`` -- each vector is first sorted in
          descending order, then the sorted vectors are averaged over the
          samples, one value per rank.

    Raises:
        ValueError: If ``data_dict`` is empty.
    """
    if len(data_dict) == 0:
        raise ValueError("data_dict must contain at least one vector")

    vectors = np.array(list(data_dict.values()))

    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(vectors)

    # Average importance distribution: reduce over the sample axis (axis=0) so
    # the result has one entry per dimension and broadcasts against `vectors`.
    average_importance_dist = vectors.mean(axis=0)

    # R^2: the 1/n factors of the two averaged quantities cancel, so the raw
    # sums of squares are compared directly.
    total_ss = np.sum((vectors - average_importance_dist) ** 2)
    if total_ss == 0:
        R2 = float("nan")  # no variance to explain; avoid dividing by zero
    else:
        R2 = 1 - kmeans.inertia_ / total_ss

    # Average ordered importance distribution: sort each row descending, then
    # average rank-by-rank over the samples.
    sorted_vectors = np.sort(vectors, axis=1)[:, ::-1]
    average_ordered_importance_dist = sorted_vectors.mean(axis=0)

    return [R2, average_importance_dist, average_ordered_importance_dist]







In [7]:
# Generate random data

n_data = 1000
vec_dim = 819

# Seeded generator: the synthetic vectors (uniform on [0, 1)) are identical on
# every run, so downstream clustering results can be reproduced.
rng = np.random.default_rng(42)
random_data_dict = {str(i): rng.random(vec_dim) for i in range(n_data)}

# Verify the size of the dictionary and the dimensionality of a sample vector
dict_size = len(random_data_dict)
sample_vector_dimension = random_data_dict['0'].shape[0]

dict_size, sample_vector_dimension

(1000, 819)

In [11]:
# Candidate cluster counts: 10, 20, ..., 500 (50 values).
# Each value triggers a full k-means fit inside elbow(); the recorded run of
# this cell ended in a KeyboardInterrupt, so expect it to be slow.
k_list = 10 * np.arange(1, 51)

elbow(random_data_dict, k_list)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


KeyboardInterrupt: 