In [1]:
import openai
import json
import numpy as np
from numpy.linalg import norm
import requests
import csv
import tiktoken
from pprint import pprint
from collections import defaultdict
# Read the OpenAI API key from the local "api_key" file. The context manager
# closes the handle, and strip() removes surrounding whitespace such as a
# trailing newline, which would otherwise be embedded in the Authorization
# header built from this value in get_embedding.
with open("api_key") as key_file:
   api_key = key_file.read().strip()
openai.api_key = api_key
def get_embedding(text, model="text-embedding-ada-002", max_tokens=8191, timeout=60):
   """Return the embedding vector for `text` from the OpenAI embeddings endpoint.

   The text is shortened from the end, 100 characters at a time, until its
   token count under the model's tiktoken encoding is at most `max_tokens`.

   Args:
      text: String to embed.
      model: Embedding model name; also selects the tiktoken encoding.
      max_tokens: Largest token count that is sent to the API. Defaults to 8191.
      timeout: Seconds to wait for the HTTP request before giving up.

   Returns:
      The value found at ['data'][0]['embedding'] in the JSON response.

   Raises:
      requests.HTTPError: If the API answers with an error status code.
      requests.Timeout: If the request does not complete within `timeout`.
   """
   encoder = tiktoken.encoding_for_model(model)
   # Trim from the end until the text fits; an empty string encodes to zero
   # tokens, so the loop always terminates.
   while len(encoder.encode(text)) > max_tokens:
      text = text[:-100]

   response = requests.post(
      'https://api.openai.com/v1/embeddings',
      headers={
         'Content-Type': 'application/json',
         'Authorization': "Bearer {}".format(api_key)
      },
      json={
         "input": text,
         "model": model
      },
      timeout=timeout,
   )
   # Fail with a descriptive HTTP error (bad key, rate limit, ...) instead of
   # an opaque KeyError on 'data' when the API returns an error payload.
   response.raise_for_status()
   return response.json()['data'][0]['embedding']

def cos_sim(a, b):
   """Cosine similarity of two vectors: dot(a, b) / (norm(a) * norm(b))."""
   dot_product = np.dot(a, b)
   magnitude = norm(a) * norm(b)
   return dot_product / magnitude

def save_json(data, filepath=r'new_data.json'):
   """Serialise `data` as JSON with a 4-space indent into `filepath`.

   The target file is opened in write mode, so existing content is replaced.
   """
   with open(filepath, 'w') as out_file:
      json.dump(obj=data, fp=out_file, indent=4)

In [None]:
import json
from collections import defaultdict
# Load every pairwise evaluation record, closing the file once it is parsed,
# then group the records by their 'article_id'.
with open('pairwise_evaluation_w_embeddings.json') as evaluation_file:
    pairwise_evaluation = json.load(evaluation_file)
pairwise_evaluation_dict = defaultdict(list)
for item in pairwise_evaluation:
    pairwise_evaluation_dict[item['article_id']].append(item)

In [None]:
from pprint import pprint
# Keep only the first evaluation record of each article and persist the
# resulting {article_id: record} mapping.
writer_samples = {
    article_id: records[0]
    for article_id, records in pairwise_evaluation_dict.items()
}
save_json(writer_samples, 'pairwise_samples.json')


In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from scipy import spatial
import math 
def compute_transformation(X, Y):
    """Return the element-wise difference ``X - Y`` as a NumPy array.

    Both arguments are converted with ``np.array`` before subtracting. The
    result must have the same shape as ``X``; otherwise an AssertionError is
    raised.
    """
    source = np.array(X)
    target = np.array(Y)

    # Alternatives noted as not working: X / Y, tanh(X) / Y, and replacing X
    # by sigmoid(X) or exp(-X**2).
    difference = source - target

    # Broadcasting against Y must not have changed the shape of X.
    assert source.shape == difference.shape
    return difference

def _compute_transformation(X, Y):
    X = np.array(X)
    Y = np.array(Y)
    A = np.concatenate((X, Y))
    print(A.shape)
    return A

def k_means(X, k, n_init=10, random_state=42):
    """Cluster the rows of ``X`` with k-means and return one label per row.

    Args:
        X: Array-like of samples, one row per sample.
        k: Number of clusters.
        n_init: Number of k-means initialisations. It is passed explicitly so
            the result does not depend on the installed scikit-learn's default
            and no warning about that default is emitted.
        random_state: Seed handed to ``KMeans``.

    Returns:
        Array holding, for each row of ``X``, the index of its nearest
        cluster centre.
    """
    X = np.array(X)
    kmeans = KMeans(n_clusters=k, n_init=n_init, random_state=random_state)
    kmeans.fit(X)

    # Cluster centres found by the fit.
    cluster_centers = kmeans.cluster_centers_

    # Assign every row to its nearest centre; the distances are not needed.
    labels, _ = pairwise_distances_argmin_min(X, cluster_centers)
    return labels

def cos_distance_matrix(X):
    """Return the matrix of pairwise cosine distances between the rows of ``X``.

    Args:
        X: 2-D array-like with one vector per row.

    Returns:
        ``(n, n)`` float array ``D`` where ``D[i][j]`` is the cosine distance
        between row ``i`` and row ``j``. An input without rows yields an
        empty ``(0, 0)`` array.
    """
    X = np.asarray(X, dtype=float)
    if X.shape[0] == 0:
        return np.zeros((0, 0))
    # One vectorised call computes every pair, without per-pair output.
    return spatial.distance.cdist(X, X, metric='cosine')

def labels_to_clusters(labels):
    """Group positions by label.

    Returns a ``defaultdict(list)`` mapping each distinct label to the list
    of indices, in ascending order, at which it occurs in ``labels``.
    """
    members_by_label = defaultdict(list)
    for position, cluster_label in enumerate(labels):
        members = members_by_label[cluster_label]
        members.append(position)
    return members_by_label

In [None]:
# Load the per-article samples, closing the file once it is parsed.
with open('pairwise_samples.json') as samples_file:
    pairwise_evaluation = json.load(samples_file)
better_summary_embeddings = [sample['writer_summary_embedding'] if sample['overall_writer_better'] else sample['llm_summary_embedding'] for sample in pairwise_evaluation.values() ]
worse_summary_embeddings = [sample['writer_summary_embedding'] if not sample['overall_writer_better'] else sample['llm_summary_embedding'] for sample in pairwise_evaluation.values() ]
better_transformations = []
worse_transformations = []
# `transformations` has to be created here: the loop appends to it, and a
# fresh kernel would otherwise stop with a NameError.
transformations = []
for sample in pairwise_evaluation.values():
    # Only the writer-summary transformation is collected for each sample.
    A2 = _compute_transformation(sample['full_embedding'], sample['writer_summary_embedding'])
    transformations.append(A2)
print(len(transformations))

In [3]:
# Load the per-article samples, closing the file once it is parsed.
with open('pairwise_samples.json') as samples_file:
    pairwise_evaluation = json.load(samples_file)

# For every sample build the (article embedding, summary embedding) pair of
# the summary marked better and of the one marked worse, in a single pass.
better_summary_embeddings = []
worse_summary_embeddings = []
for sample in pairwise_evaluation.values():
    writer_pair = (sample['full_embedding'], sample['writer_summary_embedding'])
    llm_pair = (sample['full_embedding'], sample['llm_summary_embedding'])
    if sample['overall_writer_better']:
        better_summary_embeddings.append(writer_pair)
        worse_summary_embeddings.append(llm_pair)
    else:
        better_summary_embeddings.append(llm_pair)
        worse_summary_embeddings.append(writer_pair)

# Article-minus-summary differences for each group.
better_transformations = [compute_transformation(full_embedding, summary_embedding) for full_embedding, summary_embedding in better_summary_embeddings]
worse_transformations = [compute_transformation(full_embedding, summary_embedding) for full_embedding, summary_embedding in worse_summary_embeddings]

# Cluster each group into 5 clusters and map every label to its sample indices.
better_transformation_kmeans = k_means(np.array(better_transformations), 5)
worse_transformation_kmeans = k_means(np.array(worse_transformations), 5)
better_transformation_clusters = labels_to_clusters(better_transformation_kmeans)
worse_transformation_clusters = labels_to_clusters(worse_transformation_kmeans)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [4]:
# Show the cluster label -> sample indices mapping of the better-summary transformations.
pprint(better_transformation_clusters)

defaultdict(<class 'list'>,
            {0: [0,
                 1,
                 3,
                 4,
                 6,
                 7,
                 10,
                 12,
                 13,
                 15,
                 16,
                 17,
                 18,
                 19,
                 22,
                 23,
                 25,
                 27,
                 28,
                 29,
                 31,
                 32,
                 33,
                 34,
                 39,
                 40,
                 42,
                 44,
                 46,
                 47,
                 48,
                 51,
                 53,
                 54,
                 56,
                 60,
                 61,
                 62,
                 65,
                 67,
                 68,
                 71,
                 72,
                 75],
             1: [49],
             2: [2,
           

In [5]:
# Show the cluster label -> sample indices mapping of the worse-summary transformations.
pprint(worse_transformation_clusters)

defaultdict(<class 'list'>,
            {0: [23, 25, 41],
             1: [1,
                 5,
                 8,
                 10,
                 12,
                 19,
                 20,
                 24,
                 26,
                 29,
                 30,
                 35,
                 36,
                 39,
                 42,
                 43,
                 44,
                 45,
                 47,
                 51,
                 52,
                 56,
                 57,
                 58,
                 62,
                 64,
                 65,
                 72],
             2: [0,
                 2,
                 3,
                 4,
                 6,
                 7,
                 9,
                 11,
                 13,
                 14,
                 15,
                 16,
                 17,
                 18,
                 21,
                 22,
                 27,
      

In [22]:
# Cluster the collected transformations into 5 groups.
transformation_kmeans = k_means(np.array(transformations), 5)
print("transformation kmeans done")

# Cluster the full-article embeddings into 5 groups.
full_embedding_rows = [sample['full_embedding'] for sample in pairwise_evaluation.values()]
full_kmeans = k_means(np.array(full_embedding_rows), 5)
print("full kmeans done")

# Cluster the writer-summary embeddings into 5 groups.
writer_summary_rows = [sample['writer_summary_embedding'] for sample in pairwise_evaluation.values()]
writer_summary_kmeans = k_means(np.array(writer_summary_rows), 5)
print("writer summary kmeans done")

  super()._check_params_vs_input(X, default_n_init=10)


transformation kmeans done
full kmeans done
writer summary kmeans done


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [23]:
# Turn each label array into a {cluster label: [sample indices]} mapping.
transformation_clusters, full_clusters, writer_summary_clusters = (
    labels_to_clusters(cluster_labels)
    for cluster_labels in (transformation_kmeans, full_kmeans, writer_summary_kmeans)
)

In [24]:
# Show the cluster label -> sample indices mapping of the transformation clustering.
pprint(transformation_clusters)

defaultdict(<class 'list'>,
            {0: [16, 28, 29, 43, 45, 56],
             1: [0, 23, 25, 26, 33, 39, 41, 44, 50, 51, 53, 65, 66],
             2: [2,
                 12,
                 13,
                 20,
                 21,
                 24,
                 30,
                 34,
                 37,
                 38,
                 42,
                 49,
                 52,
                 58,
                 68,
                 71,
                 72,
                 75],
             3: [1,
                 3,
                 6,
                 7,
                 10,
                 11,
                 17,
                 18,
                 19,
                 22,
                 27,
                 31,
                 32,
                 36,
                 40,
                 47,
                 48,
                 54,
                 61,
                 69,
                 70,
                 73],
             4: [4,
    

In [9]:
# Show the cluster label -> sample indices mapping of the full-article embedding clustering.
pprint(full_clusters)

defaultdict(<class 'list'>,
            {0: [4, 32, 35, 46, 54, 59, 65],
             1: [0,
                 5,
                 8,
                 9,
                 23,
                 25,
                 28,
                 33,
                 39,
                 41,
                 45,
                 50,
                 51,
                 53,
                 60,
                 63,
                 66,
                 67,
                 74],
             2: [3, 7, 10, 17, 18, 22, 27, 31, 36, 40, 43, 44, 57, 62, 64, 73],
             3: [2,
                 12,
                 20,
                 21,
                 24,
                 26,
                 30,
                 34,
                 37,
                 38,
                 42,
                 49,
                 52,
                 58,
                 68,
                 71,
                 72,
                 75],
             4: [1, 6, 11, 13, 14, 15, 16, 19, 29, 47, 48, 55, 56, 61, 69

In [16]:
# Show the cluster label -> sample indices mapping of the writer-summary embedding clustering.
pprint(writer_summary_clusters)

defaultdict(<class 'list'>,
            {0: [2, 12, 13, 21, 24, 34, 35, 37, 38, 42, 71],
             1: [0,
                 5,
                 8,
                 9,
                 15,
                 16,
                 23,
                 25,
                 28,
                 33,
                 39,
                 41,
                 44,
                 50,
                 51,
                 53,
                 60,
                 63,
                 66,
                 67,
                 74],
             2: [4, 6, 11, 14, 19, 22, 26, 43, 45, 46, 55, 57, 59, 65, 69],
             3: [18, 20, 30, 36, 49, 52, 58, 68, 72, 75],
             4: [1,
                 3,
                 7,
                 10,
                 17,
                 27,
                 29,
                 31,
                 32,
                 40,
                 47,
                 48,
                 54,
                 56,
                 61,
                 62,
      

In [15]:
# Print the writer summaries of a fixed set of samples; these indices are the
# ones listed under cluster 0 in the `full_clusters` output of this notebook.
samples_in_order = list(pairwise_evaluation.values())
for sample_index in (4, 32, 35, 46, 54, 59, 65):
    print(samples_in_order[sample_index]['writer_summary'])

Lizzi, an obese mother of six, wouldn't have realized how big she got until a kid on the bus mistook her for being pregnant. She lived on a junk food booze diet, but after hearing that comment, Lizzi was inspired to lose weight. Since then, both her physical and mental health has significantly improved. 
An England investigation found that at least nine Clinical Commissioning Groups were offering doctors incentives to reduce the number of outpatient referrals and follow-ups. This is in an effort by the NHS to reduce inappropriate referrals and cut health service expenses. 
Mead, an alcoholic beverage made from honey is gaining popularity in the United States. The drink is classified as wine by the Alcohol and Tobacco Tax and Trade Bureau. Mead labels define it as "honey wine", giving the consumer the impression that every mead is sweet, but it can also be flavored with fruits, herbs, and spicy peppers. 
A woman claiming to have undergone repeated plastic surgery after her husband divor

In [None]:
print(list(pairwise_evaluation.values())[16]['writer_summary'])
print(list(pairwise_evaluation.values())[28]['writer_summary'])
print(list(pairwise_evaluation.values())[29]['writer_summary'])
print(list(pairwise_evaluation.values())[43]['writer_summary'])
print(list(pairwise_evaluation.values())[45]['writer_summary'])
print(list(pairwise_evaluation.values())[56]['writer_summary'])
print("==================")
print(list(pairwise_evaluation.values())[2]['writer_summary'])
print(list(pairwise_evaluation.values())[12]['writer_summary'])
print(list(pairwise_evaluation.values())[13]['writer_summary'])
print(list(pairwise_evaluation.values())[20]['writer_summary'])