In [28]:
import openai
import json
import numpy as np
from numpy.linalg import norm
import requests
import csv
import tiktoken
from pprint import pprint
# Read the OpenAI API key from the local "api_key" file. The context manager
# closes the file handle, and strip() drops surrounding whitespace (such as a
# trailing newline) that would otherwise be formatted into the
# "Authorization: Bearer ..." header built in get_embedding.
with open("api_key") as key_file:
    api_key = key_file.read().strip()
openai.api_key = api_key
def get_embedding(text, model="text-embedding-ada-002", max_tokens=8191, timeout=60):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Before the request is sent, the text is shortened from the end, 100
    characters at a time, until its token count under the model's tiktoken
    encoding is at most ``max_tokens``.

    Args:
        text: String to embed.
        model: Embedding model name; also used to select the tiktoken encoding.
        max_tokens: Largest token count allowed for the submitted text.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data``
        list.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the request does not complete within ``timeout``.
    """
    encoder = tiktoken.encoding_for_model(model)
    # Trim the tail in 100-character steps until the text fits the token budget.
    while len(encoder.encode(text)) > max_tokens:
        text = text[:-100]

    url = 'https://api.openai.com/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': "Bearer {}".format(api_key)
    }
    payload = {
        "input": text,
        "model": model
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    if not response.ok:
        # Surface the API's own error body; indexing the payload below would
        # otherwise fail with an uninformative KeyError on 'data'.
        raise requests.HTTPError(
            "Embedding request failed ({}): {}".format(response.status_code, response.text),
            response=response,
        )
    return response.json()['data'][0]['embedding']

def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    dot_product = np.dot(a, b)
    norm_product = norm(a) * norm(b)
    return dot_product / norm_product

def save_json(data, filepath=r'new_data.json'):
    """Serialize ``data`` as JSON (4-space indent) into the file at ``filepath``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filepath, 'w') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

In [None]:
# Embed the article text, the writer summary and the text-davinci-002 summary
# of every pairwise-evaluation record, attach the three embeddings to the
# record, and save the augmented records to a new JSON file.
with open('benchmark_llm_summarization/pairwise_evaluation_results.json') as results_file:
    llm_summ_articles = json.load(results_file)
length = len(llm_summ_articles)
for index, article in enumerate(llm_summ_articles):
    print("Processing article {} of {}".format(index, length))
    # All three embeddings are fetched before the record is modified, so a
    # failed request leaves the current record untouched.
    full_embedding = get_embedding(article['article_text'])
    writer_summary_embedding = get_embedding(article['writer_summary'])
    llm_summary_embedding = get_embedding(article['text-davinci-002_summary'])
    article['full_embedding'] = full_embedding
    article['writer_summary_embedding'] = writer_summary_embedding
    article['llm_summary_embedding'] = llm_summary_embedding
save_json(llm_summ_articles, 'benchmark_llm_summarization/pairwise_evaluation_w_embeddings.json')

In [None]:
# All the News Summ
# Load the precomputed full-article and summary embedding records; the context
# managers close both files once they are parsed.
with open('./full/full_embedding.json') as full_file:
    full_embeddings = json.load(full_file)
with open('./summary/summary_embedding.json') as summary_file:
    summary_embeddings = json.load(summary_file)
# id -> text lookups, used to display articles and summaries.
full_article_dict = {record['id']: record['content'] for record in full_embeddings}
summary_article_dict = {record['id']: record['summary'] for record in summary_embeddings}
# align full and summary
ori_full_embeddings_dict = {record['id']: record['embedding'] for record in full_embeddings}
summary_embeddings_dict = {record['id']: record['embedding'] for record in summary_embeddings}
# Keep only the full-article embeddings whose id also has a summary embedding.
full_embeddings_dict = {
    article_id: embedding
    for article_id, embedding in ori_full_embeddings_dict.items()
    if article_id in summary_embeddings_dict
}
len(full_embeddings_dict), len(summary_embeddings_dict)

In [None]:
from collections import defaultdict
# llm summ
# Rebuilds full_embeddings_dict / full_article_dict / summary_embeddings_dict
# from the benchmark file. Each article id maps to a list of
# (summary text, summary embedding) pairs, one per record with that id.
full_embeddings_dict = {}
full_article_dict = {}
summary_embeddings_dict = defaultdict(list)
with open('benchmark_llm_summarization/embeddings.json') as embeddings_file:
    articles = json.load(embeddings_file)
for article in articles:
    full_embeddings_dict[article['article_id']] = article['full_embedding']
    full_article_dict[article['article_id']] = article['article']
    summary_embeddings_dict[article['article_id']].append((article['summary'], article['summary_embedding']))
print(len(full_embeddings_dict), len(summary_embeddings_dict))

In [None]:
# Remove every article that has at most one summary from both lookups.
# The loop variable is named article_id rather than id: a module-level
# `for id in ...` would rebind the builtin id() for the rest of the session.
del_ids = [article_id for article_id, e_summs in summary_embeddings_dict.items() if len(e_summs) <= 1]
for article_id in del_ids:
    del summary_embeddings_dict[article_id]
    del full_embeddings_dict[article_id]
print(len(full_embeddings_dict), len(summary_embeddings_dict))

In [None]:
# Split each article's list of summary entries into separate lookups for the
# entry at index 0 and the entry at index 1. Within an entry, position 0 is
# used as the summary text and position 1 as the summary embedding.
summary_article_dict_0 = {key: entries[0][0] for key, entries in summary_embeddings_dict.items()}
summary_embeddings_dict_0 = {key: entries[0][1] for key, entries in summary_embeddings_dict.items()}
summary_article_dict_1 = {key: entries[1][0] for key, entries in summary_embeddings_dict.items()}
summary_embeddings_dict_1 = {key: entries[1][1] for key, entries in summary_embeddings_dict.items()}

In [None]:
def linear_regression(full_embeddings_dict, summary_embeddings_dict):
    """Fit a linear map (no intercept) from full embeddings to summary embeddings.

    Rows of X are the values of ``full_embeddings_dict`` and rows of Y are the
    values of ``summary_embeddings_dict`` for the same ids, in the iteration
    order of ``full_embeddings_dict``. W is the least-squares solution of
    ``X @ W ~= Y``.

    Args:
        full_embeddings_dict: Mapping id -> embedding vector (regression input).
        summary_embeddings_dict: Mapping id -> embedding vector (regression
            target). Must contain every id of ``full_embeddings_dict``.

    Returns:
        Tuple ``(X, Y, W, Y_pred, residuals, ids)`` where ``Y_pred = X @ W``,
        ``residuals = Y - Y_pred`` and ``ids[i]`` is the id of row ``i``.

    Raises:
        KeyError: If an id of ``full_embeddings_dict`` is missing from
            ``summary_embeddings_dict``.
    """
    ids = list(full_embeddings_dict)
    X = np.array([full_embeddings_dict[article_id] for article_id in ids])
    Y = np.array([summary_embeddings_dict[article_id] for article_id in ids])
    # lstsq solves the same problem as inv(X.T @ X) @ X.T @ Y, but it stays
    # well-defined when X.T @ X is singular (for example fewer rows than
    # columns), where it returns the minimum-norm solution.
    W, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)
    Y_pred = X @ W
    residuals = Y - Y_pred
    return X, Y, W, Y_pred, residuals, ids

# outlier detection
def outlier_deviation(residuals, outlier_threshold=48):
    """Flag rows of ``residuals`` whose Mahalanobis-style distance is large.

    The distance of each row is ``sqrt(r @ C^+ @ r)``, where ``C`` is the
    covariance of the columns of ``residuals`` and ``C^+`` its pseudo-inverse.
    Rows are measured from the origin; they are not re-centred on their mean.

    Args:
        residuals: 2-D array-like with one residual vector per row.
        outlier_threshold: Rows with a distance strictly above this value are
            reported as outliers.

    Returns:
        Array of row indices of the outliers. The number of outliers is also
        printed.
    """
    residuals = np.asarray(residuals)
    # atleast_2d: np.cov returns a 0-d value when there is a single column.
    covariance_matrix = np.atleast_2d(np.cov(residuals, rowvar=False))
    # The pseudo-inverse equals the inverse for a non-singular covariance and
    # stays defined when the covariance is singular (e.g. fewer rows than
    # columns), where np.linalg.inv would raise.
    precision = np.linalg.pinv(covariance_matrix)
    squared_distances = np.sum(residuals @ precision * residuals, axis=1)
    # Clamp tiny negative values caused by rounding before taking the root.
    mahalanobis_distances = np.sqrt(np.maximum(squared_distances, 0.0))
    outliers = np.where(mahalanobis_distances > outlier_threshold)[0]
    print(len(outliers))
    return outliers

def outlier_distance(X, Y, outlier_threshold=0.1):
    """Flag row pairs of ``X`` and ``Y`` whose cosine distance is large.

    The distance of pair ``i`` is ``1 - cos_sim(X[i], Y[i])``.

    Args:
        X: Sequence of vectors.
        Y: Sequence of vectors paired row by row with ``X``.
        outlier_threshold: Pairs with a distance strictly above this value are
            reported as outliers.

    Returns:
        Array of indices of the outlying pairs. The count and the indices are
        also printed.
    """
    # distances = np.sqrt(np.sum((X - Y) ** 2, axis=1)) # euclidean distance
    distances = np.array([1 - cos_sim(x_row, y_row) for x_row, y_row in zip(X, Y)])
    outliers = np.where(distances > outlier_threshold)[0]
    print(len(outliers), outliers)
    return outliers

def k_nn(k, X, x):
    """Return the indices of the ``k`` rows of ``X`` closest to ``x``.

    Closeness is the Euclidean norm of ``X[i] - x``; indices are ordered from
    nearest to farthest.
    """
    offsets = X - x
    row_distances = np.linalg.norm(offsets, axis=1)
    nearest_first = np.argsort(row_distances)
    return nearest_first[:k]

def compare_article(id, full_article_dict, summary_article_dict):
    """Print the summary stored under ``id``, a separator line, then the full article."""
    summary_text = summary_article_dict[id]
    print(summary_text)
    print("==========================")
    full_text = full_article_dict[id]
    print(full_text)

In [None]:
# Run the same regression / outlier report / side-by-side print twice: first
# against the index-0 summaries, then against the index-1 summaries. After the
# loop, X, Y, W, Y_pred, residuals and ids hold the results of the second run.
for summary_embeddings, summary_texts in (
    (summary_embeddings_dict_0, summary_article_dict_0),
    (summary_embeddings_dict_1, summary_article_dict_1),
):
    X, Y, W, Y_pred, residuals, ids = linear_regression(full_embeddings_dict, summary_embeddings)
    outlier_distance(X, Y)
    compare_article(ids[2], full_article_dict, summary_texts)

In [None]:
# Look up the three rows of X closest to the predicted vector of one chosen row.
# NOTE(review): "nerghborhood" looks like a typo of "neighborhood"; the name is
# kept as-is in case other cells reference it.
outlier_index = 2
outlier_y = Y_pred[outlier_index]
outlier_nerghborhood = k_nn(k=3, X=X, x=outlier_y)
print(outlier_nerghborhood)

In [29]:
import json
import math
# Embed the article and the summary of every likert-evaluation record that has
# a summary, then save the augmented records to a new JSON file.
with open('benchmark_llm_summarization/likert_evaluation_results.json') as likert_file:
    likert_articles = json.load(likert_file)
# Records whose summary is a float are skipped (presumably NaN placeholders for
# missing summaries -- TODO confirm against the data file).
has_summary = [article for article in likert_articles if type(article['summary']) is not float]
# text -> embedding caches, so a text that appears in several records is only
# sent to the API once.
full_embedding_dict = {}
summary_embedding_dict = {}
total = len(has_summary)
for index, article in enumerate(has_summary):
    print("Processing article {} of {}".format(index, total))
    full_text = article['article']
    summary = article['summary']
    # Store each new embedding in its cache; without this the lookups never
    # hit and every record triggers two fresh API requests.
    if full_text not in full_embedding_dict:
        full_embedding_dict[full_text] = get_embedding(full_text)
    full_embedding = full_embedding_dict[full_text]
    if summary not in summary_embedding_dict:
        summary_embedding_dict[summary] = get_embedding(summary)
    summary_embedding = summary_embedding_dict[summary]
    article['full_embedding'] = full_embedding
    article['summary_embedding'] = summary_embedding
save_json(has_summary, 'benchmark_llm_summarization/likert_evaluation_w_embeddings.json')


Processing article 0 of 10947
Processing article 1 of 10947
Processing article 2 of 10947
Processing article 3 of 10947
Processing article 4 of 10947
Processing article 5 of 10947
Processing article 6 of 10947
Processing article 7 of 10947
Processing article 8 of 10947
Processing article 9 of 10947
Processing article 10 of 10947
Processing article 11 of 10947
Processing article 12 of 10947
Processing article 13 of 10947
Processing article 14 of 10947
Processing article 15 of 10947
Processing article 16 of 10947
Processing article 17 of 10947
Processing article 18 of 10947
Processing article 19 of 10947
Processing article 20 of 10947
Processing article 21 of 10947
Processing article 22 of 10947
Processing article 23 of 10947
Processing article 24 of 10947
Processing article 25 of 10947
Processing article 26 of 10947
Processing article 27 of 10947
Processing article 28 of 10947
Processing article 29 of 10947
Processing article 30 of 10947
Processing article 31 of 10947
Processing article

KeyboardInterrupt: 