In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
import torch
import pickle
import csv
import os
import datasets
from pathlib import Path
import pandas as pd
import yaml

In [None]:
# model = AutoModelForCausalLM.from_pretrained("/data/bigcode-starcoder",output_hidden_states=True).half().cuda()

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("/data/bigcode-starcoder")

In [None]:
def retrieve_embedding(prompt):
    """Embed ``prompt`` as the mean of the model's last-layer hidden states.

    Uses the notebook-level ``tokenizer`` and ``model`` objects; the model
    output must expose ``hidden_states``.

    Args:
        prompt: Text to embed.

    Returns:
        numpy array: the last entry of ``hidden_states`` averaged over dim 1
        (presumably the token axis -- confirm against the model), taken for
        the first batch element.
    """
    inputs = tokenizer(prompt,return_tensors="pt").to("cuda")
    # Inference only: without no_grad() every call records an autograd graph
    # and keeps its activations alive, which wastes GPU memory.
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden = outputs['hidden_states'][-1]
    avg = torch.mean(last_hidden,dim=1)
    return avg.cpu().detach().numpy()[0]

In [None]:
def pickle_embedding(embedding,prompt,filename):
    """Pickle one embedding record to ``filename``.

    The record is a dict holding the prompt under "input", the vector under
    "embedding" and the fixed model tag "BigCode" under "model". The file is
    opened in binary append mode, so calling this again for the same file adds
    another pickle after the existing one rather than replacing it.
    """
    record = {"input": prompt, "embedding": embedding, "model": "BigCode"}
    with open(filename, mode="ab") as sink:
        pickle.Pickler(sink).dump(record)
    

Notice how I'm loading the data below!

In [None]:
# Read the raw interaction log; the first CSV column becomes the row index.
interactions_csv = pd.read_csv(
    "./raw_data/interactions.csv",
    index_col=0,
)
print("Not all rows displayed below.")
# Last expression of the cell: rendered as a preview of the first rows.
interactions_csv.head()


In [None]:
# Index every interaction row by its integer "id" column for direct lookup.
interaction_dataset_by_id = {}
for _row_label, row in interactions_csv.iterrows():
    interaction_dataset_by_id[int(row["id"])] = row
# Spot-check one record; its prompt is shown as the cell output.
interaction_dataset_by_id[4908]["prompt"]

In [None]:
# Compute and cache one embedding per interaction. Files that already exist
# are left untouched, so re-running the cell only fills in what is missing.
os.makedirs("embeddings", exist_ok=True)  # open() does not create the directory
for interaction_id, interaction in interaction_dataset_by_id.items():
    filename = f"embeddings/embedding_{interaction_id}_bigcode.pkl"
    if os.path.exists(filename):
        continue
    prompt = interaction['prompt']
    # pandas reads empty CSV cells as NaN rather than None; pd.isna covers
    # both, and a missing prompt cannot be tokenized.
    if pd.isna(prompt):
        continue
    embedding = retrieve_embedding(prompt)
    pickle_embedding(embedding,prompt,filename)

In [None]:
# Parse the problem definitions. FullLoader is used on a project-local file;
# do not point this at YAML from an untrusted source.
with open("raw_data/problems.yaml") as f:
    problems_file = yaml.load(f, Loader=yaml.FullLoader)

golden_embeddings_dir = Path("embeddings/golden_embeddings")
golden_embeddings_dir.mkdir(parents=True, exist_ok=True)

# Build the canonical prompt of every problem (signature followed by its
# working description inside a docstring) and embed it unless a cached
# pickle is already on disk.
golden_prompts = {}
for problem, spec in problems_file.items():
    signature, description = spec["signature"], spec["working_description"]
    prompt = f'{signature}    """\n    {description}    """\n    '
    golden_prompts[problem] = prompt
    golden_embedding_filename = golden_embeddings_dir / f"{problem}_bigcode.pkl"
    if golden_embedding_filename.exists():
        continue
    embedding = retrieve_embedding(prompt)
    pickle_embedding(embedding,prompt,golden_embedding_filename)

In [None]:
def load_past_embeddings(filename):
    """Return the first object pickled in ``filename``.

    Only use this on files written by this project: unpickling data from an
    untrusted source can execute arbitrary code.
    """
    with open(filename, mode="rb") as source:
        return pickle.load(source)

In [None]:
# Sanity check: display the cached canonical-prompt record of the "exp" problem.
load_past_embeddings("embeddings/golden_embeddings/exp_bigcode.pkl")

In [None]:
def load_all_past_embeddings(path):
    """Load every cached user embedding in ``path`` that has a known interaction.

    Args:
        path: ``pathlib.Path`` of the directory holding
            ``embedding_<id>_bigcode.pkl`` files.

    Returns:
        List of embedding records ordered by ascending interaction id. Each
        record is the unpickled dict with an extra "interaction" key holding
        the matching entry of ``interaction_dataset_by_id``. Files whose id
        is not in that mapping, or whose name carries no numeric id, are
        skipped.
    """
    import re
    # Compiled once; the dot before "pkl" is escaped so it matches literally.
    name_pattern = re.compile(r'embedding_(?P<id>\d+)_(?P<model>.+)\.pkl')

    candidates = []
    for f in path.glob("embedding_*_bigcode.pkl"):
        info = name_pattern.search(f.name)
        if info is None:
            # The glob also matches names with a non-numeric id part.
            continue
        candidates.append((int(info.group("id")), f))

    retval = []
    # glob() yields files in filesystem order; sorting by id keeps the row
    # order of everything derived from this list stable between runs.
    for interaction_id, f in sorted(candidates, key=lambda pair: pair[0]):
        interaction = interaction_dataset_by_id.get(interaction_id)
        if interaction is None:
            continue
        emb = load_past_embeddings(f)
        emb["interaction"] = interaction
        retval.append(emb)
    return retval

def load_all_golden_embeddings(path):
    """Load every canonical ("golden") embedding stored in ``path``.

    Args:
        path: ``pathlib.Path`` of the directory holding ``<name>_bigcode.pkl``
            files.

    Returns:
        List of unpickled records in sorted file order, each with an extra
        "name" key: the file name up to (not including) its last underscore.
    """
    retval = []
    # sorted(): glob() order depends on the filesystem, and the order of this
    # list decides the row order of the canonical embeddings downstream.
    for f in sorted(path.glob("*_bigcode.pkl")):
        emb = load_past_embeddings(f)
        emb["name"] = f.name[:f.name.rfind("_")]
        retval.append(emb)
    return retval

# Read both caches from disk: the user-prompt embeddings (each joined with its
# interaction) and the canonical ("golden") embeddings.
all_embs = load_all_past_embeddings(Path("./embeddings"))
gold_embs = load_all_golden_embeddings(Path("./embeddings/golden_embeddings"))

In [None]:
from sklearn.manifold import TSNE
import numpy as np

# Stack the user embeddings first and the canonical embeddings after them,
# remembering at which row each canonical embedding ends up.
embs_arr = [record["embedding"] for record in all_embs]
gold_embs_index = len(embs_arr)
gold_embs_by_name = {gold["name"]: gold["embedding"] for gold in gold_embs}
gold_embs_idx_by_name = {
    gold["name"]: gold_embs_index + offset for offset, gold in enumerate(gold_embs)
}
embs_arr.extend(gold["embedding"] for gold in gold_embs)
# One past the last canonical row, as after counting them one by one.
gold_embs_index = len(embs_arr)


print(gold_embs_idx_by_name)

In [None]:
embs = np.array(embs_arr)

perplexity = 20
# t-SNE is stochastic. Later cells zoom into fixed coordinate windows of this
# projection, so pin the seed to get the same layout whenever the cell is
# re-run in the same environment.
tsne_random_state = 0

embs_proj = TSNE(n_components=2, perplexity=perplexity, random_state=tsne_random_state).fit_transform(embs)

passed_indices_all, failed_indices_all = [], []

def is_emb_passed(emb):
    """Tell whether the interaction attached to ``emb`` passed all of its tests.

    Compares the interaction's "tests_passed" with its "total_tests".
    """
    interaction = emb["interaction"]
    return interaction["tests_passed"] == interaction["total_tests"]

# Partition the row indices of all_embs by test outcome.
for i, emb in enumerate(all_embs):
    bucket = passed_indices_all if is_emb_passed(emb) else failed_indices_all
    bucket.append(i)

# Interaction id -> row index in all_embs.
all_embs_idx_by_id = dict((emb["interaction"]["id"], i) for i, emb in enumerate(all_embs))


In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Overall view: every user embedding, coloured by whether it passed.
fig, ax = plt.subplots(figsize=(8,8))
ax.set_xlim(-80, 80)
ax.set_ylim(-80, 80)
colors = {"pass":"#118AB2", "fail":"#EF476F", "gold":"#fb8500"}

for outcome, rows in (("pass", passed_indices_all), ("fail", failed_indices_all)):
    ax.scatter(embs_proj[rows, 0], embs_proj[rows, 1], alpha=0.5, c=colors[outcome], label=outcome)

ax.legend(fontsize='large', markerscale=2)
plt.title(f"t-SNE Projection of Last Layer Embeddings, Perplexity={perplexity}")
plt.show()

In [None]:
passing_per_problem, failing_per_problem = {}, {}

# Problem name -> indices into all_embs of the embeddings for that problem,
# in the order they appear in all_embs.
per_problem_idx = {}

for i, record in enumerate(all_embs):
    problem = record["interaction"]["problem"]
    per_problem_idx.setdefault(problem, []).append(i)
    

In [None]:
# colors.txt is read as alternating lines: each even-numbered line (counting
# from 0) is a key and the line after it is that key's value. A trailing
# unpaired line is ignored.
with open("colors.txt") as f:
    colors_f = f.read().splitlines()
colors_dict = {colors_f[k]: colors_f[k + 1] for k in range(0, len(colors_f) - 1, 2)}

In [None]:
# One colour per problem: filled markers for passing prompts, hollow markers
# for failing ones. Colours are consumed in order from colors_dict.
fig, ax = plt.subplots(figsize=(16,10))
color_vals = iter(colors_dict.values())

for problem in sorted(per_problem_idx):
    rows = per_problem_idx[problem]
    passed_indices = [i for i in rows if is_emb_passed(all_embs[i])]
    failed_indices = [i for i in rows if not is_emb_passed(all_embs[i])]
    color = next(color_vals)
    ax.scatter(embs_proj[passed_indices, 0], embs_proj[passed_indices, 1], alpha=0.5, color=color, label=f"{problem}:pass")
    ax.scatter(embs_proj[failed_indices, 0], embs_proj[failed_indices, 1], alpha=0.5, label=f"{problem}:fail", facecolors="none", edgecolors=color)

ax.legend(fontsize='small', markerscale=2, bbox_to_anchor=(1.1, 1.05), ncol=3)
plt.title(f"t-SNE Projection of Last Layer Embeddings, Perplexity={perplexity}")
plt.show()

In [None]:
# Zoom into one rectangular window of the projection and number the points
# of interest (poi_ids) that fall inside it.
# NOTE(review): the window and the ids are hard-coded against one particular
# layout of embs_proj -- confirm they still match after recomputing it.
xlim = (30, 40)
ylim = (5, 25)

fig, ax = plt.subplots(figsize=(4,4))
color_vals = iter(colors_dict.values())
poi_ids = [4433, 4368]
poi_label = 1

for problem in sorted(per_problem_idx):
    # This problem's points that lie inside the window (bounds inclusive).
    in_window = [
        i for i in per_problem_idx[problem]
        if xlim[0] <= embs_proj[i, 0] <= xlim[1] and ylim[0] <= embs_proj[i, 1] <= ylim[1]
    ]
    passed_indices = [i for i in in_window if is_emb_passed(all_embs[i])]
    failed_indices = [i for i in in_window if not is_emb_passed(all_embs[i])]
    color = next(color_vals)
    if passed_indices:
        ax.scatter(embs_proj[passed_indices, 0], embs_proj[passed_indices, 1], alpha=0.5, color=color, label=f"{problem}:pass")
    if failed_indices:
        ax.scatter(embs_proj[failed_indices, 0], embs_proj[failed_indices, 1], alpha=0.5, label=f"{problem}:fail", facecolors="none", edgecolors=color)
    for poi_id in (poi_ids if poi_ids is not None else ()):
        i = all_embs_idx_by_id[poi_id]
        # Label a point of interest only when it was drawn for this problem.
        if i in passed_indices or i in failed_indices:
            ax.annotate(poi_label, (embs_proj[i, 0]-0.25, embs_proj[i, 1]-0.25))
            poi_label += 1

ax.legend(fontsize='small', markerscale=2)
plt.show()
# NOTE(review): the following cell saves to this same file name.
fig.savefig("embeddings-testcases.pdf", bbox_inches='tight')

In [None]:
# Zoom into a second rectangular window of the projection (no annotations).
xlim = (-20, 20)
ylim = (-60,-40)

fig, ax = plt.subplots(figsize=(6,6))
color_vals = iter(colors_dict.values())
#poi_ids = [4433, 7662]
#poi_label = 1

for problem in sorted(per_problem_idx):
    # This problem's points that lie inside the window (bounds inclusive).
    in_window = [
        i for i in per_problem_idx[problem]
        if xlim[0] <= embs_proj[i, 0] <= xlim[1] and ylim[0] <= embs_proj[i, 1] <= ylim[1]
    ]
    passed_indices = [i for i in in_window if is_emb_passed(all_embs[i])]
    failed_indices = [i for i in in_window if not is_emb_passed(all_embs[i])]
    color = next(color_vals)
    if passed_indices:
        ax.scatter(embs_proj[passed_indices, 0], embs_proj[passed_indices, 1], alpha=0.5, color=color, label=f"{problem}:pass")
    if failed_indices:
        ax.scatter(embs_proj[failed_indices, 0], embs_proj[failed_indices, 1], alpha=0.5, label=f"{problem}:fail", facecolors="none", edgecolors=color)
    # Disabled point-of-interest labelling, kept for reference:
    # if poi_ids is not None:
    #     for poi_id in poi_ids:
    #         i = all_embs_idx_by_id[poi_id]
    #         if i not in passed_indices and i not in failed_indices:
    #             continue
    #         ax.annotate(poi_label, (embs_proj[i, 0]-0.25, embs_proj[i, 1]-0.25))
    #         poi_label += 1

ax.legend(markerscale=2,prop={'family': 'Arial','size':14})
plt.show()
# NOTE(review): the previous cell saves to this same file name, so this
# overwrites its output.
fig.savefig("embeddings-testcases.pdf", bbox_inches='tight')

In [None]:
# An interactive plot of above

# The following two lines are needed for the plot to show up in JupyterLab
# import plotly.io as pio
# pio.renderers.default="iframe"


import plotly.graph_objects as go
import textwrap


fig = go.Figure()

color_vals = iter(colors_dict.values())
WRAP_WIDTH = 30


def _hover_text(status, prompt, interaction_id=None):
    """Build a plotly hover label: status, optional interaction id, wrapped prompt.

    Every part goes on its own line (joined with "<br>"), so the status tag
    is no longer fused with the text that follows it (e.g. "PASS4433").
    """
    lines = [status]
    if interaction_id is not None:
        lines.append(str(interaction_id))
    lines.extend(textwrap.wrap(prompt, width=WRAP_WIDTH))
    return "<br>".join(lines)


# Per problem: one trace of passing prompts (filled), one of failing prompts
# (outline only) and one star marking the canonical prompt.
for problem in sorted(per_problem_idx):
    passed_indices = [i for i in per_problem_idx[problem] if is_emb_passed(all_embs[i])]
    failed_indices = [i for i in per_problem_idx[problem] if not is_emb_passed(all_embs[i])]
    passed_hover_text = [_hover_text("PASS", all_embs[i]["interaction"]["prompt"], all_embs[i]["interaction"]["id"]) for i in passed_indices]
    failed_hover_text = [_hover_text("FAIL", all_embs[i]["interaction"]["prompt"], all_embs[i]["interaction"]["id"]) for i in failed_indices]
    golden_hover_text = _hover_text("GOLD", golden_prompts[problem])
    color = next(color_vals)
    fig.add_trace(go.Scatter(x=embs_proj[passed_indices, 0], y=embs_proj[passed_indices, 1], 
                             mode='markers', marker=dict(color=color, line=dict(color=color, width=2)), name=f"{problem}:pass", hovertext=passed_hover_text))
    fig.add_trace(go.Scatter(x=embs_proj[failed_indices, 0], y=embs_proj[failed_indices, 1], 
                             mode='markers', marker=dict(color="rgba(0,0,0,0)",line=dict(color=color, width=2)), name=f"{problem}:fail", hovertext=failed_hover_text))
    fig.add_trace(go.Scatter(x=[embs_proj[gold_embs_idx_by_name[problem], 0]], y=[embs_proj[gold_embs_idx_by_name[problem], 1]],
                             mode='markers', marker=dict(color=color, size=10), marker_symbol="star", name=f"{problem}:canon", hovertext=golden_hover_text))

fig.update_layout(title=f"t-SNE Projection of Last Layer Embeddings, Perplexity={perplexity}<br><sup>(Double click on the labels to select all or none)</sup>", xaxis_title="x", yaxis_title="y", legend_title="Problem", width=1800, height=1200)
fig.show()

In [None]:
# Print the full prompt text of one interaction, looked up by id.
print(interaction_dataset_by_id[4714]["prompt"])

#### Interesting observations in embeddings

- `andCount`: The prompts closer to the canonical prompt clearly mention the number of '&' characters, while the ones farther away do not.


- `combine`: Two sets of solutions exist: the embeddings in the upper-left corner have more detailed descriptions (e.g. "Takes an input of two lists, l1 and l2, each of which also contains lists. It combines the first list in l1 with the first one in l2, then continues for all items in l1 and l2. It outputs this final list which is a combination of l1 and l2."), while the embeddings in the bottom-right corner (including the canonical prompt) contain much shorter descriptions (e.g. "combine the first half of the lists with the second half of the lists").

- `fib`: The upper-left embeddings mention the word "Fibonacci", while the bottom-right embeddings use test-case-based descriptions (e.g. "a function check if n==1 or n==13 or n==21").

- `sortedBooks`: A few prompts close to the canonical one fail because they do not mention sorting by the year value.

- `increaseScore`: Most prompts are in one cluster except for one:

```
def increaseScore(score): takes input '-10' and outputs its positive integer
def increaseScore(score): inputs '1' and adds 9
def increaseScore(score): adds 1 to each input '10', '15', '20'
```



In [None]:
# Produce a combined grid of individual per-problem plots, 5 columns wide
cmap = matplotlib.colormaps["tab20b"]
norm = matplotlib.colors.Normalize(vmin=0.0, vmax=float(len(per_problem_idx)))
GRID_COLS = 5
# Derive the number of rows from the number of problems (ceiling division)
# instead of hard-coding 11, so more than 55 problems no longer overflows the
# grid. 55 problems still give the original 11 x 5 layout.
grid_rows = max(1, -(-len(per_problem_idx) // GRID_COLS))
# squeeze=False keeps axs two-dimensional even when there is a single row.
figs, axs = plt.subplots(grid_rows, GRID_COLS, figsize=(4 * GRID_COLS, 4 * grid_rows), squeeze=False)

for problem_i, problem in enumerate(sorted(per_problem_idx)):
    grid_row, grid_col = divmod(problem_i, GRID_COLS)
    ax = axs[grid_row, grid_col]
    # ax.set(xlim=(-80,80), ylim=(-80,80))
    passed_indices = [i for i in per_problem_idx[problem] if is_emb_passed(all_embs[i])]
    failed_indices = [i for i in per_problem_idx[problem] if not is_emb_passed(all_embs[i])]
    ax.scatter(embs_proj[passed_indices, 0], embs_proj[passed_indices, 1], alpha=0.5, c=colors["pass"], label=f"{problem}:pass")
    ax.scatter(embs_proj[failed_indices, 0], embs_proj[failed_indices, 1], alpha=0.5, label=f"{problem}:fail", facecolors="none", edgecolors=colors["fail"])
    ax.scatter(embs_proj[gold_embs_idx_by_name[problem], 0], embs_proj[gold_embs_idx_by_name[problem], 1], alpha=0.5, label=f"{problem}:canonical", marker="*", color=colors["gold"])
    ax.legend(fontsize='small', markerscale=2)
    ax.set_title(f"t-SNE Projection of \n Last Layer Embeddings\nPerplexity={perplexity}, problem={problem}", fontsize="small")

# Hide the panels left over after the last problem instead of drawing empty frames.
for unused_ax in axs.flat[len(per_problem_idx):]:
    unused_ax.set_axis_off()

print("I stopped pegging the embedding plot from (-80, -80) to (80, 80) since almost everything is clustered together")
plt.show()

In [None]:
# Produce individual plots
def make_plots(per_problem, problems, filename_prefix="", poi_ids = None, poi_label_start=0,scale=4):
    """Draw and save one t-SNE scatter plot per problem.

    Each figure shows the problem's passing prompts, failing prompts and the
    canonical prompt, and is written to ``{filename_prefix}{problem}.pdf``.
    Reads the notebook-level ``all_embs``, ``embs_proj``, ``colors``,
    ``gold_embs_idx_by_name`` and ``all_embs_idx_by_id``.

    Args:
        per_problem: Mapping from problem name to indices into ``all_embs``.
        problems: Iterable of problem names to plot, in order.
        filename_prefix: Prepended to every output file name.
        poi_ids: Optional sequence parallel to ``problems``; entry ``k`` lists
            the interaction ids to annotate on the k-th plot.
        poi_label_start: First number used for the annotations; numbering
            continues across plots.
        scale: Width and height passed to ``figsize``.
    """
    poi_label = poi_label_start
    for problem_i, problem in enumerate(problems):
        fig, ax = plt.subplots(figsize=(scale,scale))
        passed_indices = [i for i in per_problem[problem] if is_emb_passed(all_embs[i])]
        failed_indices = [i for i in per_problem[problem] if not is_emb_passed(all_embs[i])]
        ax.scatter(embs_proj[passed_indices, 0], embs_proj[passed_indices, 1], alpha=0.5, c=colors["pass"], label=f"pass")
        ax.scatter(embs_proj[failed_indices, 0], embs_proj[failed_indices, 1], alpha=0.5, label=f"fail", facecolors="none", edgecolors=colors["fail"])
        ax.scatter(embs_proj[gold_embs_idx_by_name[problem], 0], embs_proj[gold_embs_idx_by_name[problem], 1], alpha=0.5, label=f"canonical", marker="*", color=colors["gold"])
        ax.legend(markerscale=2,prop={'family': 'Arial','size':14})

        # label the points of interest
        if poi_ids is not None:
            for poi_id in poi_ids[problem_i]:
                i = all_embs_idx_by_id[poi_id]
                ax.annotate(poi_label, (embs_proj[i, 0]-0.25, embs_proj[i, 1]-0.25))
                poi_label += 1

        # Save first so the file does not depend on what the backend does to
        # the figure during show(); then close it so figures do not pile up
        # in memory when many problems are plotted.
        fig.savefig(f"{filename_prefix}{problem}.pdf", bbox_inches='tight')
        plt.show()
        plt.close(fig)

# Produce All plots:
# make_plots(per_problem_idx, per_problem_idx.keys())

In [None]:
# Plot two problems and annotate two interactions on each. Numbering starts at
# the notebook-level poi_label counter.
# NOTE(review): poi_label is set by the earlier zoomed-plot cell, so this cell
# depends on that cell having been run first.
make_plots(per_problem_idx, ["increaseScore", "combine"], "", [[6138, 7649], [3967, 5374]], poi_label_start=poi_label)


In [None]:
# Same two problems at a larger figure size. poi_ids is not passed, so no
# points are annotated and poi_label_start has no visible effect here.
make_plots(per_problem_idx, ["increaseScore", "combine"], "", poi_label_start=poi_label,scale=5)


In [None]:
# Single-problem plot without annotations, saved as "check_for_aspen.pdf".
make_plots(per_problem_idx, ["check_for_aspen"],scale=5)

## Cosine Similarities between embeddings

In [None]:
def calculate_cos_sim(emb1, emb2, prompt="unknown"):
    """Return the cosine similarity of two vectors.

    This is the dot product divided by the product of the two norms.
    ``prompt`` is accepted but not used.
    """
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    return np.dot(emb1, emb2) / norm_product

from tqdm import tqdm
# Pairwise cosine similarities between user embeddings, stored as
# cos_similarities[id1][id2] for id2 < id1. The CSV file acts as a cache:
# it is read when present, otherwise everything is computed and written.
cos_similarities = {}

cos_sim_file = Path("computed_data/embeddings_cos_sim.csv")
if cos_sim_file.exists():
    # newline="" is what the csv module expects for files it reads or writes.
    with open(cos_sim_file, newline="") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # Tolerate blank rows, which a cache written without
                # newline="" can contain on some platforms.
                continue
            id1, id2 = int(row[0]), int(row[1])
            cos_similarities.setdefault(id1, {})[id2] = float(row[2])
else:
    for e1 in tqdm(all_embs):
        id1 = e1["interaction"]["id"]
        sims_for_id1 = cos_similarities.setdefault(id1, {})
        for e2 in all_embs:
            id2 = e2["interaction"]["id"]
            # Similarity is symmetric: store each pair once, under the larger id.
            if id2 < id1 and id2 not in sims_for_id1:
                sims_for_id1[id2] = calculate_cos_sim(e1["embedding"], e2["embedding"])
    # Create the output directory first so the computed result is not lost
    # to a missing-directory error.
    cos_sim_file.parent.mkdir(parents=True, exist_ok=True)
    with open(cos_sim_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for id1, sims_for_id1 in cos_similarities.items():
            for id2, similarity in sims_for_id1.items():
                writer.writerow([id1, id2, similarity])


In [None]:
import textwrap
# Plot cosine similarity for all embeddings for some problems on one plot
def plot_cos_similarities_vs_canon(problems):
    """Box-plot, per problem, how similar each user embedding is to the canonical one.

    Args:
        problems: Problem names to include, one box per name in this order.
    """
    fig, ax = plt.subplots(figsize=(20,10))
    cos_sims = {}
    for problem in problems:
        canonical = gold_embs_by_name[problem]
        submitted = [all_embs[i]["embedding"] for i in per_problem_idx[problem]]
        cos_sims[problem] = [calculate_cos_sim(canonical, vector) for vector in submitted]

    ax.boxplot(cos_sims.values(), labels=cos_sims.keys())
    # Tick labels go on the top edge, rotated so long names do not overlap.
    ax.xaxis.set_ticks_position('top')
    ax.set_xticklabels(cos_sims.keys(), rotation=40, ha="left")

    # Clip the y-axis to the range 0.6 .. 1.005
    ax.set_ylim(0.6, 1.005)

    # Shorten the title when every known problem is shown.
    shows_everything = problems == sorted(per_problem_idx.keys())
    title_problems = "all" if shows_everything else "\n".join(textwrap.wrap(str(problems), 50))
    plt.title(f"Cosine Similarity of User Embeddings to Canonical Embedding\nproblems={title_problems}")
    plt.show()


# Draw the comparison for every problem, in sorted name order.
plot_cos_similarities_vs_canon(sorted(per_problem_idx.keys()))

In [None]:
# Plot avg cosine similarity for all user embeddings for some problems on one plot
import itertools
def plot_cos_similarities_vs_all(problems):
    """Box-plot, per problem, the cosine similarity of every pair of user embeddings.

    Only user embeddings are compared with each other, so a problem does not
    need a canonical embedding to be plotted here.

    Args:
        problems: Problem names to include, one box per name in this order.
    """
    fig, ax = plt.subplots(figsize=(20,10))
    cos_sims = {}
    for problem in problems:
        user_embs = [all_embs[i]["embedding"] for i in per_problem_idx[problem]]
        cos_sims[problem] = [calculate_cos_sim(user_emb1, user_emb2) for user_emb1, user_emb2 in itertools.combinations(user_embs, 2)]

    ax.boxplot(cos_sims.values(), labels=cos_sims.keys())

    # Tick labels go on the top edge, rotated so long names do not overlap.
    ax.xaxis.set_ticks_position('top')
    ax.set_xticklabels(cos_sims.keys(), rotation=40, ha="left")

    # Clip the y-axis to the range 0.6 .. 1.005
    ax.set_ylim(0.6, 1.005)

    if problems == sorted(per_problem_idx.keys()):
        title_problems = "all"
    else:
        title_problems = "\n".join(textwrap.wrap(str(problems), 50))
    plt.title(f"Cosine Similarity of User Embeddings\nproblems={title_problems}")
    plt.show()

# Draw the pairwise comparison for every problem, in sorted name order.
plot_cos_similarities_vs_all(sorted(per_problem_idx.keys()))

In [None]:
import textwrap
# Plot cosine similarity for all embeddings for some problems on one plot
def plot_cos_sims_categories(categories=None, low_sim_threshold=0.75):
    """Box-plot similarity to the canonical embedding, grouped by interaction category.

    For every category, collects across all problems the cosine similarity
    between the canonical embedding and each user embedding whose interaction
    has a truthy ``is_<category>`` field. Prompts whose similarity falls below
    ``low_sim_threshold`` are printed for inspection.

    Args:
        categories: Category names; defaults to first/last success and
            first/last failure.
        low_sim_threshold: Similarities strictly below this value are reported.
    """
    # None sentinel instead of a mutable list as the default argument.
    if categories is None:
        categories = ["first_success", "last_success", "first_failure", "last_failure"]

    fig, ax = plt.subplots(figsize=(8,10))
    cos_sims = {}
    for category in categories:
        flag = f"is_{category}"
        category_sims = []
        for problem in sorted(per_problem_idx.keys()):
            gold_emb = gold_embs_by_name[problem]
            # Select the matching records once; each carries both the vector
            # and the prompt, so nothing has to be rebuilt per reported hit.
            selected = [all_embs[i] for i in per_problem_idx[problem] if all_embs[i]["interaction"][flag]]
            for record in selected:
                similarity = calculate_cos_sim(gold_emb, record["embedding"])
                category_sims.append(similarity)
                if similarity < low_sim_threshold:
                    golden_prompt = golden_prompts[problem]
                    user_prompt = record["interaction"]["prompt"]
                    print(f"cos_sim < {low_sim_threshold}")
                    print(f"problem: {problem}")
                    print(f"category: {category}")
                    print(f"golden_prompt:\n{golden_prompt}")
                    print(f"user_prompt:\n{user_prompt}")
        cos_sims[category] = category_sims

    ax.boxplot(cos_sims.values(), labels=cos_sims.keys())
    # Tick labels go on the top edge, rotated so long names do not overlap.
    ax.xaxis.set_ticks_position('top')
    ax.set_xticklabels(cos_sims.keys(), rotation=40, ha="left")

    # Clip the y-axis to the range 0.6 .. 1.005
    ax.set_ylim(0.6, 1.005)

    title_problems = "\n".join(textwrap.wrap(str(categories), 50))
    plt.title(f"Cosine Similarity of User Embeddings to Canonical Embedding\ncategories={title_problems}")
    plt.show()


# Draw the plot for the default categories.
plot_cos_sims_categories()