In [1]:
#pip install scikit-learn

In [1]:
# importing required libraries

import pandas as pd
import numpy as np
import openai
import json
import re
import matplotlib
import sklearn
from openai.embeddings_utils import get_embedding, cosine_similarity


In [2]:
# cleaning the data

def clean(text):
    new_text = re.sub('\n', '', text)
    return new_text

In [5]:
# import json files

# context 
fc = open("textbook_embeddings/sections.json")
section_data_json = json.load(fc)

# take queries from neha's files
fq = open("textbook_embeddings/GPT-3_generations_section_level.json", encoding="utf8")
gpt3_section_data = json.load(fq)

In [6]:
gpt3_section_data[0]

{'textbook-paragraph': "{Finite State Machine Design Examples, Part I}\n\nThis set of notes uses a series of examples to illustrate design principles \nfor the implementation of finite state machines (FSMs) using digital logic.\nWe begin with an overview of the design process for a digital FSM, from\nthe development of an abstract model through the implementation of\nfunctions for the next-state variables and output signals.\nOur first few examples cover only the concrete aspects:\nwe implement several counters, which illustrate the basic \nprocess of translating a concrete and complete state transition diagram\ninto an implementation based on flip-flops and logic gates.\nWe next consider a counter with a number of states that is not a power of\ntwo, with which we illustrate the need for FSM initialization.\n As part of solving the initialization problem, we also introduce \n a general form of selection logic called a multiplexer.\n\nWe then consider the design process as a whole throu

In [7]:
# collecting all queries in a different df

queries = []
for i in gpt3_section_data:
    query_str = i['GPT-3-Generations']['question']
    query_str = clean(query_str)
    queries.append(query_str)
    
queries_df = pd.DataFrame(queries, columns=['query'])

In [8]:
queries_df['query'][1]

'What is the design process for a digital FSM?'

In [18]:
api_key = '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'
openai.api_key = api_key

In [19]:
# embed the queries and context

q1 = get_embedding(queries_df['query'][1], engine="text-embedding-ada-002")
q1

[0.002018889645114541,
 0.01678147353231907,
 -0.02255273051559925,
 -0.005718728061765432,
 -0.017776034772396088,
 0.008635874837636948,
 -0.0318259671330452,
 -0.0002674634743016213,
 -0.026138754561543465,
 -0.01784607395529747,
 0.004601597785949707,
 0.03572016581892967,
 -0.010337836109101772,
 0.004391478840261698,
 -0.010568966157734394,
 0.013034357689321041,
 0.005053352564573288,
 0.010099701583385468,
 0.013188445009291172,
 0.006040910258889198,
 -0.012270926497876644,
 0.013391559943556786,
 0.015478737652301788,
 -0.018826628103852272,
 -0.021628208458423615,
 -0.0006872629746794701,
 0.0146802868694067,
 -0.021390074864029884,
 -0.029304541647434235,
 0.002332316478714347,
 0.029892874881625175,
 -0.010477914474904537,
 -0.01930289715528488,
 -0.016669409349560738,
 0.000943782739341259,
 -0.006601226516067982,
 -0.0035124828573316336,
 -0.009868570603430271,
 0.0022710319608449936,
 -0.021824318915605545,
 0.01147247664630413,
 -0.00035457516787573695,
 0.005865811370

In [20]:
# convert the json into a df

section_df = pd.json_normalize(section_data_json)
section_df = section_df.transpose()
section_df.columns = ['context']
section_df['context'] = section_df.context.apply(lambda x: clean(x))

In [21]:
section_df.head()

Unnamed: 0,context
0,"{Finite State Machine Design Examples, Part I}..."
1,{Steps in the Design Process}Before we begin e...
2,{Example: A Two-Bit Gray Code Counter}Let's be...
3,{Example: A Three-Bit Gray Code Counter}Now we...
4,{Example: A Color Sequencer}Early graphics sys...


In [22]:
section_df['ada_embedding'] = section_df.context.apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))

In [23]:
section_df.head()

Unnamed: 0,context,ada_embedding
0,"{Finite State Machine Design Examples, Part I}...","[-0.014471707865595818, 0.010982639156281948, ..."
1,{Steps in the Design Process}Before we begin e...,"[0.0012329797027632594, 0.016857951879501343, ..."
2,{Example: A Two-Bit Gray Code Counter}Let's be...,"[-0.0074892048723995686, 0.015044862404465675,..."
3,{Example: A Three-Bit Gray Code Counter}Now we...,"[-0.012359922751784325, 0.005069629289209843, ..."
4,{Example: A Color Sequencer}Early graphics sys...,"[-0.007240018341690302, -0.008068212307989597,..."


In [24]:
question = queries_df['query'][1]
question_embedding = get_embedding(question, engine="text-embedding-ada-002")

In [25]:
question

'What is the design process for a digital FSM?'

In [26]:
temp_context = section_df

In [27]:
temp_context["similarity"] = temp_context.ada_embedding.apply(lambda x: cosine_similarity(x, question_embedding))

In [28]:
# best match
temp_context.sort_values("similarity", ascending = False).head(1)

Unnamed: 0,context,ada_embedding,similarity
1,{Steps in the Design Process}Before we begin e...,"[0.0012329797027632594, 0.016857951879501343, ...",0.897585


In [29]:
# worst match
df = temp_context.sort_values("similarity", ascending = True).head(3)

In [30]:
df.iloc[0]

context          {Changing Types in C*}Changing the type of a d...
ada_embedding    [0.008865782991051674, 0.011089006438851357, 0...
similarity                                                0.679945
Name: 138, dtype: object

In [31]:
df['context'][0]



In [32]:
training_data = pd.DataFrame(columns = ['query', 'pos_a', 'neg_a1', 'neg_a2', 'neg_a3'])
training_data_embeddings = pd.DataFrame(columns = ['query', 'query_embedding', 'pos_a', 'pos_a_embedding', 
                                                   'neg_a1', 'neg_a1_embedding', 'neg_a2', 'neg_a2_embedding',
                                                   'neg_a3', 'neg_a3_embedding'])

In [33]:
for q in queries_df['query']:
    
    row = []
    question_embedding = get_embedding(q, engine="text-embedding-ada-002")
    
    temp_context["similarity"] = temp_context.ada_embedding.apply(lambda x: cosine_similarity(x, question_embedding))
    most_relevant_row = temp_context.sort_values("similarity", ascending = False).head(1)
    least_relevant_rows = temp_context.sort_values("similarity", ascending = True).head(3)
    
    row.append(q)
    row.append(question_embedding)
    row.append(most_relevant_row.iloc[-1]['context'])
    row.append(most_relevant_row.iloc[-1]['ada_embedding'])
    
    for i in range(len(least_relevant_rows)):
        row.append(least_relevant_rows.iloc[i]['context'])
        row.append(least_relevant_rows.iloc[i]['ada_embedding'])
    
    training_data_embeddings.loc[len(training_data_embeddings)] = row
    
    #print(training_data_embeddings)
    #break

In [34]:
training_data_embeddings

Unnamed: 0,query,query_embedding,pos_a,pos_a_embedding,neg_a1,neg_a1_embedding,neg_a2,neg_a2_embedding,neg_a3,neg_a3_embedding
0,Which of the following is not a step in the de...,"[-0.011453937739133835, 0.00713815214112401, -...",{Steps in the Design Process}Before we begin e...,"[0.0012329797027632594, 0.016857951879501343, ...",{SEC-DED Codes}We now consider one final exten...,"[-0.01453531626611948, 0.028209280222654343, -...",{Interrupts and Exceptions*}Unexpected process...,"[-0.032872192561626434, -0.009567380882799625,...",{The C Preprocessor*}The C language uses a pre...,"[-0.02346784994006157, 0.013030974194407463, -..."
1,What is the design process for a digital FSM?,"[0.0020188007038086653, 0.016780732199549675, ...",{Steps in the Design Process}Before we begin e...,"[0.0012329797027632594, 0.016857951879501343, ...",{Changing Types in C*}Changing the type of a d...,"[0.008865782991051674, 0.011089006438851357, 0...",{The C Preprocessor*}The C language uses a pre...,"[-0.02346784994006157, 0.013030974194407463, -...",{Overflow for Unsigned Addition}Let's say that...,"[0.0020270796958357096, -0.02656855247914791, ..."
2,How does the choice of representation for the ...,"[0.013305346481502056, 0.0019384929910302162, ...",{Impact of the State Representation}What happe...,"[-0.0038557788357138634, -0.009720277972519398...",{The C Preprocessor*}The C language uses a pre...,"[-0.02346784994006157, 0.013030974194407463, -...",{Interrupts and Exceptions*}Unexpected process...,"[-0.032872192561626434, -0.009567380882799625,...",{Procedure and System Calls*}A { procedure} is...,"[-0.001410436350852251, -0.008392180316150188,..."
3,What is the first step in the design process?A...,"[-0.001720355823636055, -0.002152037573978305,...",{Steps in the Design Process}Before we begin e...,"[0.0012329797027632594, 0.016857951879501343, ...",{SEC-DED Codes}We now consider one final exten...,"[-0.01453531626611948, 0.028209280222654343, -...",{Deriving 2's Complement}Given these equivalen...,"[-0.019589651376008987, -0.0020448309369385242...",{Overflow for Unsigned Addition}Let's say that...,"[0.0020270796958357096, -0.02656855247914791, ..."
4,What is the difference between Step {step-abs}...,"[0.02049386315047741, 0.003984917886555195, 0....",{Steps in the Design Process}Before we begin e...,"[0.0012329797027632594, 0.016857951879501343, ...",{SEC-DED Codes}We now consider one final exten...,"[-0.01453531626611948, 0.028209280222654343, -...",{Hamming Codes}Hamming also developed a genera...,"[-0.024023352190852165, 0.017182711511850357, ...",{Overflow for Unsigned Addition}Let's say that...,"[0.0020270796958357096, -0.02656855247914791, ..."
...,...,...,...,...,...,...,...,...,...,...
427,What is the overflow condition for unsigned ad...,"[0.002077223267406225, -0.0228566937148571, 0....",{Overflow for Unsigned Addition}Let's say that...,"[0.0020270796958357096, -0.02656855247914791, ...",{Compilation and Interpretation*}Many programm...,"[-0.012749951332807541, 0.007861707359552383, ...",{The C Preprocessor*}The C language uses a pre...,"[-0.02346784994006157, 0.013030974194407463, -...",{The C Programming Language}Programming langua...,"[0.014745492488145828, -0.0019359153229743242,..."
428,Why does overflow occur when adding two {N-bit...,"[0.011194312013685703, -0.030556876212358475, ...",{Overflow for Unsigned Addition}Let's say that...,"[0.0020270796958357096, -0.02656855247914791, ...",{The C Preprocessor*}The C language uses a pre...,"[-0.02346784994006157, 0.013030974194407463, -...",{Implication and Mathematical Notation}Some of...,"[0.014781014062464237, 0.013696400448679924, -...",{Compilation and Interpretation*}Many programm...,"[-0.012749951332807541, 0.007861707359552383, ..."
429,Which of the following is not a condition for ...,"[-0.006390254013240337, -0.01386608462780714, ...",{Overflow for 2's Complement Addition}Understa...,"[-0.0038576170336455107, -0.008486075326800346...",{The C Preprocessor*}The C language uses a pre...,"[-0.02346784994006157, 0.013030974194407463, -...",{Basic I/O}The { main} function returns an int...,"[0.004733604844659567, 0.011300455778837204, 0...",{The C Programming Language}Programming langua...,"[0.014745492488145828, -0.0019359153229743242,..."
430,What is the range of numbers that can be repre...,"[-0.003825078485533595, -0.022557271644473076,...",{An Algebraic Approach}Some people prefer an a...,"[-0.0103315319865942, 0.01234760694205761, 0.0...",{The C Preprocessor*}The C language uses a pre...,"[-0.02346784994006157, 0.013030974194407463, -...",{Procedure and System Calls*}A { procedure} is...,"[-0.001410436350852251, -0.008392180316150188,...",{Compilation and Interpretation*}Many programm...,"[-0.012749951332807541, 0.007861707359552383, ..."


In [38]:
most_relevant_row

Unnamed: 0,context,ada_embedding,similarity
143,{Overflow for 2's Complement Addition}Understa...,"[-0.0038576170336455107, -0.008486075326800346...",0.895188


In [65]:
least_relevant_rows.iloc[0]['context']

'#include "my_header.h"    = /* search in current followed by standard directories =#include <stdio.h>      > /* search in standard directories > */#include "my_header.h" > /* search in current followed by standard directories > */'

In [76]:
least_relevant_rows.iloc[0]

context          #include "my_header.h"    = /* search in curre...
ada_embedding    [-0.034469980746507645, 0.007448077667504549, ...
similarity                                                0.661006
Name: 1917, dtype: object

In [42]:
training_data_embeddings.iloc[0]['pos_a']

"{Steps in the Design Process}Before we begin exploring designs, let's talk briefly about the generalapproach that we take when designing an FSM.  We follow a six-stepprocess:{-8pt}{{}{}{}{}{}{develop an abstract model}{step-abs}{specify I/O behavior}{step-io}{complete the specification}{step-complete}{choose a state representation}{step-repn}{calculate logic expressions}{step-logic}{implement with flip-flops and gates}{step-gates}}{-8pt}In Step {step-abs}, we translate our description in human languageinto a model with states and desired behavior.  At this stage, we simply try to capture the intent of the description and are notparticularly thorough nor exact.Step {step-io} begins to formalize the model, starting with itsinput and output behavior.  If we eventually plan to develop animplementation of our FSM as a digital system (which is not the only choice, of course!), all input and outputmust consist of bits.  Often, input and/or output specificationsmay need to match other digital

In [43]:
td_dict = training_data_embeddings.to_dict(orient='index')

In [44]:
td_dict[0]

{'query': 'Which of the following is not a step in the design process for a digital FSM?A) Developing an abstract modelB) Implementing functions for next-state variablesC) Implementing functions for output signalsD) Translating a state transition diagram into an implementationE) Initializing the FSM',
 'query_embedding': [-0.011453937739133835,
  0.00713815214112401,
  -0.013565778732299805,
  -0.002544405870139599,
  -0.04442101716995239,
  0.009355255402624607,
  -0.033263131976127625,
  0.0019259852124378085,
  -0.018052617087960243,
  -0.03128944709897041,
  0.012644726783037186,
  0.03323681652545929,
  -0.00954604521393776,
  0.023289455100893974,
  -0.021144719794392586,
  -0.00791446678340435,
  0.015736829489469528,
  0.013197357766330242,
  0.007572362199425697,
  0.008743413724005222,
  -0.012052621692419052,
  -0.003503286512568593,
  0.011078937910497189,
  -0.020184194669127464,
  -0.014105251990258694,
  -0.005141443107277155,
  0.0033388130832463503,
  -0.03999996557831

In [45]:
training_data_embeddings.to_json('fine_tune_section_training_data.json', orient='index')