In [1]:
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle as pk
from sklearn.metrics.pairwise import cosine_similarity
from functools import reduce
import os

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
#variables
models = [
    {'name': 'flan-t5-small', 'tokenizer': None, 'model': None, 'text2text': None},
    {'name': 'flan-t5-large', 'tokenizer': None, 'model': None, 'text2text': None}
]
quantiles = [0, 0.5, 0.75, 0.90]

def text2textModel(tokenizer: T5Tokenizer, model: T5ForConditionalGeneration, input_text: str) -> str:
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0])

In [3]:
for m in models:
    #load text2text model
    m['tokenizer'] = T5Tokenizer.from_pretrained(f"google/{m['name']}")
    m['model'] = T5ForConditionalGeneration.from_pretrained(f"google/{m['name']}")


for m in models:
    print(m['name'])
    print(text2textModel(tokenizer = m['tokenizer'], model = m['model'], input_text = "who is the first american president black?"))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


flan-t5-small
<pad> John McCain</s>
flan-t5-large




<pad> Abraham Lincoln</s>


In [4]:
data = []
with open('./train.json', 'r') as file:
    data = json.load(file)

contexts = list(set([e["context"] for e in data]))

In [78]:
#fit the vectorizer and dump it
dataForFit = contexts + [e["question"] for e in data]
vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
vectorizer.fit(dataForFit)
pk.dump(vectorizer, open('vectorizer.pickle', 'wb'))



In [5]:
vectorizer = pk.load(open('vectorizer.pickle', 'rb'))

In [6]:
corpusTFIDF = vectorizer.transform(contexts)
#features = vectorizer.get_feature_names_out()
#Xs = pd.DataFrame(corpusTFIDF.toarray(), index=[i for i, _ in enumerate(contexts)], columns=features)#
#print("DONE")

In [7]:
len(vectorizer.get_feature_names_out())

170517

In [24]:
def testing(data, m):
    mTokenizer = m['tokenizer']
    mModel = m['model']
    name = m['name']
    file = open(f'./results/{name}__withInjection.json', 'a')
    #finalResult = []
    for i,e in enumerate(data):
        answer = vectorizer.transform([e["question"]])
        arr = cosine_similarity(answer, corpusTFIDF)[0]
        mx = np.amax(arr)
        contextIndex = (np.where(arr == mx))[0][0]
        
        sentences = sent_tokenize(contexts[contextIndex])
        sent = vectorizer.transform(sentences)
        sentSimilarity = [(i, score) for i,score in enumerate(cosine_similarity(answer, sent)[0])]
        
        #quantile filtering
        quant = [np.quantile([e[1] for e in sentSimilarity], q) for q in quantiles]
        for j,q in enumerate(quant):
            searchOn = reduce(
                lambda x, y: f'{x} {y}',
                [ el for i,el in enumerate(sentences) if i in [ e[0] for e in sentSimilarity if e[1]>=q ] ]
            )
            response = text2textModel(tokenizer=mTokenizer, model=mModel, input_text = f'{searchOn}. {e["question"]}')
            response = response.replace('<pad> ', '').replace('</s>', '')
    
            similarityResponseTarget = cosine_similarity(vectorizer.transform([response]), vectorizer.transform(e["targets"]))[0]
                #tg = sorted([ (el, similarityResponseTarget[i]) for i,el in enumerate(e['targets']) ], key=lambda x: x[1], reverse=True)[0]
            
            file.write(f"""
            {{
                "index": {i},
                "question": "{e["question"]}",
                "correctContext": {1 if contexts[contextIndex] == e["context"] else 0},
                "scoreContextRetrieval": {mx},
                "quantileLimitSentences": {quantiles[j]},
                "targetWithSimilarityScore": {[ {el: similarityResponseTarget[i]} for i,el in enumerate(e['targets']) ]},
                "response": "{response}"
            }},""")
            #file.write(f"{e['question']},{contexts[contextIndex] == e['context']},{mx},{quantiles[i]},{tg[0]}, {tg[1]}, {response}\n")                
            #finalResult.append({
            #    'index': i,
            #    'question': e["question"],
            #    'correctContext': contexts[contextIndex] == e["context"],
            #    'scoreContextRetrieval': mx,
            #    'quantileLimitSentences': quantiles[i],
            #    'targetWithSimilarityScore': [ (el, similarityResponseTarget[i]) for i,el in enumerate(e['targets']) ],
            #    'response': response
            #})
    file.close()
    #return finalResult

In [14]:
len(data)

28989

In [None]:
m = models[0]
testing(data=data, m=m)

In [16]:
batchSize = 90

In [17]:
int(len(data) / batchSize)

322

In [None]:
for i in range(1,int(len(data) / batchSize)):
    print(i, (i-1)*90,i*90)

In [None]:
#with injection
m = models[0]
for i in range(1,int(len(data) / batchSize)):
    print(i, (i-1)*90, i*90)
    
    t1 = testing(data=data[(i-1)*90:i*90], m=m)
    output = open(f"results/batches/{m['name']}_injected/{i}.pickle", 'wb')
    pk.dump(t1, output)
    output.close()
    print(f"results/batches/{m['name']}_injected/{i}.pickle saved")

1 0 90
results/batches/flan-t5-small_injected/1.pickle saved
2 90 180


In [None]:
#with injection
m = models[1]
t1 = testing(data=data[:90], m=m)
output = open(f"results/{m['name']}_withInjection.pickle", 'wb')
pk.dump(t1, output)
output.close()
print(f"results/{m['name']}_withInjection.pickle saved")

In [8]:
#without injection
for m in models:
    t2 = []
    for i,e in enumerate(data[:1000]):
        print(i, len(data))
        response = text2textModel(tokenizer=m['tokenizer'], model=m['model'], input_text = e["question"])
        response = response.replace('<pad> ', '').replace('</s>', '')
    
        similarityResponseTarget = cosine_similarity(vectorizer.transform([response]), vectorizer.transform(e["targets"]))[0]
        
        
        t2.append({
            'question': e["question"],
            'targetWithSimilarityScore': [ (el, similarityResponseTarget[i]) for i,el in enumerate(e['targets']) ],
            'response': response
        })
    output = open(f"results/{m['name']}_withoutInjection.pickle", 'wb')
    pk.dump(t2, output)
    output.close()
    print(f"results/{m['name']}_withoutInjection.pickle saved")

0 28989
1 28989
2 28989
3 28989
4 28989
5 28989
6 28989
7 28989
8 28989
9 28989
10 28989
11 28989
12 28989
13 28989
14 28989
15 28989
16 28989
17 28989
18 28989
19 28989
20 28989
21 28989
22 28989
23 28989
24 28989
25 28989
26 28989
27 28989
28 28989
29 28989
30 28989
31 28989
32 28989
33 28989
34 28989
35 28989
36 28989
37 28989
38 28989
39 28989
40 28989
41 28989
42 28989
43 28989
44 28989
45 28989
46 28989
47 28989
48 28989
49 28989
50 28989
51 28989
52 28989
53 28989
54 28989
55 28989
56 28989
57 28989
58 28989
59 28989
60 28989
61 28989
62 28989
63 28989
64 28989
65 28989
66 28989
67 28989
68 28989
69 28989
70 28989
71 28989
72 28989
73 28989
74 28989
75 28989
76 28989
77 28989
78 28989
79 28989
80 28989
81 28989
82 28989
83 28989
84 28989
85 28989
86 28989
87 28989
88 28989
89 28989
90 28989
91 28989
92 28989
93 28989
94 28989
95 28989
96 28989
97 28989
98 28989
99 28989
100 28989
101 28989
102 28989
103 28989
104 28989
105 28989
106 28989
107 28989
108 28989
109 28989
110 28989


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
tokens = lambda text: [x.lemma_ for x in nlp(text) if x.pos_ not in ['PUNCT', 'SPACE'] and not x.is_stop]