In [1]:
from datasets import load_dataset

# Identifier of the dataset handed to `load_dataset`.
dataset_name = "lamini/lamini_docs"
# The loaded object is indexed by split name further down (`datasets['train']`).
datasets = load_dataset(dataset_name)

In [3]:
# One string per training example: "<question> <answer>".
# The replace targets the literal two-character sequence backslash + "n"
# (not a real newline character) and swaps it for a single space.
data = [
    (qa_pair['question'] + ' ' + qa_pair['answer']).replace("\\n", " ")
    for qa_pair in datasets['train']
]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def top_tfidf_words(sentences, top_n=100):
    """Rank vocabulary terms by their TF-IDF weight summed over all sentences.

    Args:
        sentences: Iterable of text documents to vectorize.
        top_n: Accepted for interface compatibility but currently NOT applied;
            the full ranking is always returned.

    Returns:
        A DataFrame with columns ``term`` and ``rank`` (the summed TF-IDF
        weight), sorted by ``rank`` in descending order. The index holds each
        term's column position in the vectorizer vocabulary.
    """
    # English stop words are dropped before weighting.
    tfidf = TfidfVectorizer(stop_words='english')
    weights = tfidf.fit_transform(sentences)
    vocabulary = tfidf.get_feature_names_out()

    # Collapse the document axis: one total per vocabulary column (1 x n_terms).
    column_totals = weights.sum(axis=0)

    rows = [(word, column_totals[0, idx]) for idx, word in enumerate(vocabulary)]

    return pd.DataFrame(rows, columns=['term', 'rank']).sort_values(
        'rank', ascending=False
    )

# Rank every vocabulary term by summed TF-IDF weight.
# NOTE: the call relies on the default top_n=100, but top_tfidf_words never
# applies top_n, so the full ranking (not only the top 100) is returned/printed.
top_words = top_tfidf_words(data)
print(top_words)


             term        rank
1848       lamini  103.728120
3253         text   54.182434
834          data   47.727913
2100        model   41.152211
142            ai   39.693581
...           ...         ...
3523     welcomes    0.066917
273     attending    0.066917
816     curiosity    0.066917
2385        plays    0.066917
888   departments    0.066917

[3574 rows x 2 columns]


In [5]:
# Read the GloVe vocabulary: the first whitespace-separated token on each line.
# NOTE(review): the path is absolute and machine-specific; consider moving it
# to a configurable location.
glove_words = []
with open("/nethome/ss651/Robust-LLM/intent/glove.6B.100d.txt", "r", encoding="utf-8") as f:
    for line in f:
        tokens = line.split(maxsplit=1)
        # Skip blank lines instead of failing with IndexError.
        if tokens:
            glove_words.append(tokens[0])

# Set copy for O(1) membership tests; `glove_words` itself stays a list so any
# other cell that uses it keeps working unchanged.
glove_vocab = set(glove_words)

important_words = top_words["term"].tolist()

# Terms ranked by TF-IDF that have no GloVe entry, in alphabetical order.
filtered_words = sorted(
    word for word in important_words if word not in glove_vocab
)

In [6]:
import pandas as pd

# Load the NER annotations; rows with a missing "NERS" value are dropped.
df = pd.read_csv("train_ners.csv")
ner_list = list(df["NERS"].dropna())

# Each cell is treated as a ", "-separated list of entities; entries are
# lower-cased and stripped of backticks before splitting.
ners = []
for ner in ner_list:
    ners.extend(ner.lower().replace("`","").split(", "))

# Build a set once so each lookup is O(1) rather than a scan of the whole
# GloVe vocabulary sequence.
_glove_lookup = set(glove_words)

# Entities that have no GloVe entry (duplicates and order are preserved).
filtered_ners = [ner for ner in ners if ner not in _glove_lookup]

In [14]:
import csv

# Write one filtered word per row. The encoding is pinned to UTF-8 so the
# output does not depend on the machine's locale and matches the other CSV
# files written by this notebook.
with open('important_words.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows([_word] for _word in filtered_words)

In [7]:
import re

# Flatten every question/answer string into one searchable text blob.
document_content = ' '.join(data)

# Two alternatives: (1) a word that contains at least one underscore, or
# (2) a word immediately followed by "()".
# NOTE(review): the `(?<!\.py)` lookbehinds look intended to exclude file
# names — confirm they have the desired effect.
pattern = r'\b[a-zA-Z0-9]+\w*_+\w+?(?<!\.py)\b|\b[a-zA-Z0-9]+\w+\(\)(?<!\.py\b)'

# Unique matches in sorted order.
entities = sorted(set(re.findall(pattern, document_content)))


In [19]:
# Spot check: print every question/answer string mentioning 'test_output_str'.
for qa_text in data:
    if 'test_output_str' not in qa_text:
        continue
    print(qa_text)

Is it possible to customize the level of specificity in the generated output? Yes, it is possible to customize the level of specificity in the generated output. This can be achieved by adjusting the input parameters and output type in the LLM Engine function, as demonstrated in the "TestOutputStr" class in the "test_output_str.py" file. By defining specific input parameters and output types, the generated output can be tailored to meet the desired level of specificity.


In [32]:
# Entity names (underscore identifiers and "()" call forms) whose surrounding
# question/answer text is collected in the following cell.
# NOTE(review): presumably hand-picked from the regex matches in `entities`
# — TODO confirm how this subset was chosen.
filtered_entities = [
 'add_data',
 'add_improve_statements',
 'add_metric',
 'add_model',
 'bad_examples',
 'cancel_job',
 'check_job_status',
 'circular_operation',
 'compare_equal_metric',
 'configure_llama',
 'edit_config',
 'error_handling',
 'filter_fn',
 'full_balance_dataset',
 'gen_queue_batch',
 'gen_submit_training_job',
 'get_job_results',
 'get_response',
 'good_examples',
 'improve()',
 'is_peft_model',
 'length_penalty',
 'llm()',
 'make_discriminator',
 'make_questions',
 'max_retries',
 'max_tokens',
 'model_name',
 'parse_response',
 'repetition_penalty',
 'run_all',
 'sample()',
 'stochastic_balance_dataset',
 'submit_job',
 'test_cache',
 'test_output_str',
 'test_parallel_complex',
 'test_parallel_simple',
 'value_to_dict',
 'write_story']

In [33]:
import pickle as pkl

# For each entity, concatenate (space-separated) every question/answer string
# that contains the entity name as a plain substring. Each entry is
# [entity, joined_text]; the text is empty when nothing matches.
context = [
    [func, ' '.join(qa for qa in data if func in qa)]
    for func in filtered_entities
]

# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open("context.pkl", "wb") as pkl_file:
    pkl.dump(context, pkl_file)

In [34]:
import csv
file_path = 'context.csv'

# Dump the entity/context pairs as CSV: header row first, then one row per pair.
with open(file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows([["Entity", "Context"], *context])


In [46]:
# NOTE(review): "summary.pkl" is not written anywhere in the visible cells, so
# it presumably comes from an external step — confirm its provenance.
# SECURITY: unpickling can execute arbitrary code; only load trusted files.
# The context manager closes the handle instead of leaking it.
with open("summary.pkl", "rb") as pkl_file:
    summary = pkl.load(pkl_file)

file_path = 'summary.csv'

# Writing to a CSV file: header first, then one row per item in `summary`
# (assumes each item is an (entity, summary) sequence — TODO confirm).
with open(file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Entity", "Summary"])
    writer.writerows(summary)
