In [34]:
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor, LLM
from txtai.pipeline import Textractor

import subprocess
import re
import os
import nltk

In [35]:
# Instantiate the txtai LLM pipeline from the Mistral-7B-OpenOrca GGUF file named below.
# The resulting `llm` object is the one execute() calls with the assembled prompt.
llm = LLM("TheBloke/Mistral-7B-OpenOrca-GGUF/mistral-7b-openorca.Q4_K_M.gguf", verbose=False)

In [36]:
# Make sure NLTK's 'punkt' package is available locally (the call returns True, see output).
# NOTE(review): nothing visible in this notebook calls nltk directly — presumably it is
# needed by the Textractor pipeline; confirm, otherwise this cell can go.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tahaelhihi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [37]:
def stream(path, suffix="context994.txt"):
  """
  Lazily extract text from the matching files of a directory.

  The entries of `path` are visited in sorted name order. For every name that
  ends with `suffix`, the full path is printed and handed to the module-level
  `textractor`; whatever it returns is yielded.

  Args:
    path: directory to scan (not recursive)
    suffix: file-name ending to select, or a tuple of endings. Defaults to
      "context994.txt", the value that used to be hard-coded.

  Yields:
    The result of `textractor(fpath)` for each selected file.
  """

  for f in sorted(os.listdir(path)):
    if not f.endswith(suffix):
      continue

    fpath = os.path.join(path, f)
    print(f"Indexing {fpath}")

    # `textractor` is looked up when the generator is iterated, so it only has
    # to exist by the time the index is built, not when this function is defined.
    yield textractor(fpath)

# Text-extraction pipeline shared by stream(); stream() resolves this name when iterated.
textractor = Textractor()


# Build the search index from the files stream() selects in the current directory.
# NOTE(review): content=True is presumably what lets search() return records with a
# "text" field (context() reads results[0]["text"]) — confirm against the txtai docs.
embeddings = Embeddings(content=True)
embeddings.index(stream("."))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Indexing ./clustcontext994.txt
Indexing ./regcontext994.txt


In [38]:
def execute(question, context):
  """
  Wrap a question and its retrieved context in a ChatML-style prompt and run the LLM.

  Args:
    question: text inserted under "Input:" in the user turn
    context: text inserted as the system turn (inside this body the parameter
      shadows the module-level context() function)

  Returns:
    The result of llm(prompt, maxlength=4096).
  """
  # The template is whitespace-sensitive: it is emitted exactly as written, including
  # the newline and two spaces that follow the final assistant tag.
  prompt = f"""<|im_start|>system
{context}

<|im_end|>
<|im_start|>user
# Task:
Analyze the input sentence and extract necessary elements according to the instructions given above. 
Make sure the output strictly follows the specified format without any additional words or deviations.
Input:
{question}

<|im_end|>
<|im_start|>assistant
  """

  return llm(prompt, maxlength=4096)


In [39]:
def context(question):
  """
  Fetch the instruction text that best matches a question.

  Runs the question through the module-level `embeddings` index and returns the
  "text" field of the first hit, or the literal "No results found" when the
  search comes back empty.
  """
  hits = embeddings.search(question)
  return hits[0]["text"] if hits else "No results found"

def rag(question):
  """
  Answer a question with retrieval-augmented generation.

  Retrieves the matching context for the question, then passes both to execute().
  """
  retrieved = context(question)
  return execute(question, retrieved)
     

In [40]:
# Set KMP_DUPLICATE_LIB_OK for this process (and the subprocesses it spawns).
# NOTE(review): presumably the usual workaround for the OpenMP "duplicate runtime" abort.
# It runs after the model and index cells above — confirm it still takes effect there,
# or move it next to the imports.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [60]:
def clean_intermediate(input_string):
    """
    Turn free-form LLM output into a bracketed, comma-separated word list.

    Newlines and whitespace runs are collapsed to single spaces, every character
    that is neither a word character nor whitespace is dropped, the text is
    lower-cased, and the remaining words are joined as "[w1, w2, ...]".
    An input with no words yields "[]".
    """
    flattened = re.sub(r'\s+', ' ', input_string.replace('\n', ' ')).strip()
    # Punctuation is deleted, not replaced, so "a,b" becomes the single word "ab".
    words_only = re.sub(r'[^\w\s]', '', flattened)
    tokens = words_only.lower().split()
    return '[{}]'.format(', '.join(tokens))

In [61]:
def call_prolog_parser(input_string):
    """
    Run the DCG in parser.pl over a Prolog list and return what swipl prints.

    `input_string` must already be Prolog list syntax (e.g. "[a, b, c]"). It is
    interpolated verbatim into the goal, so it should only ever come from
    clean_intermediate() or another trusted source.

    Returns:
        swipl's standard output with surrounding whitespace stripped. The goal
        writes Commands and then Attributes, one per line.
    """
    goal = (
        f"phrase(parse(Commands, Attributes), {input_string}), "
        "writeln(Commands), writeln(Attributes),halt"
    )

    # Only stdout is captured; the exit status is not inspected.
    completed = subprocess.run(
        ['swipl', '-g', goal, '-t', 'halt(0)', 'parser.pl'],
        stdout=subprocess.PIPE,
        text=True,
    )
    return completed.stdout.strip()


In [63]:
def llm_to_AutoML_translator(query): 
    """
    Translate a natural-language request into AutoML symbols and print the result.

    The query goes through rag() to get the LLM's intermediate description, which
    clean_intermediate() turns into a Prolog word list for call_prolog_parser().
    The parser is expected to print two lines: the command list and the data columns.

    Prints the intermediate text followed by both parser lines. If the parser
    returns fewer than two lines (for example when the parse fails and swipl
    prints nothing), a diagnostic is printed instead of failing with IndexError.

    Args:
        query: the user's request in plain English

    Returns:
        None; all results are printed.
    """
    intermediate = rag(query)
    cleaned_intermediate = clean_intermediate(intermediate)
    output = call_prolog_parser(cleaned_intermediate)

    print("intermediate:")
    print(intermediate)
    print('\n')

    lines = output.split('\n')
    if len(lines) < 2:
        # Show what was sent and what came back so the failure can be diagnosed.
        print("Prolog parser did not return both result lines.")
        print("Token list sent to the parser:")
        print(cleaned_intermediate)
        print("Raw parser output:")
        print(repr(output))
        return

    print("AutoML Symbols:")
    print(lines[0])

    print("Data columns:")
    print(lines[1])

In [52]:
# Example: regression request that names the technique explicitly (height from volume).
llm_to_AutoML_translator("Predict height based on volume through regression.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AutoML Symbols:
Regression, dependent attribute: height, independent attributes: volume.


AutoML Symbols:
[loadData,calculateVolume,splitTrainTestData,knnRegression,regressionMSEError]
Data columns:
[[],[surfaceInSquareMeters,heightInMeters],[],[[volume],height],knnRegression_OUTPUT]


In [53]:
# Example: question-style request with one attribute (volume); the recorded run below
# produced a clustering pipeline.
llm_to_AutoML_translator("Given the objects dataset, how many types of objects can be found, according to their volume?")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AutoML Symbols:
Clustering attributes: [volume]


AutoML Symbols:
[loadData,calculateVolume,clusterData,clusterCount]
Data columns:
[[],[surfaceInSquareMeters,heightInMeters],[volume],clusterData_OUTPUT]


In [64]:
# Example: "train a model" phrasing with one independent attribute (volume from surface).
llm_to_AutoML_translator("Train a model that can estimate volume based on surface given the objects dataset.") 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


intermediate:
Regression: dependent attribute: volume, independent attributes: surface.


AutoML Symbols:
[loadData,calculateVolume,splitTrainTestData,knnRegression,regressionMSEError]
Data columns:
[[],[surfaceInSquareMeters,heightInMeters],[],[[surface],volume],knnRegression_OUTPUT]


In [55]:
# Example: question-style request over two attributes (volume and height).
llm_to_AutoML_translator("How many different types of objects based on volume and height.") 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AutoML Symbols:
Clustering attributes: volume, height


AutoML Symbols:
[loadData,calculateVolume,clusterData,clusterCount]
Data columns:
[[],[surfaceInSquareMeters,heightInMeters],[volume,height],clusterData_OUTPUT]


In [56]:
# Example: regression on mass from surface; the recorded run below has no calculateVolume step.
llm_to_AutoML_translator("Train a model that can estimate mass based on surface given the objects dataset.") 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AutoML Symbols:
Regression:
dependent attribute: mass,
independent attributes: surface.


AutoML Symbols:
[loadData,splitTrainTestData,knnRegression,regressionMSEError]
Data columns:
[[],[],[[surface],mass],knnRegression_OUTPUT]


In [57]:
# Example: clustering request that names the technique explicitly (by mass).
llm_to_AutoML_translator("Cluster objects based on mass.") 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AutoML Symbols:
Clustering attributes: mass.


AutoML Symbols:
[loadData,clusterData,clusterCount]
Data columns:
[[],[mass],clusterData_OUTPUT]


In [58]:
# Example: regression with two independent attributes (volume from surface and height).
llm_to_AutoML_translator("Train a model that can estimate volume based on surface and height given the objects dataset.") 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AutoML Symbols:
Regression, dependent attribute: volume, independent attributes: surface, height.


AutoML Symbols:
[loadData,calculateVolume,splitTrainTestData,knnRegression,regressionMSEError]
Data columns:
[[],[surfaceInSquareMeters,heightInMeters],[],[[surface,height],volume],knnRegression_OUTPUT]


In [65]:
 # Example: regression on an attribute (density) for which the recorded run below adds a
 # calculateDensity step.
 llm_to_AutoML_translator("Train a model that can estimate volume based on density given the objects dataset.") 

intermediate:
Regression, dependent attribute: volume, independent attributes: density.


AutoML Symbols:
[loadData,calculateVolume,calculateDensity,splitTrainTestData,knnRegression,regressionMSEError]
Data columns:
[[],[surfaceInSquareMeters,heightInMeters],[massInKilograms,surfaceInSquareMeters],[],[[density],volume],knnRegression_OUTPUT]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
