In [1]:
import os
import re
import shutil
import tempfile
import textwrap

import mlflow
import requests
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_ollama import OllamaLLM
from langchain_openai import OpenAI, OpenAIEmbeddings
from termcolor import colored
from tqdm import tqdm

In [2]:
def create_faiss_database(document_path, database_save_directory, chunk_size=500, chunk_overlap=50, batch_size=100):
    """Build a FAISS vector index from a text file and save it to disk.

    The file is loaded with ``TextLoader``, split with
    ``RecursiveCharacterTextSplitter``, embedded with ``OpenAIEmbeddings`` in
    batches of ``batch_size`` chunks, and written with ``FAISS.save_local``.

    Args:
        document_path: Path of the text file to index.
        database_save_directory: Directory passed to ``FAISS.save_local``.
        chunk_size: ``chunk_size`` handed to the splitter.
        chunk_overlap: ``chunk_overlap`` handed to the splitter.
        batch_size: Number of chunks added to the index per step; must be >= 1.

    Returns:
        The populated ``FAISS`` vector store.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if splitting the
            document produces no chunks (there would be nothing to index).
    """
    # Fail fast, before any file or network access: a non-positive batch size
    # would otherwise surface later as an unrelated-looking error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Load text
    loader = TextLoader(document_path)
    raw_documents = loader.load()
    print(f"Loaded {len(raw_documents)} document(s)")

    # Smart splitting: try the coarsest separator first, then fall back.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    chunks = splitter.split_documents(raw_documents)
    print(f"Split into {len(chunks)} chunks")

    # Without chunks the loop below never creates an index, and saving would
    # crash on ``None``; report the real problem instead.
    if not chunks:
        raise ValueError(f"No text chunks were produced from {document_path!r}; nothing to index")

    # Embedding in batches: the first batch creates the index, later batches
    # are appended to it.
    embedding_model = OpenAIEmbeddings()
    faiss_db = None
    for start in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
        batch = chunks[start:start + batch_size]
        if faiss_db is None:
            faiss_db = FAISS.from_documents(batch, embedding_model)
        else:
            faiss_db.add_documents(batch)

    faiss_db.save_local(database_save_directory)
    return faiss_db



def print_answer_formatted(answer, max_line_length=100):
    """Print a model answer word-wrapped, with its reasoning shown separately.

    Behaviour:
    1. Every printed line is at most ``max_line_length`` characters long
       (default 100), except that a single word longer than the limit is
       printed unbroken on its own line.
    2. The content of the first ``<think> ... </think>`` section is printed
       in cyan.
    3. After the ``<think>`` content, two empty lines are printed.
    4. The remaining text is printed uncoloured.

    Args:
        answer: The answer text, optionally containing a ``<think>`` section.
        max_line_length: Maximum number of characters per printed line.
    """
    # Extract <think> ... </think> content (DOTALL so it may span lines).
    think_match = re.search(r"<think>(.*?)</think>", answer, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        rest_content = answer.replace(think_match.group(0), "").strip()
    else:
        think_content = ""
        rest_content = answer

    # Helper to print with max line length
    def print_wrapped(text, color=None):
        # Collapse all whitespace runs (including newlines) to single spaces
        # so wrapping operates on plain words.
        normalized = " ".join(text.split())
        wrapped_lines = textwrap.wrap(
            normalized,
            width=max(1, max_line_length),  # textwrap rejects widths below 1
            break_long_words=False,  # never split a word across lines
            break_on_hyphens=False,  # treat hyphenated words as single words
        )
        for line in wrapped_lines:
            print(colored(line, color) if color else line)

    # Print <think> content in light color (e.g., 'cyan')
    if think_content:
        print_wrapped(think_content, color="cyan")
        print("\n")  # "\n" plus print's own newline gives two empty lines

    # Print the rest
    if rest_content:
        print_wrapped(rest_content)

In [3]:
# Source text for the CMAPSS FD001 data and the location of its FAISS index.
doc_path = "local_text/CMAPSS_FD001.txt"
faiss_directory = "faiss_database/CMAPSS_FD001/"
persist_dir = os.path.join(faiss_directory, "faiss_index")

# Make sure the index's parent directory exists (no error if already present).
os.makedirs(faiss_directory, exist_ok=True)

# The index build is disabled here; uncomment to (re)create it.
# Use smaller chunk size to avoid exceeding token limits
# vector_db = create_faiss_database(doc_path, persist_dir,chunk_size=500, batch_size=100)

In [4]:
# Inputs for logging: the path passed as `python_model` and a sample request.
example_input = {"query": "What is the CMAPSS dataset?"}
code_path = "ollama_pyfunction.py"

# Select the experiment and enable MLflow's OpenAI autologging.
mlflow.set_experiment("Ollama RAG")
mlflow.openai.autolog()

# Log the pyfunc model with `persist_dir` attached as an artifact.
# NOTE(review): the recorded output of this cell shows the input-example check
# failing with "'ndarray' object cannot be converted to 'PyString'" — confirm
# how the model's predict reads the "query" value.
with mlflow.start_run() as run:
    model_info = mlflow.pyfunc.log_model(
        python_model=code_path,
        name="deepseek-r1-8b-CMAPSS",
        input_example=example_input,
        artifacts={"persist_directory": persist_dir},
    )


2025/07/20 22:20:49 INFO mlflow.pyfunc: Inferring model signature from input example


Downloading artifacts:   0%|          | 0/2 [00:00<?, ?it/s]

  "inputs": {
    "query": "What is the CMAPSS dataset?"
  }
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: argument 'text': 'ndarray' object cannot be converted to 'PyString'
2025/07/20 22:20:58 INFO mlflow.models.model: Found the following environment variables used during model inference: [OPENAI_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


In [5]:
# Display the URI of the model returned by `mlflow.pyfunc.log_model`.
model_info.model_uri

'models:/m-e1b55123a0e24e208f62b2fe53ed9c91'

In [6]:
# Load the logged model back through its URI; later cells call `predict` on it.
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)



In [7]:
# First question: send it to the loaded model and pretty-print the "result" field.
query1 = {"query": "What is CMAPSS?"}
answer1 = loaded_model.predict(query1)
print_answer_formatted(answer1["result"])

[36mHmm, the user has provided several lines of context from what looks like a dataset, all starting[0m
[36mwith "96" and including engine IDs, time stamps, operation data, and sensor readings. They want me[0m
[36mto explain what CMAPSS is based on this information. Looking at these examples, I can see that each[0m
[36mrow contains an engine ID (first column), time of reading (second column), operation status in[0m
[36mcolumns 3-4 or more depending on the dataset snippet, and then various sensor values. This appears[0m
[36mto be a dataset related to machine performance monitoring, specifically for engines with multiple[0m
[36msensors tracking different parameters over time. CMAPSS is likely an acronym - perhaps it stands[0m
[36mfor something like China Manufacturer of Air Piston Engines Propulsion System Simulation? Given[0m
[36mthat all these data points belong to engine 96 (from the examples), this would be a simulation[0m
[36mdataset rather than real operational d

In [9]:
# Inspect which chunks the retriever returns for a sample question.
# `invoke` replaces the deprecated `get_relevant_documents` retriever method.
# NOTE(review): `vector_db` is not assigned by any active code visible in this
# notebook — build or load the index before running this cell.
vector_db.as_retriever().invoke("What is GCPATr?")

  vector_db.as_retriever().get_relevant_documents("What is GCPATr?")


[Document(id='0e6f8bef-d217-4ba1-874f-603842c02340', metadata={'source': 'local_text/paper.txt'}, page_content='In this work, we have used the GCPATr architecture that incorporates both graph convolution and position awareness into the standard transformer architecture. The overall processing pipeline of GCPATr is shown in Figure \\ref{Overall_GCPATr_Pipeline}. \n\nThe position aware spectral graph self-attention block is shown in Figure \\ref{SG_PA_SA_block} and the position aware self-attention block of PATr is shown in Figure \\ref{PA_SA_block}. Different from the position aware self-attention block of PATr \\cite{chattopadhyay2024position}, the linear query, key and value extractors have been replaced with spectral graph convolution operations in GCPATr. Other than this difference the working of the two blocks is the same.'),
 Document(id='895cf545-4560-4354-80da-d2344f4a2ae0', metadata={'source': 'local_text/paper.txt'}, page_content='The essence of the transformer architecture is

In [8]:
# Ask the loaded model for a dataset summary and pretty-print the reply.
answer2 = loaded_model.predict(
    {"query": "Can you summerize this dataset?"}
)
print_answer_formatted(answer2["result"])

[36mOkay, the user wants me to summarize a CMAPSS dataset that appears to be related to engine[0m
[36mperformance monitoring and predicting Remaining Useful Life (RUL). From the context provided, I can[0m
[36msee this is likely time-series sensor data for multiple engines over time. Each row seems to[0m
[36mrepresent a different measurement point with several key components: - First column: Engine ID -[0m
[36mSecond column: Time of reading - Third and fourth columns: Some operation-related values (always[0m
[36m100.0 in these examples) - Remaining columns: Various engine sensor readings Hmm, looking at the[0m
[36mdata patterns across multiple contexts, I notice some interesting characteristics about this[0m
[36mdataset: The first two rows for each engine ID show very similar values except for slight[0m
[36mvariations that could indicate real operational changes or measurement noise. There's a clear[0m
[36mpattern where certain columns remain constant (like the third 

In [10]:
# Ask for an RUL prediction on one pasted sensor row. The query is split across
# adjacent literals only for readability; the resulting string is unchanged.
answer3 = loaded_model.predict(
    {
        "query": (
            "I sample a sensor data from another machine. "
            "Depends on the dataset I give you, can you predict it's RUL?"
            "i,e. the second column of the training dataset.  "
            "1 31 -0.0006 0.0004 100.0 518.67 642.58 1581.22 1398.91 14.62 21.61 "
            "554.42 2388.08 9056.40 1.30 47.23 521.79 2388.06 8130.11 8.4024 0.03 "
            "393 2388 100.00 38.81 23.3552"
        )
    }
)
print_answer_formatted(answer3["result"])

[36mOkay, let me try to figure this out. The user provided some context about the CMAPSS dataset and[0m
[36mthen asked if I can predict the Remaining Useful Life (RUL) of a sampled sensor data from another[0m
[36mmachine. First, looking at the context, it seems like there's a structure in each row. The first[0m
[36mcolumn is an engine ID, the second is time, columns 3-5 are operation-related info, and then there[0m
[36mare many more columns for sensor readings. So RUL prediction probably uses all these features[0m
[36mexcept maybe the time or ID since we're predicting future life based on current state. But wait,[0m
[36mhow does one row relate to another? The question mentions "the second column of the training[0m
[36mdataset," which is likely the target variable (RUL). However, without knowing what that actually[0m
[36mrepresents in each sample—like whether it's a label for how much more time until failure or[0m
[36msomething else—it's hard. Maybe the user assumes I

In [10]:
# Probe whether the model keeps conversational state between predict calls.
# NOTE(review): this rebinds `answer3`, replacing the value an earlier cell stored.
answer3 = loaded_model.predict(
    {"query": "Can you repeat my previous question?"}
)
print_answer_formatted(answer3["result"])

I don't know what your previous question was. I have no memory of past interactions. Please provide
the question you've asked before.


In [None]:
# Reload the model from its URI. This cell has no recorded execution and
# repeats a load already performed earlier in the notebook.
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

In [None]:
# Display the logged model's URI again (cell has no recorded execution).
model_info.model_uri

In [None]:
# Another unexecuted repeat of the model load; rebinds `loaded_model`.
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)