### setup vectorestore client


In [2]:
import requests
import time

def search_searx(query, max_retries=3):
    searx_url = 'http://localhost:8002/'
    params = {
        'q': query,
        'format': 'json'
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(searx_url, params=params)
            response.raise_for_status()  # Raise an exception for 4XX or 5XX status codes
            data = response.json()
            return data['results']
        except requests.RequestException as e:
            print(f"Error fetching data (attempt {attempt+1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                print("Retrying after 5 seconds...")
                time.sleep(5)
            else:
                print("Max retries exceeded.")
                return []

def print_search_results(results):
    if results:
        for result in results:
            print(f"Title: {result['title']}")
            print(f"URL: {result['url']}")
            print(f"Content: {result['content']}")
            print(f"Source: {result['engine']}")
            print()
    else:
        print("No results found.")

# Example usage
query = "stencil thickness"
results = search_searx(query)
print_search_results(results)


Error fetching data (attempt 1/3): 429 Client Error: TOO MANY REQUESTS for url: https://search.bus-hit.me/?q=stencil+thickness&format=json
Retrying after 5 seconds...
Error fetching data (attempt 2/3): 429 Client Error: TOO MANY REQUESTS for url: https://search.bus-hit.me/?q=stencil+thickness&format=json
Retrying after 5 seconds...
Error fetching data (attempt 3/3): 429 Client Error: TOO MANY REQUESTS for url: https://search.bus-hit.me/?q=stencil+thickness&format=json
Max retries exceeded.
No results found.


In [1]:
import weaviate
client= weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_port=50051,
    grpc_host="localhost",
    grpc_secure=False,
)
client.is_ready()
client.collections.exists(name="snowflake")


True

In [27]:
from langchain_community.llms import Ollama
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
load_dotenv()

llama_8b = Ollama(model="llama3")
llama_70b= ChatGroq(model="Llama3-70b-8192",temperature=0.4)
mixtral= ChatGroq(model="Mixtral-8x7b-32768",temperature=0.4)

for chunk in llama_8b.stream("Tell me a joke"):
    print(chunk,flush=True)

Here
's
 one
:


Why
 couldn
't
 the
 bicycle
 stand
 up
 by
 itself
?


Because
 it
 was
 two
-t
ired
!


Hope
 that
 made
 you
 laugh
!



### Embeddings

In [30]:
from langchain_community.embeddings import OllamaEmbeddings

In [31]:
nomic = OllamaEmbeddings(model="nomic-embed-text")
mxbai = OllamaEmbeddings(model="mxbai-embed-large")
snowflake = OllamaEmbeddings(model="snowflake-arctic-embed")
#embeddings3 = OllamaEmbeddings(model="snowflake-arctic-embed")

In [32]:
text = "Tell me a joke"
e1 = nomic.embed_query(text)
e2 = mxbai.embed_query(text)
e3 = snowflake.embed_query(text)
#embeddings3 = OllamaEmbeddings(model="snowflake-arctic-embed")


In [33]:
len(e1), len(e2), len(e3)

(768, 1024, 1024)

### Prepare vectorstore

In [37]:
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.indexes import SQLRecordManager,index
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

collection_name = "snowflake"#os.getenv("collection_name")
#collection = client.collections.get(collection_name)
db = WeaviateVectorStore(client=client, index_name=collection_name, embedding=snowflake,text_key="text")

namespace = f"weaviete/{collection_name}"

record_manager = SQLRecordManager(
        namespace, db_url="sqlite:///record_manager_cache.sql",
    )

record_manager.create_schema()

In [2]:
client.collections.delete("web")

In [3]:
client.collections.exists("web")

False

In [38]:
search_kwargs = {'k': 4, "alpha": 0.5 }

# Correct usage: Unpacking the search_kwargs directly into the method call
result = db.similarity_search(query="what is causal relationship", **search_kwargs)

print(result)

[]


### Testing Chunking Strategy

In [39]:
from langchain_community.document_loaders import PyMuPDFLoader
loader = PyMuPDFLoader("causal_ml_research/LiteratureForEmbedding_EPUseCase/ProjectThesis_of_Dataset.pdf")
docs=loader.load()

In [40]:
import os
for doc in docs:
    doc.metadata["source"]=os.path.basename(doc.metadata["source"])

docs[1].metadata


{'source': 'ProjectThesis_of_Dataset.pdf',
 'file_path': 'causal_ml_research/LiteratureForEmbedding_EPUseCase/ProjectThesis_of_Dataset.pdf',
 'page': 1,
 'total_pages': 148,
 'format': 'PDF 1.7',
 'title': '',
 'author': 'wu16bici',
 'subject': '',
 'keywords': '',
 'creator': 'Microsoft® Word für Microsoft 365',
 'producer': 'Microsoft® Word für Microsoft 365',
 'creationDate': "D:20211130144525+01'00'",
 'modDate': "D:20211130144525+01'00'",
 'trapped': ''}

### process the English docs

In [51]:
import os
from langchain_community.document_loaders import PyMuPDFLoader

def load_pdfs_from_directory(directory):
    # This dictionary will store the filename and its corresponding document content
    loaded_pdfs = {}

    # Walk through all directories and files in the specified directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                # Construct full file path
                file_path = os.path.join(root, file)
                try:
                    # Load the PDF file
                    loader = PyMuPDFLoader(file_path)
                    docs = loader.load()
                    
                    for doc in docs:
                        doc.metadata["source"]=os.path.basename(doc.metadata["source"])
                        
                    status=index(docs,record_manager,db,cleanup="incremental",source_id_key="source")
                    # Store the loaded document in the dictionary
                    loaded_pdfs[file_path] = doc
                    print(f"Loaded: {file_path}---{status}")
                except Exception as e:
                    print(f"Failed to load {file_path}: {str(e)}")

    return loaded_pdfs

# Specify the directory to search for PDF files
directory_path = "causal_ml_research/Data_and_Graph_Preparation/EP_UseCase"
pdf_documents = load_pdfs_from_directory(directory_path)

# You now have a dictionary `pdf_documents` with paths as keys and loaded document contents as values


Loaded: causal_ml_research/Data_and_Graph_Preparation/EP_UseCase/Project Thesis Ben Rachinger.pdf---{'num_added': 148, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}


In [55]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("causal_ml_research/Data_and_Graph_Preparation/USV_UseCase/parameter_description.txt")
docs=loader.load()
index(docs,record_manager,db,cleanup="incremental",source_id_key="source")

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [49]:
# Assuming 'search_kwargs' is a dictionary with search parameters
search_kwargs = {'k': 5, "alpha": 0.5 }

# Correct usage: Unpacking the search_kwargs directly into the method call
result = db.similarity_search(query="stencil thickness", **search_kwargs)
result

[Document(page_content='Fundamentals \n \n28 \n \n \n \n \nFigure 21: SMT assembly process [35] \nSolder paste application: \nSolder paste can be applied to the pads of the PCB by using different processes. While \nstencil printing and screen printing are the most common methods, the paste can also \nbe applied by dispensing or by a jet. \nIn stencil printing a rigid stencil commonly made of stainless steel with a thickness of \naround 150 µm is used. The stencil has apertures which are created by etching, laser \ncutting or in a galvanic process. The stencil is positioned on top of the PCB in direct \ncontact. Solder paste is added onto the stencil and is pushed across the stencil by \nusing a squeegee. The squeegee also generates a downward pressure onto the solder \npaste which pushes it through the stencil apertures onto the pads of the PCB [54].  \nThe main difference of screen printing is that it uses a flexible, fine-meshed screen with \na mesh size of about 55 µm which is not i

In [12]:
retriever=db.as_retriever(search_kwargs={'k': 5,"alpha": 0.1,"filters":None})

### LLaMA 8B

In [78]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from operator import itemgetter
from langchain import hub
from langchain_core.prompts import PromptTemplate

variable1="variable1",
variable2="variable2",

template = """Please assess the likelihood that 
variable '{variable1}' has an influence on variable 
'{variable2}'. Rate the likelihood that '{variable1}' is a cause 
of '{variable2}' on a scale from 0.0 to 1.0, where 0.0 means 
you  are  certain  that  there  is  no  causal  relationship  and  1.0 
means you are certain that there might be a causal 
relationship to some (potentially small) degree. Consider the 
background information I provided as well as the parameter 
description. Use logical reasoning and provide a justification 
for your assessment in two sentences. The answer must start 
with a float between 0.0 and 1.0 followed by the two-sentence 
explanation.
consider the following context before answering:

{context}



Helpful Answer:"""

default_rag_prompt = PromptTemplate.from_template(template)

prompt = hub.pull("rlm/rag-prompt")
llm = llama_70b

retriever=db.as_retriever(search_kwargs={'k': 5,"alpha": 0.1,"filters":None})

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

c=default_rag_prompt| {"variable1":RunnablePassthrough(),"variable2":RunnablePassthrough()}
rag_chain = (
    
    {"context":retriever | format_docs,"question":c}
    
    | llm
    | StrOutputParser()
)

In [None]:
from langchain_core.runnables import RunnableParallel
retriever=db.as_retriever(search_kwargs={'k': 5,"alpha": 1,"filters":None})

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llama_70b
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [84]:
rag_chain_with_source.get_graph().print_ascii()

                                          +---------------------------------+                            
                                          | Parallel<context,question>Input |                            
                                          +---------------------------------+                            
                                                   **              **                                    
                                                ***                  ***                                 
                                              **                        **                               
                                +----------------------+            +-------------+                      
                                | VectorStoreRetriever |            | Passthrough |                      
                                +----------------------+            +-------------+                      
                                              

In [81]:
from langchain_core.runnables import RunnableParallel

retriever=db.as_retriever(search_kwargs={'k': 5,"alpha": 1,"filters":None})

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llama_70b
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

rag_chain_with_source.invoke('''Please assess the likelihood that 
variable 'stencilthickness' has an influence on variable 
'solderpastetype'. Rate the likelihood that 'stencilthickness' is a cause 
of 'solderpastetype' on a scale from 0.0 to 1.0, where 0.0 means 
you  are  certain  that  there  is  no  causal  relationship  and  1.0 
means you are certain that there might be a causal 
relationship to some (potentially small) degree. Consider the 
background information I provided as well as the parameter 
description. Use logical reasoning and provide a justification 
for your assessment in two sentences. The answer must start 
with a float between 0.0 and 1.0 followed by the two-sentence 
explanation.''')

/Users/pratikraut/miniconda3/envs/env/lib/python3.10/site-packages/langchain_groq/chat_models.py:432: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/pratikraut/miniconda3/envs/env/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


{'context': [Document(page_content='viii\xa0\n\xa0\nFigure 3.2 Weibull fit of 100 TTFs (in table 3.1) generated by the sample code in \nchapter 5, where N=100, VV=0.1, p=0.0001………………………………………...31 \nFigure 3.3 Weibull fit of the TTFs from (a) run 1, (b) run 2, (c) run 3, (d) run 4, (e) run \n5, (f) run 6, (g) run 7, (h) run 8 shown in table 3.3…………………………………..33 \nFigure 3.4 Half Normal Plot of Absolute effects on Log(η)…………………………36 \nFigure 3.5 Half Normal Plot of Absolute effects on β.................................................36 \nFigure 3.6 C-against-B Plots on affecting shape parameter β......................................37 \nFigure 3.7 Location effects on Log(TTFs)…………………………………………...39 \nFigure 3.8 Dispersion effects on Log(TTFs)…………………………………………39 \nFigure 3.9 Interaction plot of B and C on dispersion effects on Log(TTFs) ..............40 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n', metadata={'subject': '', 'creator': 'PScript5.dll Version 5.2', 'total_pag

In [86]:
from langchain_core.runnables import RunnableParallel
retriever=db.as_retriever(search_kwargs={'k': 5,"alpha": 0.1,"filters":None})

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llama_70b
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

rag_chain_with_source.invoke('''Please assess the likelihood that 
variable 'stencilthickness' has an influence on variable 
'solderpastetype'. Rate the likelihood that 'stencilthickness' is a cause 
of 'solderpastetype' on a scale from 0.0 to 1.0, where 0.0 means 
you  are  certain  that  there  is  no  causal  relationship  and  1.0 
means you are certain that there might be a causal 
relationship to some (potentially small) degree. Consider the 
background information I provided as well as the parameter 
description. Use logical reasoning and provide a justification 
for your assessment in two sentences. The answer must start 
with a float between 0.0 and 1.0 followed by the two-sentence 
explanation.''')

/Users/pratikraut/miniconda3/envs/env/lib/python3.10/site-packages/langchain_groq/chat_models.py:432: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/pratikraut/miniconda3/envs/env/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


{'context': [Document(page_content='Long Term Reliability \n243                              Long Term Reliability \n   \n \n \n \n                  \n      (8.15)                \n, where: \nУ is the dependent variable; \nɑ, the intercept,   \nb, the slope of the line, and \nx is the independent variable. \n \nIt is evident that the linear equation on the chart with the R-squared value of 0.9952 is very \nclose to 1.0 showing a strong correlation. It indicates that the regression line of best fit in the \ngiven figure (Figure 8.16) is a fair estimate of the actual relationship between Concentration \n(x) and Absorbance (y), for the alloying compound evaluated. However, an accurate judgement \nand statistical prediction as to how well a regression line (Srinivasan, Pamula and Fair, 2004) \nrepresents a true relationship require information such as the number of data points collected \n(NC State University, 2004).  \n \n \n \n \nFigure 8.16: An estimation of true relationship between co

In [13]:
from langchain.tools.retriever import create_retriever_tool

tool = create_retriever_tool(
    retriever,name="causal_relation_retriever",
    description="used to retrieve information about causal relationship and likelihood information",
    
    
)
tools = [tool]

In [14]:
from langchain import hub

prompt = hub.pull("hwchase17/openai-tools-agent")
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [22]:
from langchain.agents import AgentExecutor, create_openai_tools_agent
import warnings

warnings.filterwarnings("ignore")

agent = create_openai_tools_agent(llm=llama_70b, tools=tools, prompt=prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools,verbose=True)

In [23]:
agent_executor.invoke({"input": "hi"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mHi! I'm happy to help you with anything you need. What's on your mind today?[0m

[1m> Finished chain.[0m


{'input': 'hi',
 'output': "Hi! I'm happy to help you with anything you need. What's on your mind today?"}

In [27]:
variable1 = "headsegment_id"
variable2 = "placement ydeviation"

default_prompt = f'''Please assess the likelihood that 
variable '{variable1}' has an influence on variable 
'{variable2}'. Rate the likelihood that '{variable1}' is a cause 
of '{variable2}' on a scale from 0.0 to 1.0, where 0.0 means 
you  are  certain  that  there  is  no  causal  relationship  and  1.0 
means you are certain that there might be a causal 
relationship to some (potentially small) degree. Consider the 
background information I provided as well as the parameter 
description. Use logical reasoning and provide a justification 
for your assessment in two sentences. The answer must start 
with a float between 0.0 and 1.0 followed by detailed explaination.
*explaination*-'''

agent_executor.invoke({"input": default_prompt})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m0.8

Based on the provided information, I assess the likelihood that 'headsegment_id' has an influence on 'placement ydeviation' as 0.8. This is because 'headsegment_id' could potentially affect the placement of an object, and therefore, the y-deviation of that placement, but without more specific information about the context and the variables, it's difficult to be certain about the causal relationship.[0m

[1m> Finished chain.[0m


{'input': "Please assess the likelihood that \nvariable 'headsegment_id' has an influence on variable \n'placement ydeviation'. Rate the likelihood that 'headsegment_id' is a cause \nof 'placement ydeviation' on a scale from 0.0 to 1.0, where 0.0 means \nyou  are  certain  that  there  is  no  causal  relationship  and  1.0 \nmeans you are certain that there might be a causal \nrelationship to some (potentially small) degree. Consider the \nbackground information I provided as well as the parameter \ndescription. Use logical reasoning and provide a justification \nfor your assessment in two sentences. The answer must start \nwith a float between 0.0 and 1.0 followed by detailed explaination.\n*explaination*-",
 'output': "0.8\n\nBased on the provided information, I assess the likelihood that 'headsegment_id' has an influence on 'placement ydeviation' as 0.8. This is because 'headsegment_id' could potentially affect the placement of an object, and therefore, the y-deviation of that plac

In [28]:
async for chunk in agent_executor.astream(
    {"input": default_prompt}
):
    # Agent Action
    if "actions" in chunk:
        for action in chunk["actions"]:
            print(f"Calling Tool: `{action.tool}` with input `{action.tool_input}`")
    # Observation
    elif "steps" in chunk:
        for step in chunk["steps"]:
            print(f"Tool Result: `{step.observation}`")
    # Final result
    elif "output" in chunk:
        print(f'Final Output: {chunk["output"]}')
    else:
        raise ValueError()
    print("---")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m0.8 

Based on the provided information, I assess the likelihood that 'headsegment_id' has an influence on 'placement ydeviation' as 0.8. This is because 'headsegment_id' might be related to the physical properties of an object or a system, and 'placement ydeviation' could be a measure of the object's or system's position or alignment, making it plausible that the 'headsegment_id' could have a causal influence on the 'placement ydeviation'.[0m

[1m> Finished chain.[0m
Final Output: 0.8 

Based on the provided information, I assess the likelihood that 'headsegment_id' has an influence on 'placement ydeviation' as 0.8. This is because 'headsegment_id' might be related to the physical properties of an object or a system, and 'placement ydeviation' could be a measure of the object's or system's position or alignment, making it plausible that the 'headsegment_id' could have a causal influence on the 'placement ydeviation'.
---


In [33]:
import arxiv

def get_arxiv_papers(query,num):
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=num,
        sort_by=arxiv.SortCriterion.Relevance
    )
    results = client.results(search)
    arxiv_url = []
    for r in client.results(search):
        arxiv_url.append(r.pdf_url)
    return arxiv_url
urls=get_arxiv_papers("surface mount technology",10)

In [34]:
from langchain_community.document_loaders import UnstructuredURLLoader

loader = UnstructuredURLLoader(urls=urls,mode="single",show_progress_bar=True)
t=loader.load()
t

100%|██████████| 10/10 [00:31<00:00,  3.20s/it]


[Document(page_content='8 0 0 2 c e D 1 3\n\n] t e d - s n i . s c i s y h p [\n\n1 v 6 3 1 0 . 1 0 9 0 : v i X r a\n\n∗\n\n∗\n\nFIG. 2: The gray box en\rloses a prototyping grid that was\n\nadded to a temperature \rontroller \rir\ruit. This area \ran be\n\nused for debugging and/or adding \romponents to the board\n\nin\rluding an SOIC \rhip. Power for this area \ran be a\r\ressed\n\nfrom the vias in the upper right \rorner of the (cid:28)gure, whi\rh\n\n±\n\n±\n\nprovide\n\n5 and\n\n12 volts.\n\nplane. Also, as with standard \rir\ruit design, a\rtive \rom-\n\nponents should all be pla\red so that pin 1 is in the same\n\norientation for every IC. Pla\re as many surfa\re mount\n\nomponents on the same side of the board as possible.\n\nOnly one side of the board \ran be soldered at on\re and\n\nany \romponents on the opposite side will have to be sol-\n\ndered by hand.\n\nFinally, we re\rommend in\rluding a small prototyping\n\narea on the PCB where ground, power, and extra pin\n\npads \

In [25]:
import arxiv

arxiv_url = []

class Arxiv_search():
    
    def get_arxiv_papers(self,query,num):
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=num,
            sort_by=arxiv.SortCriterion.Relevance
        )
        results = client.results(search)
        
        for r in client.results(search):
            arxiv_url.append(r.pdf_url)
        return arxiv_url


loader = Arxiv_search()
loader.get_arxiv_papers("surface mount technology",10)


['http://arxiv.org/pdf/0901.0136v1',
 'http://arxiv.org/pdf/2309.17008v1',
 'http://arxiv.org/pdf/2401.12107v2',
 'http://arxiv.org/pdf/2001.09612v1',
 'http://arxiv.org/pdf/2001.09619v1',
 'http://arxiv.org/pdf/2311.10352v1',
 'http://arxiv.org/pdf/2008.04903v1',
 'http://arxiv.org/pdf/2211.14882v1',
 'http://arxiv.org/pdf/2002.01527v1',
 'http://arxiv.org/pdf/1907.04124v2']

In [23]:
from xploreapi import XPLORE
query = XPLORE('api_access_key')
query.queryText('query')
data = query.callAPI()

In [30]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.utilities import SearxSearchWrapper
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.indexes import SQLRecordManager,index
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import weaviate
from langchain_community.embeddings import OllamaEmbeddings
from dotenv import load_dotenv
import os 
import arxiv



load_dotenv()




def get_arxiv_papers(query,num):
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=num,
            sort_by=arxiv.SortCriterion.Relevance
        )
        results = client.results(search)
        arxiv_url = []
        for r in client.results(search):
            arxiv_url.append(r.pdf_url)
        return arxiv_url




def remove_extra_spaces(text):
        """
        This function removes extra white spaces from a string.

        Args:
            text: The string to remove extra white spaces from.

        Returns:
            A string with extra white spaces removed.
        """
        return " ".join(text.split())

docs=[]

class custom_loader:
    
    

    def web_loader(self,query,num=10,arxiv=True ):
        """
        This function loads a web pages into vectordatabase fore rag

        """
##############################initialize the vector store##############################
        
        snowflake = OllamaEmbeddings(model="snowflake-arctic-embed")
        #nomic = OllamaEmbeddings(model="nomic-embed-text")
        #mxbai = OllamaEmbeddings(model="mxbai-embed-large")
        #embeddings3 = OllamaEmbeddings(model="snowflake-arctic-embed")
        client= weaviate.connect_to_custom(
            http_host="localhost",
            http_port=8080,
            http_secure=False,
            grpc_port=50051,
            grpc_host="localhost",
            grpc_secure=False,
        )
        client.is_ready()

        collection_name = "smt_web"#os.getenv("collection_name")
        #collection = client.collections.get(collection_name)
        db = WeaviateVectorStore(client=client, index_name=collection_name, embedding=snowflake,text_key="text")

        namespace = f"weaviete/{collection_name}"

        record_manager = SQLRecordManager(
                namespace, db_url="sqlite:///record_manager_cache.sql",
            )

        record_manager.create_schema()
        
##############################initialize the web search##############################

        
        search = SearxSearchWrapper(searx_host=os.getenv("searx_host"))
        t=search.results(query, num_results=num)
        arxiv_results=get_arxiv_papers(query=query,num=num)
        urls=list(i["link"] for i in t)
        if arxiv:
            urls= urls + arxiv_results
        else:
            urls=urls
        for url in urls:

            loader = UnstructuredURLLoader(urls=[url],mode="single",show_progress_bar=True)
            text=loader.load()
            for i in text:
                string=remove_extra_spaces(i.page_content)
            text_splitter = RecursiveCharacterTextSplitter(
            # Set a really small chunk size, just to show.
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            is_separator_regex=False,
            )
            texts = text_splitter.create_documents([string],metadatas=[{"source":url}])
            print(url)
            status=index(texts,record_manager,db,cleanup="incremental",source_id_key="source")
            print(status)
            print("-----------------------------------")
            docs.extend(texts)
        return "all webpages loaded into database"

       
loader = custom_loader()   
    
loader.web_loader("stencil thickness")



  Base = declarative_base()
  import cgi
100%|██████████| 1/1 [00:01<00:00,  1.99s/it]


https://www.7pcb.com/blog/stencil-thickness-calculations
{'num_added': 5, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:00<00:00,  1.02it/s]


https://smartsmttools.com/test1/
{'num_added': 5, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]


https://www.surfacemountprocess.com/a-guide-to-effective-stencil-design.html
{'num_added': 12, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:04<00:00,  4.79s/it]


https://smtnet.com/library/files/upload/Stencil-Design-Guidelines.pdf
{'num_added': 19, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


https://www.nextpcb.com/blog/smt-stencil
{'num_added': 29, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:05<00:00,  5.09s/it]


https://www.multi-circuit-boards.eu/en/pcb-design-aid/smd-stencils.html
{'num_added': 15, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


https://www.qualiecocircuits.co.nz/stencil-technology-other-aspects.htm
{'num_added': 27, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:00<00:00,  3.92it/s]


https://jlcpcb.com/blog/413-how-to-choose-a-smt-stencil
{'num_added': 10, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:01<00:00,  1.48s/it]


https://www.belfuse.com/resources/ApplicationNotes/PowerSolutions/app-note-BPS-Non-Isolated-DC-DC-Converter-Eutectic-Solder-Process.pdf
{'num_added': 10, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:00<00:00,  1.55it/s]


https://www.mktpcb.com/pcb-stencil/
{'num_added': 18, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:04<00:00,  4.02s/it]


http://arxiv.org/pdf/2210.09999v1
{'num_added': 71, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:08<00:00,  8.10s/it]


http://arxiv.org/pdf/2205.03354v2
{'num_added': 86, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:02<00:00,  2.54s/it]


http://arxiv.org/pdf/2002.05983v1
{'num_added': 59, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:12<00:00, 12.26s/it]


http://arxiv.org/pdf/1605.09737v1
{'num_added': 12, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:01<00:00,  1.04s/it]


http://arxiv.org/pdf/2401.13645v1
{'num_added': 87, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:16<00:00, 16.62s/it]


http://arxiv.org/pdf/2107.13910v2
{'num_added': 75, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:02<00:00,  2.41s/it]


http://arxiv.org/pdf/2009.04619v2
{'num_added': 89, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:02<00:00,  2.27s/it]


http://arxiv.org/pdf/2010.04868v1
{'num_added': 80, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


http://arxiv.org/pdf/1609.04567v1
{'num_added': 55, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


100%|██████████| 1/1 [00:03<00:00,  3.52s/it]


http://arxiv.org/pdf/0802.2674v2
{'num_added': 60, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
-----------------------------------


'all webpages loaded into database'

In [31]:
docs

[Document(page_content='log in my Account Track your order English English Español Française Deutsch 日本語 sales@7pcb.com 1.888.812.1949 Home About Us About Bittele Certificates Why Bittele Mission & Vision Holiday Calendar 2024 Testimonial Careers Trade Shows Conflict Minerals Stmt. Code of Business Conduct Services PCB Fabrication Fabrication Capabilities PCB Materials HDI PCBs Impedance Controlled PCBs PCB Electrical Testing PCB FAQ PCB Quote Online PCB Assembly PCB Assembly Services PCB Assembly Process PCB Assembly FAQ Online Ordering FAQ PCB Assembly Quote Prototype PCB Assembly Low Volume PCB Assembly SMT Assembly BGA Assembly Parts Management IC Programming Functional Testing Quality Resources DFM Guidelines DFA Guidelines How to create a Centroid File How to Export Gerber from Eagle How to Export Gerber from Altium How to Export Gerber from KiCad PCB Glossary Trace Width Calculator Free Passive Parts Blog Contact Us Instant Online Quote and Order Instant Online Quote and Order I

In [7]:
for doc in docs:
    print(doc.page_content)
    
    print("-----------------------------------")

SURFACE MOUNT PROCESS Home Articles FAQ Contract Electronic Manufacturing Resources PRINTED CIRCUIT BOARD MANUFACTURE COMPONENT SUPPLY MACHINE SPARE PARTS Services Contact SOLDER PASTE PRINTING PROCESS One of the most important parts of the surface mount assembly process is the application of solder paste to the printed circuit board (PCB). The aim of this process is to accurately deposit the correct amount onto each of the pads to be soldered. This is achieved by screen-printing the solder paste through a stencil or foil but also can be applied by jet printing. It is widely believed that this part of the process, if not controlled correctly, accounts for the majority of assembly defects. The most common method of applying solder paste to a PCB using a stencil printer is squeegee blade printing – see images below. The squeegees are the tools used to apply the necessary force required to move the solder paste across the stencil and on to the PCB. They are usually made from metal but can

In [1]:
import os
import csv
import pandas as pd 

# Specify the directory containing the text files
directory = 'crewai/outputs/'

# List to hold the data for CSV
data = []

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        # Split the filename to extract variables
        parts = filename.split('-')
        if len(parts) >= 2:
            variable1 = parts[0]
            variable2 = parts[1].split('.')[0]  # Remove the file extension

            # Read the first character from the file to get the score
            with open(os.path.join(directory, filename), 'r') as file:
                score = file.read(3)

            # Append the extracted data to the list
            data.append([variable1, variable2, score])

# Write the data to a CSV file
csv_filename = 'test.csv'
with open(csv_filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Write the header
    csvwriter.writerow(['variable1', 'variable2', 'score'])
    # Write the data rows
    csvwriter.writerows(data)

print(f"Data successfully written to {csv_filename}")
df= pd.read_csv("test.csv")
df

Data successfully written to test.csv


Unnamed: 0,variable1,variable2,score
0,3054_PiezozuGehaeuse_PositionY,3054_Loetpastenpunkt_0_PositionY,0.8
1,3052_Dispenser_2_Masse_Dosierposition_x,2572_SPPC_O_R_s_OTC,0.0
2,3054_PiezozuGehaeuse_PositionY,2572_SPPC_O_R_s_OTC,0.8
3,UVImage_glueing_area,2946_SchaumhoehePins_Schaumhoehe,0.0
4,UVImage_piezoType,3052_Dispenser_2_Masse_Dosierposition_x,0.0
5,UVImage_piezoType,2946_SchaumhoehePins_Schaumhoehe,0.0
6,UVImage_piezoType,3052_Dispenser_2_Masse_Dosierposition_y,0.0
7,UVImage_piezoType,3052_Dispenser_1_Signal_Dosierposition_x,0.0
8,3054_PiezozuGehaeuse_PositionY,UVImage_glueing_area,0.4
9,3052_Dispenser_2_Masse_Dosierposition_y,2946_SchaumhoehePins_Schaumhoehe,0.2


In [3]:
import pandas as pd
from pyvis.network import Network
from IPython.display import IFrame


# Assuming df is your DataFrame and it has a column named 'variable1'
# Create a list of unique nodes from the 'variable1' column
nodes = df['variable1'].unique().tolist()

# Create an instance of the Network class
net = Network(notebook=True,cdn_resources="in_line")

# Add nodes to the network
net.add_nodes(nodes)
net.add_node("2572_SPPC_O_R_s_OTC")
net.node_ids
for i in range(len(df.score)):
    if df.score[i] > 0.0 :
        net.add_edge(df.variable1[i], df.variable2[i])

# Optionally, you can add edges or further customize the network
# net.add_edges([...])  # Add edges if needed

# Show the network in an HTML file
net.show_buttons(filter_=['physics',])
net.show("test.html")



test.html


In [None]:
"""An example program that uses the elsapy module"""

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
from dotenv import load_dotenv
load_dotenv()
    
## Load configuration

## Initialize client
client = ElsClient(api_key=os.getenv('elsakey'))



## Author example
# Initialize author with uri
my_auth = ElsAuthor(
        uri = 'https://api.elsevier.com/content/author/author_id/7004367821')
# Read author data, then write to disk
if my_auth.read(client):
    print ("my_auth.full_name: ", my_auth.full_name)
    my_auth.write()
else:
    print ("Read author failed.")

## Affiliation example
# Initialize affiliation with ID as string
my_aff = ElsAffil(affil_id = '60101411')
if my_aff.read(client):
    print ("my_aff.name: ", my_aff.name)
    my_aff.write()
else:
    print ("Read affiliation failed.")

## Scopus (Abtract) document example
# Initialize document with ID as integer
scp_doc = AbsDoc(scp_id = 84872135457)
if scp_doc.read(client):
    print ("scp_doc.title: ", scp_doc.title)
    scp_doc.write()   
else:
    print ("Read document failed.")

## ScienceDirect (full-text) document example using PII
pii_doc = FullDoc(sd_pii = 'S1674927814000082')
if pii_doc.read(client):
    print ("pii_doc.title: ", pii_doc.title)
    pii_doc.write()   
else:
    print ("Read document failed.")

## ScienceDirect (full-text) document example using DOI
doi_doc = FullDoc(doi = '10.1016/S1525-1578(10)60571-5')
if doi_doc.read(client):
    print ("doi_doc.title: ", doi_doc.title)
    doi_doc.write()   
else:
    print ("Read document failed.")


## Load list of documents from the API into affilation and author objects.
# Since a document list is retrieved for 25 entries at a time, this is
#  a potentially lenghty operation - hence the prompt.
print ("Load documents (Y/N)?")
s = input('--> ')

if (s == "y" or s == "Y"):

    ## Read all documents for example author, then write to disk
    if my_auth.read_docs(client):
        print ("my_auth.doc_list has " + str(len(my_auth.doc_list)) + " items.")
        my_auth.write_docs()
    else:
        print ("Read docs for author failed.")

    ## Read all documents for example affiliation, then write to disk
    if my_aff.read_docs(client):
        print ("my_aff.doc_list has " + str(len(my_aff.doc_list)) + " items.")
        my_aff.write_docs()
    else:
        print ("Read docs for affiliation failed.")

## Initialize author search object and execute search
auth_srch = ElsSearch('authlast(keuskamp)','author')
auth_srch.execute(client)
print ("auth_srch has", len(auth_srch.results), "results.")

## Initialize affiliation search object and execute search
aff_srch = ElsSearch('affil(amsterdam)','affiliation')
aff_srch.execute(client)
print ("aff_srch has", len(aff_srch.results), "results.")

## Initialize doc search object using Scopus and execute search, retrieving 
#   all results
doc_srch = ElsSearch("AFFIL(dartmouth) AND AUTHOR-NAME(lewis) AND PUBYEAR > 2011",'scopus')
doc_srch.execute(client, get_all = True)
print ("doc_srch has", len(doc_srch.results), "results.")

## Initialize doc search object using ScienceDirect and execute search, 
#   retrieving all results
doc_srch = ElsSearch("star trek vs star wars",'sciencedirect')
doc_srch.execute(client, get_all = False)
print ("doc_srch has", len(doc_srch.results), "results.")

In [48]:
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
from dotenv import load_dotenv
import os
load_dotenv()

    
## Load configuration

## Initialize client
client = ElsClient(api_key=os.getenv('elsakey'),num_res=10)
client.local_dir = "./elsa/"

# doc_srch = ElsSearch("sensor",'scopus')
# doc_srch.execute(client, get_all = False)

# pii_doc = FullDoc(sd_pii ='S1674927814000082')
# if pii_doc.read(client):
#     print( json.dumps(pii_doc.data,indent=2))
    
    
     
# else:
#     print ("Read document failed.")
doc_srch = ElsSearch("sensor",'scopus')
doc_srch.execute(client, get_all = False)
for doc in doc_srch.results:
    try :
        doi_doc = FullDoc(doi = doc["prism:doi"])
        if doi_doc.read(client):
            print ("doi_doc ", doi_doc.data["originalText"])
            #doi_doc.write()   
        else:
            print ("Read document failed.")
        #print(doc["prism:doi"])
    except:
        print("No DOI")
        continue


Read document failed.
doi_doc  serial JL 780951 291210 291696 291869 291870 31 90 Cyber Security and Applications CYBERSECURITYAPPLICATIONS 2024-04-14 2024-04-14 2024-04-26 2024-04-26 2024-04-26T09:34:32 1-s2.0-S2772918424000237 S2772-9184(24)00023-7 S2772918424000237 10.1016/j.csa.2024.100057 S250 S250.1 FULL-TEXT 1-s2.0-S2772918424X00028 2024-04-26T08:53:27.886899Z 0 0 20251201 20251231 2025 2024-04-14T01:36:35.335464Z articleinfo articlenumber articletitlenorm authfirstinitialnorm authfirstsurnamenorm cid cids contenttype copyright crossmark dateloaded dateloadedtxt datesearch datesort dateupdated dco docsubtype doctype doi eid ewtransactionid hubeid indexeddate issn issnnorm itemstage itemtransactionid itemweight oauserlicense openaccess openarchive pg pgfirst pii piinorm pubdateend pubdatestart pubdatetxt pubyr sortorder srctitle srctitlenorm srctype ssids alllist content oa subj subheadings suppl tomb volfirst volissue volumelist webpdf webpdfpagecount yearnav figure table body a

In [16]:
pii_doc = FullDoc(sd_pii = 'S1001074224000548')
if pii_doc.read(client):
    print(pii_doc.data)
     
else:
    print ("Read document failed.")

{'coredata': {'prism:url': 'https://api.elsevier.com/content/article/pii/S1001074224000548', 'dc:identifier': 'doi:10.1016/j.jes.2024.01.057', 'eid': '1-s2.0-S1001074224000548', 'prism:doi': '10.1016/j.jes.2024.01.057', 'pii': 'S1001-0742(24)00054-8', 'dc:title': 'Meteorological and traffic effects on air pollutants using Bayesian networks and deep learning ', 'prism:publicationName': 'Journal of Environmental Sciences', 'prism:aggregationType': 'Journal', 'pubType': 'fla', 'prism:issn': '10010742', 'prism:volume': '152', 'prism:startingPage': '54', 'prism:endingPage': '70', 'prism:pageRange': '54-70', 'dc:format': 'application/json', 'prism:coverDate': '2025-06-30', 'prism:coverDisplayDate': 'June 2025', 'prism:copyright': '© 2024 The Research Center for Eco-Environmental Sciences, Chinese Academy of Sciences. Published by Elsevier B.V.', 'prism:publisher': 'The Research Center for Eco-Environmental Sciences, Chinese Academy of Sciences. Published by Elsevier B.V.', 'dc:creator': [{'@

In [29]:

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
from dotenv import load_dotenv
import os
load_dotenv()
    
## Load configuration

## Initialize client
client = ElsClient(api_key=os.getenv('elsakey'))
client.local_dir = "./elsa/"

doc_srch = ElsSearch("sensor",index="scopus")
doc_srch.execute(client, get_all = False)

# for doc in doc_srch.results :
    
#     pii_doc = FullDoc(sd_pii = doc["pii"])
    
#     if pii_doc.read(client):
#         print(pii_doc.data)
#         print("--------------------------------------\n")
     
#     else:
#         print ("Read document failed.")
for doc in doc_srch.results :
    print(doc["prism:doi"])
    doi_doc = FullDoc(doi = doc["prism:doi"])
    if doi_doc.read(client):
        print ( doi_doc.data["originalText"])
         
    else:
        print ("Read document failed.")

10.4152/pea.2025430101
Read document failed.
10.1016/j.csa.2024.100057
serial JL 780951 291210 291696 291869 291870 31 90 Cyber Security and Applications CYBERSECURITYAPPLICATIONS 2024-04-14 2024-04-14 2024-04-26 2024-04-26 2024-04-26T09:34:32 1-s2.0-S2772918424000237 S2772-9184(24)00023-7 S2772918424000237 10.1016/j.csa.2024.100057 S250 S250.1 FULL-TEXT 1-s2.0-S2772918424X00028 2024-04-26T08:53:27.886899Z 0 0 20251201 20251231 2025 2024-04-14T01:36:35.335464Z articleinfo articlenumber articletitlenorm authfirstinitialnorm authfirstsurnamenorm cid cids contenttype copyright crossmark dateloaded dateloadedtxt datesearch datesort dateupdated dco docsubtype doctype doi eid ewtransactionid hubeid indexeddate issn issnnorm itemstage itemtransactionid itemweight oauserlicense openaccess openarchive pg pgfirst pii piinorm pubdateend pubdatestart pubdatetxt pubyr sortorder srctitle srctitlenorm srctype ssids alllist content oa subj subheadings suppl tomb volfirst volissue volumelist webpdf web