In [89]:
import pandas as pd
import os 
allncs = "All NCs (New) Non-conformance data YTD.xlsx"
allncs = pd.read_excel(allncs)
allncs.head()

Unnamed: 0,SupplierName,NonconformanceNumber,Disposition,FiscalPeriod,DispositionQty,PartNumber,PartName,NonconformanceDescription,PONumber,Pattern,UOM,NONCONFORMANCE_SOURCE,SiteNo,SiteName
0,,NC000508022,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",Drill bit stuck in plug/stem,,,EA,INPROCESS,123,Sherman
1,,NC000508205,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",stem was pulled wrong on job and already drilled,,,EA,INPROCESS,123,Sherman
2,,NC000507834,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",Broke carbide drill bit in stem and plug,,,EA,INPROCESS,123,Sherman
3,,NC000507094,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",Threads damaged during assembly.,,,EA,INPROCESS,123,Sherman
4,,NC000507350,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",drill bit broke off insde,,,EA,INPROCESS,123,Sherman


In [71]:
import ollama
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
allncs.columns, len(allncs)
print(f"Total nc's in Study : {len(allncs)}")
ollama.list()
columns = allncs.columns
nltk.download('stopwords')
nltk.download('wordnet')

Total nc's in Study : 26017


[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from tqdm import tqdm
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Text cleaning function
def clean_text(text):
   text = re.sub(r'\W', ' ', str(text))
   text = re.sub(r'\s+', ' ', text)
   text = text.lower()
   return text

def process_description(description):
   cleaned_text = clean_text(description)
   return cleaned_text

cols = ['SupplierName',]
def prepare_data(df,numrows= 1000):
    """
    Helper function to prepare data and return a new DataFrame with 'nc', 'vector',
    'NonconformanceNumber', and 'PartNumber' columns.
    :param df: Input pandas DataFrame.
    :return: DataFrame with four columns: 'nc', 'vector', 'NonconformanceNumber', 'PartNumber'.
    """
    # Create an empty list to store the processed data
    data = []
    if numrows:
        subset_df = df.iloc[:numrows]
    else:
        numrows = len(df)
        subset_df = df.iloc[:numrows]

    # Iterate through each row in the DataFrame
    for index, row in tqdm(subset_df.iterrows(), total=numrows):
        # Convert the row to a dictionary where the column names are keys
        row_dict = {col: str(row[col]) for col in df.columns}
        
        # Extract specific columns for 'NonconformanceNumber' and 'PartNumber' if they exist
        nonconformance_number = row_dict.get('NonconformanceNumber', '')
        part_number = row_dict.get('PartNumber', '')

        # Flatten the dictionary into a string for embeddings
        flat_text = ', '.join([f"{key}: {value.replace('nan', '')}" for key, value in row_dict.items()])
        
        # Generate the embedding using the provided model
        embedding = ollama.embed(model='wizardlm2', input=[flat_text])['embeddings'][0]

        # Append the data to the list, including NonconformanceNumber and PartNumber
        data.append({
            'nc': flat_text,
            'vector': embedding,
            'NonconformanceNumber': nonconformance_number,
            'PartNumber': part_number
        })
    
    # Convert the list of dictionaries into a DataFrame
    result_df = pd.DataFrame(data, columns=['nc', 'vector', 'NonconformanceNumber', 'PartNumber'])
    
    return result_df

allncs['NonconformanceDescription'] = allncs['NonconformanceDescription'].apply(process_description)
allncs.head()
#vectorizedncs = prepare_data(allncs)
#vectorizedncs.head()


Unnamed: 0,SupplierName,NonconformanceNumber,Disposition,FiscalPeriod,DispositionQty,PartNumber,PartName,NonconformanceDescription,PONumber,Pattern,UOM,NONCONFORMANCE_SOURCE,SiteNo,SiteName
0,,NC000508022,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",drill bit stuck in plug stem,,,EA,INPROCESS,123,Sherman
1,,NC000508205,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",stem was pulled wrong on job and already drilled,,,EA,INPROCESS,123,Sherman
2,,NC000507834,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",broke carbide drill bit in stem and plug,,,EA,INPROCESS,123,Sherman
3,,NC000507094,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",threads damaged during assembly,,,EA,INPROCESS,123,Sherman
4,,NC000507350,SCRAP,P09,1,1U388835162,"STEM,PLUG ~ VSC1L3750,3/8X 8.88,2.38THD",drill bit broke off insde,,,EA,INPROCESS,123,Sherman


## CREATE A LANCEDB TABLE OUT OF THE VECTORIXED NCS

In [10]:
import lancedb
import pandas as pd
import pyarrow as pa
uri = "data/ncs-lancedb"
db = lancedb.connect(uri)
#db.drop_table("vectorizedncs")
vectorizedncs_tbl = db.create_table("vectorizedncs", vectorizedncs)

## Table for the Human-in-Loop feedback 

In [11]:
import pyarrow as pa
# Define the schema using pyarrow
HITLschema = pa.schema([
    ("id", pa.string()),         # Unique identifier (non-nullable)
    ("content", pa.string()),     # Nullable content field
    ("rating", pa.int32()),      # Non-nullable rating
    ("comment", pa.string()),     # Nullable comment
    ("timestamp", pa.string()),  # Non-nullable timestamp
    ("vector", pa.list_(pa.float32())),  # Nullable vector field
])

if "HITL" not in db.table_names():
    table = db.create_table(
        "HITL",
        schema=HITLschema
    )

ZeroDivisionError: integer division or modulo by zero

In [None]:
print("LanceDB Table Schema:", hitlfeedback_table.schema())

In [112]:
pip install fastapi uvicorn pydantic

165083.64s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting fastapi
  Using cached fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.2-py3-none-any.whl.metadata (6.0 kB)
Downloading fastapi-0.115.5-py3-none-any.whl (94 kB)
Downloading starlette-0.41.2-py3-none-any.whl (73 kB)
Installing collected packages: starlette, fastapi
Successfully installed fastapi-0.115.5 starlette-0.41.2
Note: you may need to restart the kernel to use updated packages.


## MAKE vectorizedncs TABLE AVAILABLAE FOR SEARCH 

In [1]:
# Asynchronous client
import lancedb
import pandas as pd
import pyarrow as pa
uri = "data/ncs-lancedb"
db = lancedb.connect(uri)
vectorizedncs_tbl = db.open_table("vectorizedncs")
HITL_tbl = db.open_table("HITL")


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
_embedding = ollama.embed(model='wizardlm2', input=["this is a long story"])

In [39]:
len(_embedding["embeddings"][0])
_embedding["embeddings"][0]

[0.014314272,
 0.0075008026,
 0.0029022475,
 -0.011569215,
 -0.00016692483,
 5.4083674e-05,
 -0.019224642,
 -0.0049812305,
 -0.0033825291,
 0.0019008218,
 0.019404152,
 0.0049391156,
 0.005652264,
 0.018642142,
 0.0037362059,
 0.023412444,
 -0.009780672,
 0.0070514977,
 0.008344694,
 0.0019607856,
 0.008302369,
 -0.0021478292,
 0.012837712,
 -0.008198916,
 -0.012597016,
 0.0078319395,
 0.016328542,
 -0.012091717,
 -0.009265471,
 -0.00492631,
 0.00080683164,
 0.008593105,
 -0.008854711,
 0.003795029,
 0.011158671,
 -0.0063165193,
 0.011127091,
 0.019033356,
 -0.0017751795,
 0.015734479,
 0.0051869573,
 -0.0022567252,
 -0.004385228,
 -0.005386533,
 0.0027583619,
 0.020028526,
 -0.0038013859,
 -0.028517688,
 -0.0035837106,
 -0.011683618,
 0.0027481436,
 -0.0056212354,
 0.0049157618,
 -0.020366874,
 -0.0008573796,
 0.008344317,
 -0.0073084272,
 -0.013116626,
 0.016908495,
 -0.019120092,
 -0.0066941553,
 -0.008744865,
 0.0049263635,
 0.023335507,
 -0.0067921695,
 -0.004312931,
 0.007695846,

## do a search on the table

In [73]:
#vectorizedncs_tbl.schema
vectorizedncs_tbl.search("select first 10 rows").limit(10).to_list()

[{'nc': 'SupplierName: , NonconformanceNumber: NC000497170, Disposition: SCRAP, FiscalPeriod: P07, DispositionQty: 10, PartNumber: 24810, PartName: DIAPHRAGM, 32IN2 ACTUATOR, NonconformanceDescription: moldeo y prueba de parámetros pruebas first article, PONumber: , Pattern: , UOM: EA, NONCONFORMANCE_SOURCE: , SiteNo: 106, SiteName: Toluca',
  'vector': [0.0054954644292593,
   0.0027982930187135935,
   0.00046255733468569815,
   0.0022560711950063705,
   0.0003444157191552222,
   0.010802621953189373,
   -0.011346657760441303,
   0.011828010901808739,
   0.012683196924626827,
   -0.008006161078810692,
   0.004582981113344431,
   -0.004733576439321041,
   -0.004286200739443302,
   0.00645057437941432,
   0.006623947061598301,
   -0.004264120478183031,
   0.002045430475845933,
   0.0007514304015785456,
   0.008906022645533085,
   0.012670693919062614,
   -0.019856978207826614,
   0.015448139980435371,
   -0.0037881871685385704,
   0.02904803492128849,
   -0.0006012838566675782,
   -0.005

In [59]:
vectorizedncs_tbl.create_index(metric="cosine", vector_column_name="vector")


ValueError: dimension (4096) must be divisible by num_sub_vectors (96)

In [None]:
import ollama
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import RRFReranker
topics ='Drill bit stuck'
question = f"{topics}"
reranker = RRFReranker()
pd.set_option('display.max_colwidth', None)
# Embed Question
query_embedding = ollama.embed(model='wizardlm2', input=question)
# Semantic Search
results = vectorizedncs_tbl.search(query_embedding['embeddings'][0]).limit(5).to_pandas()
# results = vectorizedncs_tbl.search(question, query_type="fts").limit(5).select(["nc"]).to_list()
# display(results[["nc","_distance"]])


# results = (
#     vectorizedncs_tbl.search(
#         question,
#         query_type="hybrid",
#         vector_column_name="vector",
#         fts_columns="nc",
#     )
#     .rerank(reranker)
#     .limit(10)
#     .to_pandas()
# )
display(results)

ValueError: 
                The query used for vector search is not a string.
                In this case, the reranker query needs to be specified explicitly.
                

In [85]:
print(vectorizedncs_tbl.schema)

nc: string
vector: fixed_size_list<item: float>[4096]
  child 0, item: float
NonconformanceNumber: string
PartNumber: string


In [27]:
vectorizedncs_tbl.create_fts_index("nc")

In [20]:
import ollama
topic = 'Welding'
id = 1
print(rows[id])
#row = "SupplierName: INSPECTECH CORP, NonconformanceNumber: NC000475110, Disposition: SCRAP, FiscalPeriod: P02, DispositionQty: 1, PartNumber: N902882Q32972, PartName: BONNET, NonconformanceDescription: film for rt was received from inspectech on july 31 2023 and was rejected job orders since oracle have all the reviews occurring at the close of the order instead of real time so we were not made aware of this fact until the film was reviewed on oct 18 2023 the result was losing 2 ½ months of recovery time r westberg, PONumber: 4290000937, Pattern: , UOM: , NONCONFORMANCE_SOURCE: SUPPLIER, SiteNo: Couldn't find SiteNo for: mansfield, SiteName: mansfield"
result = ollama.chat(model='wizardlm2', messages=[{'role': 'user', 'content': f'read - {rows[id]},Do not give any analysis, strictly answer in YES or NO (DONT ADD ANY ADDITIONAL INFORMATION) , tell me if it is related to {topic}?'}])
result

SupplierName: , NonconformanceNumber: NC000502471, Disposition: RTV, FiscalPeriod: P08, DispositionQty: 11, PartNumber: N902683Q32960, PartName: ADJ BOLT, NonconformanceDescription: all 11 adjusting bolts are missing the 125 depth dimension these 4 would not take go ring thread pitch is over max 3 9439 min 3 9374 ab073856 002 over ab073858 001 over ab073859 0003 over ab073860 0003 over , PONumber: , Pattern: , UOM: , NONCONFORMANCE_SOURCE: SUPPLIER, SiteNo: Couldn't find SiteNo for: mansfield, SiteName: mansfield


{'model': 'wizardlm2',
 'created_at': '2024-12-07T15:07:27.15227Z',
 'message': {'role': 'assistant',
  'content': 'NO, the nonconformance described does not specifically relate to welding based on the information provided. It pertains to the depth dimension of the adjusting bolts being out of the specified range and the go ring thread pitch not within the maximum allowed.'},
 'done_reason': 'stop',
 'done': True,
 'total_duration': 3073137833,
 'load_duration': 9168708,
 'prompt_eval_count': 297,
 'prompt_eval_duration': 1269000000,
 'eval_count': 52,
 'eval_duration': 1791000000}

In [12]:
result

{'model': 'llama3.2',
 'created_at': '2024-12-07T14:59:51.336671Z',
 'message': {'role': 'assistant',
  'content': "Based on the information provided, it appears that there is a nonconformance (NC000475110) from INSPECTECH CORP regarding a film for RT. The disposition of this nonconformance is SCRAP with 1 unit disposed of.\n\nThe reason for this nonconformance was due to a delay in receiving reviews from Oracle. Normally, reviews would be available at the close of the order, but due to this issue, it wasn't until October 18, 2023, that the film was reviewed and found to have resulted in losing 2½ months of recovery time.\n\nThere is also another nonconformance (PONumber: 4290000937) related to a bonnet (PartNumber: N902882Q32972). This nonconformance seems unrelated to welding as the pattern, UOM (Unit of Measure), and NONCONFORMANCE_SOURCE (Supplier) are not specified.\n\nHowever, without more information about the specific issue with the bonnet, it's difficult to determine if it's r

In [24]:
import requests
import json
def RA(messages, model):
    r = requests.post(
        "http://127.0.0.1:11434/api/chat",
        json={"model": model, "messages": messages, "stream": True},
        stream=True
    )
    r.raise_for_status()
    output = ""

    for line in r.iter_lines():
        body = json.loads(line)
        if "error" in body:
            raise Exception(body["error"])
        if body.get("done") is False:
            message = body.get("message", "")
            content = message.get("content", "")
            output += content
        if body.get("done", False):
            message["content"] = output
            return message

In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict
import requests
import json

# Request model for API
class InsightsRequest(BaseModel):
    rows: List[Dict[str, str]]  # List of rows with non-conformance data
    question: str               # User's question
    model: str = "wizardlm2"        # Default model to use

# Route to get insights
def get_insights(request: InsightsRequest):
    """
    Fetch insights from the RA function based on provided non-conformance data and user question.

    Args:
        request (InsightsRequest): Contains the 20 rows of data, user question, and model.

    Returns:
        dict: Insights generated by the RA function.
    """
    try:
        # Prepare the prompt
        rows_context = "\n".join(
            [f"{idx+1}. {json.dumps(row)}" for idx, row in enumerate(request.rows)]
        )
        messages = [
            {
                "role": "system",
                "content": (
                    "You are an intelligent assistant specializing in analyzing non-conformance data. "
                    "Use the data provided as context to answer the question accurately."
                )
            },
            {
                "role": "user",
                "content": (
                    f"### Non-Conformance Data:\n{rows_context}\n\n"
                    f"### User Question:\n{request.question}\n\n"
                    "Provide a detailed answer based on the provided data."
                )
            }
        ]

        # Call the RA function
        response = RA(messages=messages, model=request.model)

        # Return the insights
        return {"insights": response["content"]}

    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Error communicating with RA API: {str(e)}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")

In [None]:
import ollama
context = "".join(context)
question = 'Identify the part that have recurrent quality issues'
envelope = f"You are an intelligent assistant specializing in analyzing non-conformance data. i will provide the context of few non-conformances and related details {context} and a user question {question}. Use the data provided as context to answer the question as accurately as possible. Do not include information that is not found in the provided context Ensure the response is concise and directly answers the question."
messages = [
            {"role": "user", "content": envelope}
        ]
response = RA(messages,'wizardlm2')
print(response['content'])


Based on the data provided, it appears that there are several parts with recurrent quality issues. Here are some of the parts that have been identified with multiple nonconformances:

1. **Extension Bonnet (Part Number 24856)** - There is one instance of a warped bonnet reported for this part in Fiscal Period P02.

2. **Ball Valve (Part Number 39B1451X032)** - There is one instance of a part that failed a vacuum test at 15 scfh reported for this part in Fiscal Period P07.

3. **Bonnet Assembly (Part Number GE48548X012)** - There is one instance where the bonnet did not have split pin holes on the sides, which could affect its assembly and function, reported in Fiscal Period P07.

4. **Gasket (Part Number 75B1124X042)** - There is one instance where the gasket was ripped upon arrival, potentially compromising its integrity, reported in Fiscal Period P04.

It's important to note that while there are instances of other parts with single nonconformances, these parts (Extension Bonnet, Ball

In [46]:
#answer = ollama.generate(model='wizardlm2', prompt=envelope)
def find_non_conformances_faiss(topics):
    results = {}

    for topic in topics:
        question = f"{topic}"

        # Embed Question using Ollama
        query_embedding = ollama.embed(model='wizardlm2', input=[question])

        # Semantic Search in LanceDB
        result = vectorizedncs_tbl.search(query_embedding['embeddings'][0]).limit(30).to_list()
        
        for r in result:
            print(r["nc"])

        # Extract non-conformance contexts
        context = [r["nc"] for r in result]
        results[topic] = context

    return {"results": results}
topics = ['Sherman']
def find_non_conformances(topics):
    results = {}

    for topic in topics:
        question = f"{topic}"
        threshold = 0.5
        # Embed Question using Ollama
        query_embedding = ollama.embed(model='wizardlm2', input=[question])

        # Semantic Search in LanceDB
        #result = vectorizedncs_tbl.search(query_embedding['embeddings'][0], query_type="hybrid").limit(30).to_list()
        #result = vectorizedncs_tbl.search(topic, query_type="hybrid").limit(30).to_list()
        results = vectorizedncs_tbl.search(query_type="hybrid").vector(query_embedding['embeddings'][0]).text(topic).limit(5).to_pandas()
        #filtered_results = [result for result in results if result['score'] >= threshold]

        # Check if there are any results left
        # if not filtered_results:
        #     print("No results found with sufficient similarity.")
        # else:
        #     for result in filtered_results:
        #         print(f"ID: {result['id']}, Score: {result['score']}")
        # # for r in result:
        # #     print(r["nc"])
        display(result)
        # Extract non-conformance contexts
        # context = [r["nc"] for r in result]
        # results[topic] = context

    return {"results": result}
s = find_non_conformances(topics=topics)
s
#print((s))


Pandas(Index=0, nc="SupplierName: INSPECTECH CORP, NonconformanceNumber: NC000475110, Disposition: SCRAP, FiscalPeriod: P02, DispositionQty: 1, PartNumber: N902882Q32972, PartName: BONNET, NonconformanceDescription: film for rt was received from inspectech on july 31 2023 and was rejected job orders since oracle have all the reviews occurring at the close of the order instead of real time so we were not made aware of this fact until the film was reviewed on oct 18 2023 the result was losing 2 ½ months of recovery time r westberg, PONumber: 4290000937, Pattern: , UOM: , NONCONFORMANCE_SOURCE: SUPPLIER, SiteNo: Couldn't find SiteNo for: mansfield, SiteName: mansfield", vector=array([ 0.01265615,  0.01580922,  0.00835892, ..., -0.01512109,
        0.00893816, -0.01708273], dtype=float32), NonconformanceNumber='NC000475110', PartNumber='N902882Q32972', _5=1.5132081508636475)

{'results': Pandas(Index=0, nc="SupplierName: INSPECTECH CORP, NonconformanceNumber: NC000475110, Disposition: SCRAP, FiscalPeriod: P02, DispositionQty: 1, PartNumber: N902882Q32972, PartName: BONNET, NonconformanceDescription: film for rt was received from inspectech on july 31 2023 and was rejected job orders since oracle have all the reviews occurring at the close of the order instead of real time so we were not made aware of this fact until the film was reviewed on oct 18 2023 the result was losing 2 ½ months of recovery time r westberg, PONumber: 4290000937, Pattern: , UOM: , NONCONFORMANCE_SOURCE: SUPPLIER, SiteNo: Couldn't find SiteNo for: mansfield, SiteName: mansfield", vector=array([ 0.01265615,  0.01580922,  0.00835892, ..., -0.01512109,
         0.00893816, -0.01708273], dtype=float32), NonconformanceNumber='NC000475110', PartNumber='N902882Q32972', _5=1.5132081508636475)}

In [54]:
import ollama
topics = ['Sherman']
query_embedding = ollama.embed(model='llama3.2', input=[topics])
# Semantic Search in LanceDB
result = vectorizedncs_tbl.search(query_embedding['embeddings'][0], query_type="hybrid").limit(30).to_list()
#result = vectorizedncs_tbl.search(topic, query_type="hybrid").limit(30).to_list()
#results = vectorizedncs_tbl.search(query_embedding['embeddings'][0]).limit(5).to_pandas()
#filtered_results = [res

ResponseError: invalid input type

In [28]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Example data
texts = [
    "Weld defect on assembly",
    "Crack found during inspection",
    "Paint peeling off surface",
]
metadata = [
    {"NonconformanceNumber": "NC001", "PartNumber": "P123"},
    {"NonconformanceNumber": "NC002", "PartNumber": "P124"},
    {"NonconformanceNumber": "NC003", "PartNumber": "P125"},
]

# Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts)

# Build FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # Use L2 similarity
index.add(np.array(embeddings))

# Query FAISS with a search string
query_string = "weld issue"
query_vector = model.encode([query_string])
distances, indices = index.search(np.array(query_vector), k=3)

# # Fetch results
# results = [{"text": texts[i], "metadata": metadata[i], "distance": distances[0][idx]} 
#            for idx, i in enumerate(indices[0])]

print(results)

ModuleNotFoundError: No module named 'faiss'

In [55]:
pip install faiss

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Could not find a version that satisfies the requirement faiss (from versions: none)[0m
[31mERROR: No matching distribution found for faiss[0m
You should consider upgrading via the '/Users/admin/source/quality-nir-classification/quality-nir-classification-api/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [47]:
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# Data
topic = ['machine']
sentences = df['nc']
print(sentences)
# Step 1: Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
topic_embedding = model.encode(topic)  # Embedding for the topic
sentence_embeddings = model.encode(sentences)  # Embeddings for sentences

# Step 2: Compute similarity scores with the topic
similarity_scores = cosine_similarity(sentence_embeddings, topic_embedding).flatten()

# Step 3: Apply DBSCAN to cluster sentences
dbscan = DBSCAN(eps=0.5, min_samples=2, metric='cosine')
labels = dbscan.fit_predict(sentence_embeddings)

# Step 4: Combine sentences, clusters, and similarity scores
df = pd.DataFrame({
    'Sentence': sentences,
    'Cluster': labels,
    'Similarity_Score': similarity_scores
})

# Step 5: Rank sentences by similarity score
df = df.sort_values(by='Similarity_Score', ascending=False)

# Display results
print(df['Cluster'])

5301     0
19965    0
24800    0
12274    0
25824    0
        ..
18115    0
23421    0
18099    0
6158     0
18147    0
Name: Cluster, Length: 26017, dtype: int64


In [None]:
print()

[0]


In [27]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.9.3-cp39-cp39-macosx_11_0_arm64.whl (7.8 MB)
[K     |████████████████████████████████| 7.8 MB 5.7 MB/s eta 0:00:01
Collecting importlib-resources>=3.2.0
  Downloading importlib_resources-6.4.5-py3-none-any.whl (36 kB)
Collecting cycler>=0.10
  Downloading cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl (64 kB)
[K     |████████████████████████████████| 64 kB 8.3 MB/s  eta 0:00:01
[?25hCollecting contourpy>=1.0.1
  Downloading contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl (249 kB)
[K     |████████████████████████████████| 249 kB 10.4 MB/s eta 0:00:01
Collecting fonttools>=4.22.0
  Downloading fonttools-4.55.1-cp39-cp39-macosx_10_9_universal2.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 11.2 MB/s eta 0:00:01
[?25hCollecting pyparsing>=2.3.1
  Downloading pyparsing-3.2.0-py3-none-any.whl (106 kB)
[K     |█████████████████████████████

In [None]:
emplate = [
            {"Name": "The task being performed at the time of the incident was MOVING PARTS TO FLATS.", 
             "Task Risk": "Without considering the reported incident, the task of moving parts to flats in a manufacturing or industrial setting ...",
             "Task Risk Score":3,
             "Incident Specific Task Risk":"The risk increased due to the use of a rubber mat that insulated the part from the grounded table, which could have allowed for the buildup of static electricity that led to the shock incident",
             "Incident Specific Task Risk Score":5
             }
        ]

envelope = f"You are a friendly AI assistant who finds information for Safety Analysts, analyze the incident information {context} and provide a insightfull response to task or follow the instruction {question}, You are required to respond strictly in JSON format, use the following template {json.dumps(template)}. Do not include any explanation or text outside of the JSON structure."
messages = [
            {"role": "user", "content": envelope}
        ]
response = chatIM(messages,'wizardlm2')
print(response['content'])

                                         text  topic mapped_topic
0                      AI and ML are amazing.      1      Welding
1       Welding techniques require precision.      0           AI
2                       AI drives automation.      0           AI
3  Flower often taked about stems and fruits.      1      Welding
4  Robotics is a blend of engineering and AI.      0           AI
5           Manual welding is a skillful art.      1      Welding

Discovered Topics:
 {'Topic 0': ['ai', 'blend', 'robotics', 'engineering', 'welding', 'require', 'techniques', 'precision', 'automation', 'drives'], 'Topic 1': ['welding', 'taked', 'stems', 'fruits', 'flower', 'manual', 'skillful', 'art', 'ml', 'amazing']}


In [109]:
pip install -U scikit-learn

162894.95s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.3/30.3 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.5.2 scipy-1.13.1 threadpoolctl-3.5.0
Note: you may need to restart th

In [98]:
query_embedding['embeddings']

[[-0.007864972,
  0.0016135521,
  0.0063102017,
  -0.024968889,
  -0.003293668,
  -0.022480369,
  0.0079957405,
  0.0040222094,
  -0.010380972,
  -0.011101491,
  0.0056286617,
  -0.004760098,
  -0.009561998,
  -0.012893186,
  0.0055170003,
  -0.0073762746,
  0.0017356029,
  0.0044234134,
  0.023521222,
  0.011339542,
  0.010129749,
  -0.0070945155,
  0.0054051066,
  0.028759118,
  -0.0024181104,
  -0.007204299,
  0.0060696374,
  -0.010372871,
  0.0010533204,
  -0.00065989286,
  0.0032208099,
  -0.013104338,
  -0.0110666985,
  -0.0020607212,
  -0.01586714,
  0.0083377985,
  0.018017737,
  -0.003999748,
  -0.022346532,
  0.003625331,
  0.0024710556,
  -0.01402569,
  -0.007231898,
  -0.0019518626,
  -0.00772308,
  0.015775044,
  0.00043321765,
  -0.006389304,
  -0.027006213,
  0.009994584,
  -0.022792445,
  -0.0023876666,
  -0.007262112,
  -0.2176392,
  -0.012393333,
  -0.007211056,
  -0.00014601661,
  0.00996005,
  -0.0034116146,
  0.0029048338,
  0.009210053,
  0.009414062,
  -0.0018868

In [4]:
import lancedb
uri = "data/ncs-lancedb"
db = lancedb.connect(uri)
hitlfeedback_table = db.open_table("HITL")
print("LanceDB Table Schema:", hitlfeedback_table.schema)

LanceDB Table Schema: id: string
content: string
rating: int32
comment: string
timestamp: string
vector: list<item: float>
  child 0, item: float


In [9]:
arrow_table = hitlfeedback_table.to_arrow()
print(arrow_table.schema)
arrow_table = hitlfeedback_table.to_arrow()
display(arrow_table.to_pandas())
print("Schema:", arrow_table.schema)

id: string
content: string
rating: int32
comment: string
timestamp: string
vector: list<item: float>
  child 0, item: float


Unnamed: 0,id,content,rating,comment,timestamp,vector
0,eea0fbfb-ceb1-4c2e-ac3f-df95b5b60639,"SupplierName: , NonconformanceNumber: NC000503...",5,,2024-11-27T16:50:55.675596,[0.0]
1,44f3801c-c5ac-4c41-91f1-38c7d9460b62,"Based on the provided data, which includes var...",4,,2024-11-27T16:55:07.568944,[0.0]
2,2d2ca04c-a730-48d8-b82c-6462904b811f,"SupplierName: , NonconformanceNumber: NC000508...",4,,2024-11-27T17:15:53.232386,[0.0]
3,01259241-4db9-4a40-878c-3f9576dca8c5,"SupplierName: , NonconformanceNumber: NC000503...",5,,2024-11-27T17:36:53.099610,[0.0]


Schema: id: string
content: string
rating: int32
comment: string
timestamp: string
vector: list<item: float>
  child 0, item: float


In [10]:
# Convert PyArrow rows to dictionaries
feedback_list = []
for row in arrow_table:
    feedback_dict = {col: row[col].as_py() if hasattr(row[col], "as_py") else row[col] for col in arrow_table.column_names}
    feedback_list.append(feedback_dict)


TypeError: 'str' object cannot be interpreted as an integer