In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
import nest_asyncio
nest_asyncio.apply()

In [59]:
from typing import List
from pydantic import BaseModel
import os
import json
import csv

from llama_parse import LlamaParse
from llama_index.core.schema import Document
from llama_index.llms.openai import OpenAI
import nest_asyncio

nest_asyncio.apply()

In [94]:
## Parse the Form Filing File
def parse_file(file_path: str) -> List[Document]:
    """Send one file to LlamaParse and return what it parsed.

    Args:
        file_path: Path of the file to hand to ``LlamaParse.load_data``.

    Returns:
        The value returned by ``LlamaParse.load_data`` for ``file_path``,
        with the parser configured for ``'markdown'`` results.

    Raises:
        KeyError: If ``LLAMA_CLOUD_API_KEY`` is missing from the environment.
    """
    parser_options = {
        'api_key': os.environ['LLAMA_CLOUD_API_KEY'],
        'result_type': 'markdown',
    }
    parser = LlamaParse(**parser_options)
    return parser.load_data(file_path)

In [95]:
# Run the Excel report template through parse_file (LlamaParse, markdown output).
documents = parse_file('data/Report_format_2.xlsx')

Started parsing the file under job_id 566ecfd2-f562-4af7-b6b9-b2a969b9caab


In [96]:
# Sanity check on the parse result: container type, then number of documents.
print(type(documents), len(documents), sep='\n')

<class 'list'>
1


In [97]:
# Show how many documents came back, then the text of the first one.
print(f'Doc length: {len(documents)}')
print(documents[0].text)

Doc length: 1
|Items to be Improved                  |Improvement Parameters|                     |                |
|--------------------------------------|----------------------|---------------------|----------------|
|                                      |True Cause            |Improvement Direction|Improvement Plan|
|Product master mold surface inspection|                      |                     |                |
|Product hardware inspection           |                      |                     |                |
|Product wear rod                      |                      |                     |                |



In [98]:
# Keep the first document's text; the extraction prompt interpolates `text`.
text = documents[0].text

In [148]:
## Structured Extraction
# Prompt for the structured-extraction call. This is an f-string, so the
# current value of `text` is embedded when this cell runs.
# The first two sentences are kept on one line: a backslash followed by a space
# is not a line continuation, so it would leave a stray backslash (and an
# invalid escape sequence) inside the prompt.
prompt = f"""
You are an AI assistant specializing in Industrial Engineering problem solving. You've been given an Excel spreadsheet containing items to be improved. Your task is to extract and structure this information in a clear, organized format.

The Excel sheet contains the following:
1. Issues to be improved (rows)
2. Improvement parameters (columns)
3. Various Improvement topic (sub-columns)

Input Excel data:
{text}

Please present the extracted and structured information in a clear, easy-to-read format.
"""

In [149]:
class ReportParameters(BaseModel):
    """Data model for IE problem solving analysis."""
    # NOTE(review): this class is passed as `output_cls` to the structured LLM;
    # presumably the docstring and field names are part of the schema the model
    # sees, so confirm before rewording or renaming them.

    # In the recorded run this held the sheet's row labels
    # (e.g. 'Product hardware inspection').
    ItemsToBeImproved: List[str]
    # In the recorded run this held the sub-column headers
    # ('True Cause', 'Improvement Direction', 'Improvement Plan').
    ImprovementParameters: List[str]

In [150]:
# OpenAI model handle; wrapped into a structured LLM for the extraction step.
llm = OpenAI(model='gpt-4o-mini')

In [151]:
from llama_index.core.llms import ChatMessage
# Structured wrapper around `llm` that targets the ReportParameters model.
sllm = llm.as_structured_llm(output_cls=ReportParameters)
# Package the extraction prompt as a single chat message.
input_msg = ChatMessage.from_str(prompt)

In [152]:
# Single-message chat with the structured LLM.
output = sllm.chat([input_msg])
# `.raw` is the parsed ReportParameters instance (see the repr displayed below).
output_obj = output.raw
output_obj

ReportParameters(ItemsToBeImproved=['Product master mold surface inspection', 'Product hardware inspection', 'Product wear rod'], ImprovementParameters=['True Cause', 'Improvement Direction', 'Improvement Plan'])

In [153]:
# Confirm the parsed type and list both extracted fields.
print(type(output_obj))
print(output_obj.ItemsToBeImproved)
print(output_obj.ImprovementParameters)

<class '__main__.ReportParameters'>
['Product master mold surface inspection', 'Product hardware inspection', 'Product wear rod']
['True Cause', 'Improvement Direction', 'Improvement Plan']


In [154]:
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
# pip install llama-index-indices-managed-llama-cloud
# Handle to a LlamaCloud index identified by name, project and organization.
# NOTE(review): presumably the index was created and populated outside this notebook — confirm.
# NOTE(review): organization_id is hardcoded; consider reading it from the environment like the API key.
index = LlamaCloudIndex(
  name="template-2-2024-11-25", 
  project_name="Default",
  organization_id="2033a7fc-187e-48e4-a172-5079c4ee2bbf",
  api_key=os.environ['LLAMA_CLOUD_API_KEY']
)

In [155]:
# Query engine over the LlamaCloud index; generate_answers() reads this global.
# NOTE(review): the keyword names suggest hybrid dense + sparse retrieval (top 10 each,
# alpha=0.5) followed by reranking down to 5 results — confirm against the LlamaCloud docs.
query_engine = index.as_query_engine(
    dense_similarity_top_k=10,
    sparse_similarity_top_k=10,
    alpha=0.5,
    enable_reranking=True,
    rerank_top_n=5,
)

In [156]:
# Pull the two extracted lists into notebook-level names used by the later cells.
ItemsToBeImproved = output_obj.ItemsToBeImproved
ImprovementParameters = output_obj.ImprovementParameters

In [158]:
# Echo the extracted items first, then the improvement parameters, one per line.
for label in (*ItemsToBeImproved, *ImprovementParameters):
    print(label)


Product master mold surface inspection
Product hardware inspection
Product wear rod
True Cause
Improvement Direction
Improvement Plan


In [None]:
from tqdm import tqdm
# Dry run of the question loop: a query string is built for every
# (parameter, item) pair, but the query_engine call is commented out, so each
# parameter ends up mapped to an empty dict.
items_to_be_improved = {}
for parameter in ImprovementParameters:
    items_to_be_improved[parameter] = {}
    for item in tqdm(ItemsToBeImproved):
        query = f"What is the {parameter} for {item}? If you don't know the answer then say 'NA'"
        # answer = str(query_engine.query(query))
items_to_be_improved

{'True Cause': {}, 'Improvement Direction': {}, 'Improvement Plan': {}}

In [161]:
from typing import Dict
from tqdm import tqdm


def generate_answers(
    ItemsToBeImproved: List[str],
    ImprovementParameters: List[str],
    engine=None,
) -> Dict[str, Dict[str, str]]:
    """Ask the query engine one question per (parameter, item) pair.

    Args:
        ItemsToBeImproved: Item labels to ask about.
        ImprovementParameters: Parameter names; each becomes a top-level key
            of the result.
        engine: Object with a ``query(str)`` method. When omitted, the
            notebook-level ``query_engine`` is used, which matches the
            previous behaviour.

    Returns:
        A nested dict ``{parameter: {item: answer}}`` where each answer is
        ``str()`` of the engine's response. (The earlier ``List[str]``
        annotation did not match the value actually returned.)
    """
    if engine is None:
        # Looked up at call time so the cell can be defined before the engine exists.
        engine = query_engine

    answers_by_parameter: Dict[str, Dict[str, str]] = {}
    for parameter in ImprovementParameters:
        per_item: Dict[str, str] = {}
        # One progress bar per parameter, advancing once per item.
        for problem in tqdm(ItemsToBeImproved):
            query = f"What is the {parameter} for {problem}? If you don't know the answer then say 'NA'"
            per_item[problem] = str(engine.query(query))
        answers_by_parameter[parameter] = per_item
    return answers_by_parameter

In [162]:
# ItemsToBeImproved = output_obj.ItemsToBeImproved
# ImprovementParameters = output_obj.TrueCause

In [163]:
# One query_engine call per (parameter, item) pair; 3 x 3 pairs in the recorded run.
answers = generate_answers(ItemsToBeImproved, ImprovementParameters)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.21s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.42s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.68s/it]


In [164]:
# Inspect the nested result: parameter -> item -> answer text.
answers

{'True Cause': {'Product master mold surface inspection': "Product master mold surface inspection's True Cause is Product discoloration.",
  'Product hardware inspection': 'Visual fatigue during inspection',
  'Product wear rod': 'Visual fatigue during inspection'},
 'Improvement Direction': {'Product master mold surface inspection': 'Optimize inspection method.',
  'Product hardware inspection': 'Cancel discoloration defect check, Cancel unsaturated mold defect check, Check all products before threading the rod, After the product is inserted into the pole, the entire product is inspected together.',
  'Product wear rod': 'Add a jig to replace manual alignment and threading of rods.'},
 'Improvement Plan': {'Product master mold surface inspection': 'Reduce the number of checks and frequency, cancel discoloration defect check and unsaturated mold defect check, optimize inspection method by checking all products before threading the rod and inspecting the entire product after it is inser

In [170]:
def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dict into a single-level dict.

    Keys of nested dicts are joined to their parent key with ``sep``. A key
    with no (or a falsy) parent is kept unchanged. Empty nested dicts
    contribute no entries.

    Args:
        d: Dict to flatten; values that are dicts are flattened recursively.
        parent_key: Prefix applied to every key of ``d``.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dict; ``d`` is not modified.
    """
    flattened = {}
    for key, value in d.items():
        if parent_key:
            full_key = f"{parent_key}{sep}{key}"
        else:
            full_key = key
        if not isinstance(value, dict):
            flattened[full_key] = value
            continue
        flattened.update(flatten_dict(value, full_key, sep=sep))
    return flattened
# One flat row per improvement parameter: the flattened per-item answers plus
# a 'parameter' entry naming the row.
flat_data = [
    {**flatten_dict(metrics), 'parameter': parameter}
    for parameter, metrics in answers.items()
]

In [171]:
# Inspect the flattened rows (one dict per parameter).
flat_data

[{'Product master mold surface inspection': "Product master mold surface inspection's True Cause is Product discoloration.",
  'Product hardware inspection': 'Visual fatigue during inspection',
  'Product wear rod': 'Visual fatigue during inspection',
  'parameter': 'True Cause'},
 {'Product master mold surface inspection': 'Optimize inspection method.',
  'Product hardware inspection': 'Cancel discoloration defect check, Cancel unsaturated mold defect check, Check all products before threading the rod, After the product is inserted into the pole, the entire product is inspected together.',
  'Product wear rod': 'Add a jig to replace manual alignment and threading of rods.',
  'parameter': 'Improvement Direction'},
 {'Product master mold surface inspection': 'Reduce the number of checks and frequency, cancel discoloration defect check and unsaturated mold defect check, optimize inspection method by checking all products before threading the rod and inspecting the entire product after i

In [56]:
# answers = json.loads("""
# {'The saturation of the lower glass point Xiaoli Pill is 65%': 'Judgment of value and reduce tasks without added value, Inspection of movement quality and human engineering hazards: Reduce the number of movements, work with both hands at the same time, shorten the distance of movements, and make movements easier; eliminate human engineering hazards, Automated level inspection: simple and automated import, Merge and rearrange new job elements.',
#  'Bottom glass electrophoresis tank+UVFixed baking operation saturation 79.2%': 'The improvement direction for Bottom glass electrophoresis tank+UVFixed baking operation saturation 79.2% is to reduce tasks without added value, inspect movement quality and human engineering hazards, conduct automated level inspection, and merge and rearrange new job elements.',
#  'Xiaoliwan wax+Paste conductive foam+Lower glass glue frame dispensing operation saturation75%': 'The improvement direction for Xiaoliwan wax+Paste conductive foam+Lower glass glue frame dispensing operation saturation75% is to reduce the number of movements, work with both hands at the same time, shorten the distance of movements, and make movements easier; eliminate human engineering hazards.'}
# """)

In [177]:
def flatten_dict(d, parent_key='', sep='_'):
    """Return a one-level dict built from a possibly nested dict.

    Each leaf value is stored under the path of keys leading to it, joined
    with ``sep``. A key with no (or a falsy) parent is used as-is. Nested
    dicts that are empty produce no entries.

    Args:
        d: Dict to flatten.
        parent_key: Prefix for every key of ``d``.
        sep: String inserted between a parent key and a child key.

    Returns:
        A new flat dict; the input is left untouched.
    """
    def _walk(mapping, prefix):
        # Yield (flattened_key, leaf_value) pairs in traversal order.
        for name, value in mapping.items():
            path = f"{prefix}{sep}{name}" if prefix else name
            if isinstance(value, dict):
                yield from _walk(value, path)
            else:
                yield path, value

    return dict(_walk(d, parent_key))

# Build one flat row per improvement parameter: the per-item answers plus a
# 'parameter' entry naming the row.
flat_data = []
for parameter, metrics in answers.items():
    flat_metrics = flatten_dict(metrics)
    flat_metrics['parameter'] = parameter
    flat_data.append(flat_metrics)

# Collect every item column seen in any row; rows need not share the same keys.
item_columns = set()
for row in flat_data:
    item_columns.update(row.keys())
item_columns.discard('parameter')

# 'parameter' is always the first column, followed by the item columns in
# sorted order. Building the list this way (instead of sorting everything and
# then moving 'parameter') means an empty `answers` produces a header-only CSV
# rather than a ValueError from list.index.
headers = ['parameter'] + sorted(item_columns)

# Write to CSV. Keys missing from a row are written as empty cells
# (csv.DictWriter's default restval).
with open('Report_format_2.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    writer.writerows(flat_data)

In [179]:
import pandas as pd
from IPython.core.display import HTML

# Changes a global pandas display option, not just this cell's output.
pd.set_option('display.max_colwidth', 10)
# Read back Report_format_2.csv and render the frame as an HTML table.
out_df = pd.read_csv("Report_format_2.csv")
html = out_df.to_html()
HTML(html)

Unnamed: 0,parameter,Product hardware inspection,Product master mold surface inspection,Product wear rod
0,True Cause,Visual fatigue during inspection,Product master mold surface inspection's True Cause is Product discoloration.,Visual fatigue during inspection
1,Improvement Direction,"Cancel discoloration defect check, Cancel unsaturated mold defect check, Check all products before threading the rod, After the product is inserted into the pole, the entire product is inspected together.",Optimize inspection method.,Add a jig to replace manual alignment and threading of rods.
2,Improvement Plan,,"Reduce the number of checks and frequency, cancel discoloration defect check and unsaturated mold defect check, optimize inspection method by checking all products before threading the rod and inspecting the entire product after it is inserted into the pole.",Check all products before threading the rod. Add a jig to replace manual alignment and threading of rods.


In [58]:
# Save the loaded table under a second file name, without the index column.
out_df.to_csv("Report_format_complete.csv", index=False)