In [50]:
from dotenv import load_dotenv
load_dotenv()

True

In [51]:
import nest_asyncio
nest_asyncio.apply()

In [52]:
from typing import List
from pydantic import BaseModel
import os
import json
import csv

from llama_parse import LlamaParse
from llama_index.core.schema import Document
from llama_index.llms.openai import OpenAI
import nest_asyncio

nest_asyncio.apply()

In [53]:
## Parse the Form Filing File
def parse_file(file_path: str) -> List[Document]:
    llama_parse = LlamaParse(
        api_key=os.environ['LLAMA_CLOUD_API_KEY'],
        result_type='markdown',
        target_pages="0"
    )
    result = llama_parse.load_data(
        file_path,
    )
    return result

In [54]:
documents = parse_file('data/Report_format_2.xlsx')

Started parsing the file under job_id 49b0f4f9-bb8a-441f-9e23-c301d2672e71


In [55]:
print(f'Doc length: {type(documents)}')
print(len(documents))

Doc length: <class 'list'>
1


In [56]:
print(f'Doc length: {len(documents)}')
print(documents[0].text)

Doc length: 1
|Items to be improved|Description|Improvement Parameters| | | | | | | | |
|---|---|---|---|---|---|---|---|---|---|---|
| | |Improvement direction|Person responsible|Expected start date|Actual start date|Expected completion date|Actual completion date|Improve immediately|Confirmation|appendix|
|The saturation of the lower glass point Xiaoli Pill is 65%|Low job saturation(lower than95%)| | | | | | | | | |
|Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%|Low job saturation(lower than95%)| | | | | | | | | |



In [57]:
text = documents[0].text

In [58]:
## Structured Extraction
prompt = f"""
You are an AI assistant specializing in Industrial Engineering problem solving. You've been given an Excel spreadsheet containing items to be improved. \ 
and improvement parameters. Your task is to extract and structure this information in a clear, organized format.

The Excel sheet contains the following:
1. Items to be improved (rows)
2. Description (columns)
3. Improvement parameters and dates (columns)

Input Excel data:
{text}

Please present the extracted and structured information in a clear, easy-to-read format.
"""


In [14]:
class ReportParameters(BaseModel):
    """Data model for IE problem solving analysis."""
    ItemsToBeImproved: List[str]
    Description: List[str]
    ImprovementParameters: List[str]
    # ImprovementPerson: List[str]
    # StartDate: List[str]
    # CompletionDate: List[str]
    # ImproveImmediately: List[bool]
    # Confirmation: List[str]
    # Appendix: List[str]

In [60]:
from llama_index.core.llms import ChatMessage
llm = OpenAI(model='gpt-4o-mini')
input_msg = ChatMessage.from_str('What is your name?')
output = llm.chat([input_msg])
output.message.content

'I’m called ChatGPT. How can I assist you today?'

In [15]:
from llama_index.core.llms import ChatMessage
sllm = llm.as_structured_llm(output_cls=ReportParameters)
input_msg = ChatMessage.from_str(prompt)

In [16]:
output = sllm.chat([input_msg])
output_obj = output.raw
output_obj

ReportParameters(ItemsToBeImproved=['The saturation of the lower glass point Xiaoli Pill is 65%', 'Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%'], Description=['Low job saturation(lower than95%)', 'Low job saturation(lower than95%)'], ImprovementParameters=['Improvement direction', 'Person responsible', 'Expected start date', 'Actual start date', 'Expected completion date', 'Actual completion date', 'Improve immediately', 'Confirmation', 'appendix'])

In [64]:
#print out the items based on the data model
print(len(output_obj.ItemsToBeImproved))
print(len(output_obj.Description))
print(len(output_obj.ImprovementParameters))
output_obj.ImprovementParameters

NameError: name 'output_obj' is not defined

In [18]:
ItemsToBeImproved = output_obj.ItemsToBeImproved
Description = output_obj.Description
ImprovementParameters = output_obj.ImprovementParameters[1:]
# print(output_obj.ImprovementDirection)
# print(output_obj.BriefDescription)
# print(output_obj.ImprovementPerson)
# print(output_obj.StartDate)
# print(output_obj.CompletionDate)
# print(output_obj.ImproveImmediately)
# print(output_obj.Confirmation)
# print(output_obj.Appendix)

In [28]:
ItemsToBeImproved
Description
ImprovementParameters

['Person responsible',
 'Expected start date',
 'Actual start date',
 'Expected completion date',
 'Actual completion date',
 'Improve immediately',
 'Confirmation',
 'appendix']

In [65]:
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
# pip install llama-index-indices-managed-llama-cloud
index = LlamaCloudIndex(
  name="objective-wildfowl-2024-11-24", 
  project_name="Default",
  organization_id="2033a7fc-187e-48e4-a172-5079c4ee2bbf",
  api_key=os.environ['LLAMA_CLOUD_API_KEY']
)

In [66]:
query_engine = index.as_query_engine(
    dense_similarity_top_k=10,
    sparse_similarity_top_k=10,
    alpha=0.5,
    enable_reranking=True,
    rerank_top_n=5,
)

In [None]:
# print(output_obj.ItemsToBeImproved)
# print(output_obj.BriefDescription)
# print(output_obj.ImprovementDirection)
# print(output_obj.ImprovementPerson)
# print(output_obj.StartDate)
# print(output_obj.CompletionDate)
# print(output_obj.ImproveImmediately)
# print(output_obj.Confirmation)
# print(output_obj.Appendix)

In [29]:
# ItemsToBeImproved = output_obj.ItemsToBeImproved
# BriefDescription = output_obj.BriefDescription
# ImprovementDirection = output_obj.ImprovementDirection
# ImprovementPerson = output_obj.ImprovementPerson
# StartDate = output_obj.StartDate
# CompletionDate = output_obj.CompletionDate
# ImproveImmediately = output_obj.ImproveImmediately
# Confirmation = output_obj.Confirmation
# Appendix = output_obj.Appendix
ItemsToBeImproved

['The saturation of the lower glass point Xiaoli Pill is 65%',
 'Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%']

In [None]:
['The saturation of the lower glass point Xiaoli Pill is 65%',
 'Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%']

In [None]:
# sllm.chat(['What is your name?'])
Description = ['Low job saturation(higher than95%)',
 'Low job saturation(higher than99%)',
 'Low job saturation(lower than90%)']
Description

In [None]:
# sllm.chat(['What is your name?'])
Description = ['Low job saturation(higher than95%)',
 'Low job saturation(higher than99%)',
 'Low job saturation(lower than90%)']

In [None]:
#Generate query and test if it generating them correctly
from tqdm import tqdm
items_to_be_improved = {}

for index, item in enumerate(ItemsToBeImproved[:3]):
    text_info = Description[index]
    job_saturation = (
    f"Analyze the following text and determine whether the job saturation value mentioned is lower than 95%.\n\n"
    f"Text: \"{text_info}\"\n\n"
    f"If a job saturation value is explicitly mentioned, check if it is lower than 95%. If so, respond with 'YES'. "
    f"If it is 95% or higher, respond with 'NO'. If no job saturation value is mentioned, respond with 'NO INFORMATION'."
)
    input_msg = ChatMessage.from_str(job_saturation)
    output = llm.chat([input_msg])
    answer = output.message.content
    
    print(index,',',text_info,',',answer)
    
    # for (index, parameter) in enumerate(ImprovementParameters):
    #     des = Description[index]
    #     job_saturation = f"Is the job saturation value lower than 95% for:'{des}'? Your response should be boolean. Either YES or NO?"
    #     # query = f"What is the '{parameter}' for '{item}'? If you don't know the answer then say 'NA'"
    #     answer = str(query_engine.query(job_saturation))
    #     # items_to_be_improved[item][parameter] = answer
    #     print(des,',',answer)
    #     index +=1

In [None]:
    ItemsToBeImproved: List[str]
    Description: List[str]
    ImprovementParameters: List[str]

In [None]:
# Your data
data_by_row = [
    {
        'Items to be improved': 'Item one',
        'Description': 'one des',
        'Improvement direction': '',
        'Person responsible': '',
        'Expected start date': '',
        'Actual start date': '',
        'Expected completion date': '',
        'Actual completion date': '',
        'Improve immediately': '',
        'Confirmation': '',
        'appendix': ''
    },
    {
        'Items to be improved': 'Item two',
        'Description': 'two des',
        'Improvement direction': None,
        'Person responsible': None,
        'Expected start date': None,
        'Actual start date': None,
        'Expected completion date': None,
        'Actual completion date': None,
        'Improve immediately': None,
        'Confirmation': None,
        'appendix': None
    }
]
# Extract 'Items to be improved' as a list
ItemsToBeImproved = [entry['Items to be improved'] for entry in data_by_row]
Description = [entry['Description'] for entry in data_by_row]
ImprovementParameters = list(data_by_row[0].keys())[2:]
# new_improvement_directions = generate_answers(ItemsToBeImproved, Description, ImprovementParameters)
print(ItemsToBeImproved, Description, ImprovementParameters)

['Item one', 'Item two'] ['one des', 'two des'] ['Improvement direction', 'Person responsible', 'Expected start date', 'Actual start date', 'Expected completion date', 'Actual completion date', 'Improve immediately', 'Confirmation', 'appendix']


In [12]:
def input_to_ai(data_by_row):
    ItemsToBeImproved = [entry['Items to be improved'] for entry in data_by_row]
    Description = [entry['Description'] for entry in data_by_row]
    ImprovementParameters = [list(data_by_row[0].keys())[2]]
    return ItemsToBeImproved, Description, ImprovementParameters
ItemsToBeImproved, Description, ImprovementParameters = input_to_ai(data_by_row)
print(ItemsToBeImproved, Description, ImprovementParameters)


['Item one', 'Item two'] ['one des', 'two des'] ['Improvement direction']


In [10]:
ImprovementParameters = [list(data_by_row[0].keys())[2]]
ImprovementParameters

['Improvement direction']

In [None]:
def create_report_output_content(data_by_row, report_file_name: str, ai_responses: List[str]) -> List[dict]:
    # Update the dictionaries
    for i, entry in enumerate(data_by_row):
        if i < len(ai_responses):
            entry['Improvement direction'] = ai_responses[i]
    return data_by_row

dhon = create_report_output_content(data_by_row, report_file_name: str, ai_responses: List[str]) -> List[dict]:
dhon

[{'Items to be improved': 'Item one',
  'Description': 'one des',
  'Improvement direction': 'New Direction 1',
  'Person responsible': '',
  'Expected start date': '',
  'Actual start date': '',
  'Expected completion date': '',
  'Actual completion date': '',
  'Improve immediately': '',
  'Confirmation': '',
  'appendix': ''},
 {'Items to be improved': 'Item two',
  'Description': 'two des',
  'Improvement direction': 'New Direction 2',
  'Person responsible': None,
  'Expected start date': None,
  'Actual start date': None,
  'Expected completion date': None,
  'Actual completion date': None,
  'Improve immediately': None,
  'Confirmation': None,
  'appendix': None}]

In [None]:
text_info = 'My country name is Bangladsh. I do not know the value of job saturation at this point of time!'
# text_info = 'low job saturation (lower than 99%)'
# text_info = 'Simultaneous working time of man and machine = manual time (39S) + automatic time (2S) - process standard C/T (41S) = 0S Time of man waiting for machine = automatic time (2S) - simultaneous working time of man and machine (0S)'
job_saturation = (
    f"Analyze the following text and determine whether the job saturation value mentioned is lower than 95%.\n\n"
    f"Text: \"{text_info}\"\n\n"
    f"If a job saturation value is explicitly mentioned, check if it is lower than 95%. If so, respond with 'YES'. "
    f"If it is 95% or higher, respond with 'NO'. If no job saturation value is mentioned, respond with 'NO INFORMATION'."
)
input_msg = ChatMessage.from_str(job_saturation)
output = llm.chat([input_msg])
answer = output.message.content

print(answer)

In [54]:
parameter = ['india', 'pakistan', 'bangaldesh']
game = ['cricket', 'football', 'bangaldesh']
items_to_be_improved = {}
for index, item in enumerate(ItemsToBeImproved):
    items_to_be_improved[item] = {}
    for country in parameter:
        items_to_be_improved[item]['country'] = parameter[index]
        items_to_be_improved[item]['game'] = game[index]
print(items_to_be_improved)

{'The saturation of the lower glass point Xiaoli Pill is 65%': {'country': 'india', 'game': 'cricket'}, 'Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%': {'country': 'pakistan', 'game': 'football'}}


In [66]:
ImprovementParameters

['Param_1', 'Param_2', 'Param_3']

In [51]:
import tqdm
# ItemsToBeImproved: List[str], Description,ImprovementParameters: List[str]) -> List[str]:
items_to_be_improved = {}
for index, item in enumerate(ItemsToBeImproved):
    items_to_be_improved[item] = {}
    for parameter in tqdm(ImprovementParameters):
        query = (
            f"For the problem '{item}', provide a detailed and concise value or description for the improvement parameter '{parameter}'.\n"
            f"If the parameter is not applicable or no information is available, respond explicitly with 'NA'.\n\n"
            f"Ensure your response is clear, contextually relevant, and avoids ambiguity."
            )
        # answer = str(query_engine.query(query))
        # answer = str(query_engine.query(query))
        items_to_be_improved[item]['Description'] = Description[index]
        items_to_be_improved[item][parameter] = '1'
items_to_be_improved        

TypeError: 'module' object is not callable

In [None]:
job_saturation = (
    f"Analyze the following text and determine whether the job saturation value mentioned is lower than 95%.\n\n"
    f"Text: \"{text_info}\"\n\n"
    f"If a job saturation value is explicitly mentioned, check if it is lower than 95%. If so, respond with 'YES'. "
    f"If it is 95% or higher, respond with 'NO'. If no job saturation value is mentioned, respond with 'NO INFORMATION'."
)
input_msg = ChatMessage.from_str(job_saturation)
output = llm.chat([input_msg])
answer = output.message.content

In [None]:
Description = ['Low job saturation(higher than95%)',
 'Low job saturation(lower than95%)',
 'Low job saturation(lower than95%)']

In [None]:

[{'Items to be improved': 'Item one', 'Description': 'one des', 'Improvement direction': '', 'Person responsible': '', 
  'Expected start date': '', 'Actual start date': '', 'Expected completion date': '', 'Actual completion date': '', 
  'Improve immediately': '', 'Confirmation': '', 'appendix': ''}, 
  {'Items to be improved': 'Item two', 'Description': 'two des', 'Improvement direction': None, 'Person responsible': None, 'Expected start date': None, 'Actual start date': None, 'Expected completion date': None, 'Actual completion date': None, 'Improve immediately': None, 'Confirmation': None, 'appendix': None}]

In [None]:
from tqdm import tqdm
def generate_answers(ItemsToBeImproved: List[str], Description: List[str], ImprovementParameters: List[str]) -> List[str]:
    items_to_be_improved = {}
    for index, item in enumerate(ItemsToBeImproved):
        items_to_be_improved[item] = {}
        for parameter in tqdm(ImprovementParameters):
            job_saturation = (
                f"Analyze the following text and determine whether the job saturation value mentioned is lower than 95%.\n\n"
                f"Text: \"{Description[index]}\"\n\n"
                f"If a job saturation value is explicitly mentioned, check if it is lower than 95%. If so, respond with 'YES'. "
                f"If it is 95% or higher, respond with 'NO'. If no job saturation value is mentioned, respond with 'NO INFORMATION'."
            )
            input_msg = ChatMessage.from_str(job_saturation)
            output = llm.chat([input_msg])
            answer = output.message.content
            print(Description[index], answer)
            if answer == 'YES':
                query = (
                    f"For the problem '{item}', provide a detailed and concise value or description for the improvement parameter '{parameter}'.\n"
                    f"If the parameter is not applicable or no information is available, respond explicitly with 'NA'.\n\n"
                    f"Ensure your response is clear, contextually relevant, and avoids ambiguity."
                    )
                answer = str(query_engine.query(query))
                items_to_be_improved[item]['Description'] = Description[index]
                items_to_be_improved[item][parameter] = answer
    return items_to_be_improved

In [17]:
import random
word_list = ["apple", "banana", "cherry", "date", "elderberry", "fig", "grape", "honeydew", "kiwi", "lemon", "mango", "nectarine"]
def get_random_words(word_list, num_words=8):
    answer = random.sample(word_list, num_words)
    return " ".join(answer)
response_1 = get_random_words(word_list)
response_1

'honeydew kiwi nectarine grape banana mango fig date'

In [19]:
def input_to_ai(data_by_row):
    ItemsToBeImproved = [entry['Items to be improved'] for entry in data_by_row]
    Description = [entry['Description'] for entry in data_by_row]
    ImprovementParameters = [list(data_by_row[0].keys())[2]]
    return ItemsToBeImproved, Description, ImprovementParameters
ItemsToBeImproved, Description, ImprovementParameters = input_to_ai(data_by_row)
print(ItemsToBeImproved, Description, ImprovementParameters)

['Item one', 'Item two'] ['one des', 'two des'] ['Improvement direction']


In [None]:
from tqdm import tqdm
ItemsToBeImproved = ['Item_One', 'Item_Two']
Description = ['Des_1', 'Des_2']
ImprovementParameters = ['Param_1', 'Param_2']

def generate_answers(ItemsToBeImproved: List[str], Description: List[str], ImprovementParameters: List[str]) -> List[str]:
    for index, item in enumerate(ItemsToBeImproved):
        
        for parameter in ImprovementParameters:
            # job_saturation = (
            #     f"Analyze the following text and determine whether the job saturation value mentioned is lower than 95%.\n\n"
            #     f"Text: \"{Description[index]}\"\n\n"
            #     f"If a job saturation value is explicitly mentioned, check if it is lower than 95%. If so, respond with 'YES'. "
            #     f"If it is 95% or higher, respond with 'NO'. If no job saturation value is mentioned, respond with 'NO INFORMATION'."
            # )
            # input_msg = ChatMessage.from_str(job_saturation)
            # output = llm.chat([input_msg])
            answer = get_random_words(word_list)
            responses.append((item, Description[index], answer))

            # print(Description[index], answer)
            # if answer == 'YES':
            #     query = (
            #         f"For the problem '{item}', provide a detailed and concise value or description for the improvement parameter '{parameter}'.\n"
            #         f"If the parameter is not applicable or no information is available, respond explicitly with 'NA'.\n\n"
            #         f"Ensure your response is clear, contextually relevant, and avoids ambiguity."
            #         )
            #     answer = str(query_engine.query(query))
            # items_to_be_improved[item]['Description'] = Description[index]
            # items_to_be_improved[item][parameter] = answer[index]

    return responses

In [40]:
responses = generate_answers(ItemsToBeImproved, Description, ImprovementParameters)
responses

[('Item_Two', 'Des_2', 'honeydew fig mango lemon elderberry grape date apple'),
 ('Item_Two', 'Des_2', 'fig date grape kiwi lemon nectarine mango banana')]

In [None]:
from tqdm import tqdm
ItemsToBeImproved = ['The saturation of the lower glass point Xiaoli Pill is 65%',
 'Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%']

Description = ['Low job saturation(lower than95%)',
 'Low job saturation(lower than95%)']
ImprovementParameters = ['improvement direction']

def generate_answers(ItemsToBeImproved: List[str], Description: List[str], ImprovementParameters: List[str]) -> List[str]:
    responses = []
    for item, des in zip(ItemsToBeImproved, Description):
            job_saturation = (
                f"Analyze the following text and determine whether the job saturation value mentioned is lower than 95%.\n\n"
                f"Text: \"{des}\"\n\n"
                f"If a job saturation value is explicitly mentioned, check if it is lower than 95%. If so, respond with 'YES'. "
                f"If it is 95% or higher, respond with 'NO'. If no job saturation value is mentioned, respond with 'NO INFORMATION'."
            )
            input_msg = ChatMessage.from_str(job_saturation)
            output = llm.chat([input_msg])
            con_check = output.message.content
            if con_check == 'YES':
                for param in ImprovementParameters:
                    query = (
                    f"For the problem '{item}', provide a detailed and concise value or description for the improvement parameter '{param}'.\n"
                    f"If the parameter is not applicable or no information is available, respond explicitly with 'NA'.\n\n"
                    f"Ensure your response is clear, contextually relevant, and avoids ambiguity."
                    )
                    answer = str(query_engine.query(query))
                    responses.append(answer)
    return responses
responses = generate_answers(ItemsToBeImproved, Description, ImprovementParameters)
responses


[('The saturation of the lower glass point Xiaoli Pill is 65%',
  'Low job saturation(lower than95%)',
  'Judgment of value and reduce tasks without added value, Inspection of movement quality and human engineering hazards, Automated level inspection, Merge and rearrange new job elements.'),
 ('Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%',
  'Low job saturation(lower than95%)',
  "The improvement direction for the problem 'Bottom glass electrophoresis tank+UVFixed baking operation saturation79.2%' is to reduce the number of movements, work with both hands at the same time, shorten the distance of movements, and make movements easier; eliminate human engineering hazards.")]

In [None]:
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
flat_data = []
for parameter, metrics in answers.items():
    flat_metrics = flatten_dict(metrics)
    flat_data.append(flat_metrics)

In [None]:
# flat_data

In [None]:
# answers = json.loads("""
# {'The saturation of the lower glass point Xiaoli Pill is 65%': 'Judgment of value and reduce tasks without added value, Inspection of movement quality and human engineering hazards: Reduce the number of movements, work with both hands at the same time, shorten the distance of movements, and make movements easier; eliminate human engineering hazards, Automated level inspection: simple and automated import, Merge and rearrange new job elements.',
#  'Bottom glass electrophoresis tank+UVFixed baking operation saturation 79.2%': 'The improvement direction for Bottom glass electrophoresis tank+UVFixed baking operation saturation 79.2% is to reduce tasks without added value, inspect movement quality and human engineering hazards, conduct automated level inspection, and merge and rearrange new job elements.',
#  'Xiaoliwan wax+Paste conductive foam+Lower glass glue frame dispensing operation saturation75%': 'The improvement direction for Xiaoliwan wax+Paste conductive foam+Lower glass glue frame dispensing operation saturation75% is to reduce the number of movements, work with both hands at the same time, shorten the distance of movements, and make movements easier; eliminate human engineering hazards.'}
# """)

In [None]:
flat_data

In [None]:
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# Flatten the nested dictionary
flat_data = []
for parameter, metrics in answers.items():
    flat_metrics = flatten_dict(metrics)
    flat_metrics['Items to be improved'] = parameter 
    flat_data.append(flat_metrics)

# Get all unique keys to use as CSV headers
headers = set()
for item in flat_data:
    headers.update(item.keys())

# Sort headers to ensure 'Items to be improved' comes first
headers = sorted(headers)
headers.insert(0, headers.pop(headers.index('Items to be improved')))
headers.insert(1, headers.pop(headers.index('Description')))
headers.insert(2, headers.pop(headers.index('Improvement direction')))
headers.insert(3, headers.pop(headers.index('Person responsible')))
headers.insert(4, headers.pop(headers.index('Expected start date')))
headers.insert(5, headers.pop(headers.index('Actual start date')))
headers.insert(6, headers.pop(headers.index('Expected completion date')))
headers.insert(7, headers.pop(headers.index('Actual completion date')))
headers.insert(8, headers.pop(headers.index('Improve immediately')))
headers.insert(9, headers.pop(headers.index('Confirmation')))
headers.insert(10, headers.pop(headers.index('appendix')))

# Write to CSV
with open('Report_format_2_complete.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    for row in flat_data:
        writer.writerow(row)

In [None]:
import pandas as pd
from IPython.core.display import HTML
pd.set_option('display.max_colwidth', 10)
out_df = pd.read_csv("Report_format_2_complete.csv")
html = out_df.to_html()
HTML(html)

In [None]:
out_df.to_excel("Report_format_2_complete.xlsx", index=False)

In [None]:
#Creating new duration column for plotting the Gantt Chart

In [None]:
import os
import pandas as pd
file_path = '../llama_parse_al/Report_format_2_complete.csv'
df = pd.read_csv(file_path)
df.head()

In [None]:
df['duration'] = pd.to_datetime(df['Actual completion date']) - pd.to_datetime(df['Actual start date'])

In [None]:
df.head()

In [None]:
# Position a column to a specified place in pandas
columns = list(df.columns)
columns.remove('duration')
insert_at = columns.index('Actual completion date') + 1
columns.insert(insert_at, 'duration')
df = df[columns]

In [None]:
# Save the excel file after formatting is done
df.to_excel('Report_format_2_complete_input.xlsx', index=False)

In [None]:
df.head()

In [None]:
import os
file_path = '../llama_parse_al/Report_format_2_complete_input.xlsx'
os.path.exists(file_path)

In [None]:
from openpyxl import load_workbook
from openpyxl.chart import BarChart, Reference
from openpyxl.chart.label import DataLabelList
# Step 1: Load the Excel file
# file_path = "data.xlsx"  # Replace with your Excel file path
workbook = load_workbook(file_path)
sheet = workbook.active  # Use the active sheet (or specify by name: workbook['SheetName'])

# Step 2: Identify the data range (e.g., A1:B5)
# Assuming the first column has categories (e.g., tasks) and the second has values (e.g., counts)
categories = Reference(sheet, min_col=1, min_row=2, max_row=sheet.max_row)  # Task names
values = Reference(sheet, min_col=9, min_row=1, max_row=sheet.max_row)  # Values including header

# Step 3: Create a horizontal bar chart
chart = BarChart()
chart.type = "bar"  # Horizontal bar chart
chart.title = "Timeline Chart"
chart.y_axis.title = "Days"
chart.x_axis.title = "Tasks"
# Add data and categories to the chart
chart.add_data(values, titles_from_data=True)
chart.set_categories(categories)
# Step 5: Add data labels
data_labels = DataLabelList()  # Create a DataLabelList object
data_labels.showVal = True  # Show values on the bars
data_labels.position = 'inBase'
chart.dLbls = data_labels
chart.legend = None

# Step 4: Add the chart to the worksheet
sheet.add_chart(chart, "M2")  # Place the chart in column D, row 2

# Step 5: Save the updated Excel file
workbook.save("updated_data.xlsx")
print("Chart added to 'updated_data.xlsx'")

In [None]:
chart.dLbls

In [None]:
from openpyxl import load_workbook
workbook = load_workbook("Report_format_2_complete.xlsx")

In [None]:
# Select the active worksheet (or specify the sheet by name)
sheet = workbook.active  # or workbook["SheetName"]

# Insert a new row at the beginning
sheet.insert_rows(1)  # Insert an empty row at the top

# Merge two cells (e.g., A1 and B1)
sheet.merge_cells("E1:F1")

# Write data into the merged cells
sheet["E1"] = "Date"

# Save the workbook
workbook.save("example.xlsx")

print("New row added at the beginning!")


In [None]:
# Try improving the graph to make an interactive Gantt Chart
## Final version of the graph of Gantt Chart

import pandas as pd
import plotly.express as px
from openpyxl import load_workbook
from openpyxl.drawing.image import Image

# Step 1: Read Excel file
file_path = './data/tasks.xlsx'
df = pd.read_excel(file_path)

# Step 2: Ensure columns are in the correct format
df['Start'] = pd.to_datetime(df['Start'])
df['Finish'] = pd.to_datetime(df['Finish'])

# Step 3: Add a column for data labels (e.g., Task names or durations)
df['Label'] =  (df['Finish'] - df['Start']).dt.days.astype(str)
# df['Label'] = df['Task'] + " (" + ((df['Finish'] - df['Start']).dt.days.astype(str)) + " days)"

# Step 4: Create a Gantt chart with data labels
fig = px.timeline(
    df,
    x_start="Start",
    x_end="Finish",
    y="Task",
    color="Resource",
    text="Label",  # Overlay Task names and durations
    title="Gantt Chart with Custom X-Axis"
)

# Step 5: Customize the x-axis
fig.update_traces(textposition="auto", textfont=dict(size=12, color="white"))

fig.update_yaxes(categoryorder="trace")  # Optional: Sort tasks by total ascending
fig.update_layout(
    xaxis_title="Project Timeline (Dates)",  # Custom x-axis label
    title_x=0.5  # Center the chart title
)

# Step 6: Save the chart as an image
image_file = "gantt_chart_with_custom_xaxis.png"
fig.write_image(image_file)  # Requires kaleido library (install using `pip install kaleido`)

# Step 7: Load the Excel file
wb = load_workbook(file_path)
ws = wb.active  # Use the first sheet (or specify a sheet name)

# Step 8: Insert the image into the Excel file
img = Image(image_file)
img.anchor = "E2"  # Position to embed the image (e.g., cell E2)
ws.add_image(img)

# Step 9: Save the updated Excel file
output_file = "tasks_with_custom_xaxis.xlsx"
wb.save(output_file)

print(f"Gantt chart with custom x-axis embedded in {output_file}")

Gantt chart with custom x-axis embedded in tasks_with_custom_xaxis.xlsx
