In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.5-cp310-none-manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.5 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.5 PyMuPDFb-1.23.5


# **Image Extraction From PDF File (PMI)**

In [None]:
import fitz
import os

def extract_images(pdf_path, output_folder):
    """Write every image referenced by the pages of a PDF into ``output_folder``.

    Pages are visited in order. Each entry reported by
    ``page.get_images(full=True)`` is extracted by its xref and saved as
    ``image_<n>.png``, where ``<n>`` is a running counter that starts at 1 and
    spans the whole document.

    The extracted bytes are written unchanged, so the ``.png`` suffix is a
    naming convention only.
    NOTE(review): PyMuPDF reports the real format under ``base_image["ext"]``;
    confirm the payloads are PNG before relying on the suffix.

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
        output_folder: Directory receiving the image files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    pdf_document = fitz.open(pdf_path)
    try:
        image_counter = 1
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)

            for img in page.get_images(full=True):
                # The first field of each image entry is the xref used for extraction.
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_data = base_image["image"]

                image_filename = os.path.join(output_folder, f"image_{image_counter}.png")
                with open(image_filename, "wb") as image_file:
                    image_file.write(image_data)

                image_counter += 1
    finally:
        # Release the document even when a page or image fails to extract.
        pdf_document.close()

# Input PDF to scan for embedded images.
pdf_path = "/content/practice-standard-project-risk-management.pdf"
# Relative destination: resolved against the kernel's current working directory.
output_folder = "output_images"
# Writes image_1.png, image_2.png, ... into output_folder.
extract_images(pdf_path, output_folder)


# **Extract Figure Titles**



In [None]:
import re

def extract_figures_from_txt(txt_file_path):
    """Collect the figure-caption lines found in a text file.

    A line qualifies when it begins (after optional leading whitespace) with
    "Figure" and contains at least one "." anywhere. Only the trailing newline
    is removed; any other surrounding whitespace is kept as-is.

    Args:
        txt_file_path: Path of the text file to scan.

    Returns:
        The qualifying lines, in file order.
    """
    caption_start = re.compile(r'^\s*Figure+')

    with open(txt_file_path, 'r') as source:
        # Strip only the line terminator, mirroring how the lines are stored.
        candidates = (raw.rstrip('\n') for raw in source)
        return [text for text in candidates
                if caption_start.match(text) and "." in text]

# Scan the cleaned chapter text for figure captions
txt_file_path = '/content/pmi_chapters_cleaned.txt'
extracted_figures = extract_figures_from_txt(txt_file_path)

# Destination for the captions, one per line
output_file_path = '/content/extracted_figures.txt'

with open(output_file_path, 'w') as output_file:
    output_file.writelines(figure + '\n' for figure in extracted_figures)

print(f"Extracted figures have been saved to {output_file_path}")


Extracted figures have been saved to /content/extracted_figures.txt


# **Extract Table Titles**

In [None]:
import re

# Regular expressions for lines starting with "Figure" and "Table".
# Both are raw strings so "\s" reaches the regex engine as written instead of
# being parsed as an (invalid) string escape sequence.
figure_pattern = r'^\s*Figure .*$'
table_pattern = r'^\s*[Tt][Aa][Bb][Ll][Ee] .*$'

figures_and_tables = []

with open('/content/pmi_appendix_cleaned.txt', 'r') as file:
    for line in file:
        # Captions marked "(continued)" are skipped.
        if re.search(r'\(continued\)', line):
            continue
        if re.match(figure_pattern, line) or re.match(table_pattern, line):
            figures_and_tables.append(line.strip())

# 'figures_and_tables' now holds the stripped lines starting with "Figure" or
# "Table" (any letter case for "Table") that do not contain "(continued)".

# Writing the contents of figures_and_tables to a text file
output_file_path = '/content/figures_and_tables.txt'

with open(output_file_path, 'w') as output_file:
    for line in figures_and_tables:
        output_file.write(line + '\n')

print(f"Contents of 'figures_and_tables' have been written to {output_file_path}")



Contents of 'figures_and_tables' have been written to /content/figures_and_tables.txt


In [None]:
file_path = '/content/figures_and_tables.txt'  # Replace with your file path

# Text to be added to the file (one entry per line)
text_to_add = "TABLE D4.\nTABLE D5."

# Read what is already there so that re-running this cell does not append the
# same entries a second time.
try:
    with open(file_path, 'r') as file:
        existing_content = file.read()
except FileNotFoundError:
    # Append mode below creates the file, matching the original behaviour.
    existing_content = ""

existing_lines = {line.strip() for line in existing_content.splitlines()}
missing_lines = [line for line in text_to_add.split('\n') if line not in existing_lines]

# Open the file in append mode and write each missing entry on its own line
with open(file_path, 'a') as file:
    # Make sure the new entries do not get glued onto an unterminated last line.
    if missing_lines and existing_content and not existing_content.endswith('\n'):
        file.write('\n')
    for line in missing_lines:
        file.write(line + '\n')

print("Text added to the file.")

Text added to the file.


# **Order List**

In [None]:
# Load the caption lines previously saved to figures_and_tables.txt
file_path = '/content/figures_and_tables.txt'

with open(file_path, 'r') as file:
    lines = list(file)

# Route each line to the figure or table bucket by its (case-insensitive) prefix
figures = []
tables = []
for line in lines:
    prefix_view = line.lower().lstrip()
    if prefix_view.startswith('figure'):
        figures.append(line.strip())
    elif prefix_view.startswith('table'):
        tables.append(line.strip())

# Define a function to extract the numeric part from the figure/table names
def extract_number(text):
    """Return the integer that follows a standalone ``D``/``d`` marker in ``text``.

    The first whole-word token of the form ``D<digits>`` (or ``d<digits>``) is
    used. When no such token exists, ``float('inf')`` is returned so the entry
    sorts after every numbered one.
    """
    marker = re.search(r'\b[Dd]\d+\b', text)
    if marker is None:
        return float('inf')
    # Drop the leading letter and convert the remaining digits.
    return int(marker.group()[1:])

# Order each group by the number extracted from its caption (stable sort)
sorted_figures = list(figures)
sorted_figures.sort(key=extract_number)
sorted_tables = list(tables)
sorted_tables.sort(key=extract_number)

# Figures come first, tables after
ordered_list = [*sorted_figures, *sorted_tables]

# Save the ordered captions, one per line
output_file_path = '/content/ordered_figures_and_tables.txt'

with open(output_file_path, 'w') as output_file:
    output_file.writelines(line + '\n' for line in ordered_list)

print(f"Ordered list of figures and tables has been written to {output_file_path}")


Ordered list of figures and tables has been written to /content/ordered_figures_and_tables.txt


# **Merge**

In [None]:
# Inputs (chapter figures, then ordered appendix captions) and the merged output
figures_file_path = '/content/extracted_figures.txt'
ordered_file_path = '/content/ordered_figures_and_tables.txt'
merged_file_path = '/content/merged_figures_and_tables.txt'

# Load the chapter figure captions
with open(figures_file_path, 'r') as figures_file:
    figures_content = figures_file.read()

# Load the ordered appendix figures and tables
with open(ordered_file_path, 'r') as ordered_file:
    ordered_content = ordered_file.read()

# Chapter figures first, appendix entries right after them
merged_content = ''.join((figures_content, ordered_content))

# Save the combined text
with open(merged_file_path, 'w') as merged_file:
    merged_file.write(merged_content)

print(f"Merged figures and tables have been saved to {merged_file_path}")


Merged figures and tables have been saved to /content/merged_figures_and_tables.txt


# *Renaming*

In [None]:
import os

# Define the paths to the folders and the text file
image_folder = "/content/output_images"  # Replace with your image folder path
text_file_path = "/content/merged_figures_and_tables.txt"  # Replace with your text file path


def _image_index(file_name):
    """Return the number encoded in ``N.ext`` or ``prefix_N.ext``, or None.

    Entries without a numeric index (hidden folders such as
    ``.ipynb_checkpoints``, or files that were already renamed) yield None so
    they can be skipped instead of crashing the sort.
    """
    stem = os.path.splitext(file_name)[0].split('.')[0]
    suffix = stem.rsplit('_', 1)[-1]
    if suffix.isascii() and suffix.isdigit():
        return int(suffix)
    return None


# Read the captions; blank lines are dropped so they cannot produce a ".png" name
with open(text_file_path, "r") as text_file:
    lines = [line.strip() for line in text_file if line.strip()]

# Keep only numbered regular files and order them by their numeric index
numbered_files = []
for entry in os.listdir(image_folder):
    entry_index = _image_index(entry)
    if entry_index is not None and os.path.isfile(os.path.join(image_folder, entry)):
        numbered_files.append((entry_index, entry))
files_in_folder = [entry for _, entry in sorted(numbered_files)]

# Ensure that the number of lines matches the number of files in the folder
if len(lines) != len(files_in_folder):
    print("Error: The number of lines in the text file does not match the number of files in the folder.")
    print("Renaming process cannot be completed.")
else:
    # Perform the renaming process using the lines from the text file
    for image_file_name, caption in zip(files_in_folder, lines):
        # A "/" in a caption would be treated as a directory separator.
        new_image_file_name = f"{caption.replace('/', '-')}.png"
        target_path = os.path.join(image_folder, new_image_file_name)
        if os.path.exists(target_path):
            # os.rename may silently replace an existing file; keep both instead.
            print(f"Skipped {image_file_name}: {new_image_file_name} already exists")
            continue
        os.rename(os.path.join(image_folder, image_file_name), target_path)
        print(f"Renamed {image_file_name} to {new_image_file_name}")

    print("Renaming completed.")


Renamed 1.png to Figure 1-1. Hierarchy of PMI Project Risk Management Resources.png
Renamed 2.png to Figure 1-2. Critical Success Factors for Project Risk Management.png
Renamed 3.png to Figure 3-1. Project Risk Management Process Flow Diagram.png
Renamed 4.png to Figure 4-1. Key Areas of Focus for the Plan Risk Management Process.png
Renamed 5.png to Figure 5-1. Three Perspectives of Risk Identification.png
Renamed 6.png to Figure 5-2. Cause, Risk, and Effect.png
Renamed 7.png to Figure 6 -1. Building Risk Analysis Credibility.png
Renamed 8.png to Figure 6-2. The Perform Qualitative Risk Analysis Process.png
Renamed 9.png to Figure 7-1. Comparison of Qualitative and Quantitative Approaches.png
Renamed 10.png to Figure 7-2. Structure of a Quantitative Risk Analysis.png
Renamed 11.png to Figure 8-1. Critical Success Factors for Risk Response Planning.png
Renamed 12.png to Figure 8-2. The Steps Involved in Planning Risk Responses.png
Renamed 13.png to Figure 9-1. Schematic Representation

# **In Case of Errors**

In [None]:
import os

# Directory path where your images are stored
directory_path = '/content/output_images'  # Replace with your directory path

# List all files in the directory
files = os.listdir(directory_path)

# Count the entries whose name ends with a known image extension (case-insensitive)
image_count = sum(1 for file in files if file.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')))

# Report against directory_path so the message always names the folder that was scanned
print(f'{directory_path} has {image_count} image files.')


/content/output_images has 39 image files.


In [None]:
file_path = '/content/merged_figures_and_tables.txt'  # Replace with your file path

# Read every line and take the length of the resulting list
with open(file_path, 'r') as file:
    line_count = len(file.readlines())

print(f"The file '{file_path}' has {line_count} lines.")


The file '/content/merged_figures_and_tables.txt' has 39 lines.


In [None]:
import os

# Folder holding the extracted images
directory_path = '/content/output_images'  # Replace with your directory path

# Snapshot of the folder's entries
files = os.listdir(directory_path)

# Classify every entry once by its (case-insensitive) file extension
image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')
image_flags = [name.lower().endswith(image_extensions) for name in files]

# Entries whose name starts with a dot (hidden files or directories)
hidden_files = list(filter(lambda name: name.startswith('.'), files))

# Number of entries carrying an image extension
image_count = sum(1 for flag in image_flags if flag)

# Summary
print(f'Total files: {len(files)}')
print(f'Image files (jpg, jpeg, png, gif, bmp, tiff): {image_count}')
print(f'Hidden files or directories: {hidden_files}')

# Per-entry verdict, in directory listing order
for file, flag in zip(files, image_flags):
    if not flag:
        print(f'{file} is not an image file.')
    else:
        print(f'{file} is an image file.')

# Entries without one of the expected image extensions
non_image_files = [name for name, flag in zip(files, image_flags) if not flag]
print(f'Non-image files: {non_image_files}')


Total files: 39
Image files (jpg, jpeg, png, gif, bmp, tiff): 39
Hidden files or directories: []
26.png is an image file.
18.png is an image file.
13.png is an image file.
6.png is an image file.
23.png is an image file.
28.png is an image file.
21.png is an image file.
10.png is an image file.
19.png is an image file.
11.png is an image file.
5.png is an image file.
25.png is an image file.
30.png is an image file.
1.png is an image file.
37.png is an image file.
4.png is an image file.
2.png is an image file.
22.png is an image file.
31.png is an image file.
17.png is an image file.
38.png is an image file.
14.png is an image file.
29.png is an image file.
12.png is an image file.
3.png is an image file.
20.png is an image file.
24.png is an image file.
35.png is an image file.
34.png is an image file.
27.png is an image file.
36.png is an image file.
9.png is an image file.
7.png is an image file.
16.png is an image file.
8.png is an image file.
15.png is an image file.
32.png is an

In [None]:
import os

import shutil

directory_path = '/content/output_images'  # Replace with your directory path

hidden_directory = '.ipynb_checkpoints'
hidden_directory_path = os.path.join(directory_path, hidden_directory)

# Check if the hidden directory exists
if os.path.isdir(hidden_directory_path):
    # os.rmdir only removes empty directories; rmtree also removes any
    # files stored inside the folder.
    shutil.rmtree(hidden_directory_path)
    print(f"Directory '{hidden_directory}' has been deleted.")
else:
    print(f"Directory '{hidden_directory}' does not exist.")


Directory '.ipynb_checkpoints' has been deleted.
