In [None]:
!pip install pypdfium2



### Code for image enhancement

In [None]:
import os
import pypdfium2 as pdfium
from PIL import Image, ImageEnhance
import cv2
import numpy as np
from skimage.metrics import structural_similarity as ssim
import pandas as pd

def enhance_image(image_path):
    # Load the image using PIL
    image = Image.open(image_path)

    # Convert the image to a NumPy array for OpenCV processing
    image_cv = np.array(image)

    # Apply sharpening using a kernel
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])
    sharpened = cv2.filter2D(image_cv, -1, kernel)

    # Denoise the image
    denoised = cv2.fastNlMeansDenoisingColored(sharpened, None, 10, 10, 7, 21)

    # Convert back to PIL image
    enhanced_image = Image.fromarray(denoised)

    # Adjust contrast
    enhancer = ImageEnhance.Contrast(enhanced_image)
    enhanced_image = enhancer.enhance(1.5)  # Increase contrast

    return enhanced_image

def calculate_metrics(original_path, enhanced_image):
    original_image = cv2.imread(original_path)
    enhanced_image_cv = np.array(enhanced_image)

    # Convert images to grayscale for SSIM calculation
    original_gray = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
    enhanced_gray = cv2.cvtColor(enhanced_image_cv, cv2.COLOR_BGR2GRAY)

    # Calculate PSNR
    psnr_value = cv2.PSNR(original_image, enhanced_image_cv)

    # Calculate SSIM
    ssim_value = ssim(original_gray, enhanced_gray)

    return psnr_value, ssim_value
count = 0
# Specify the folder containing the PDF files
pdf_folder = "/content/drive/MyDrive/QoL questionnaire testing/WorkspacesDownload_2024-09-20-10-37-51"
output_base_folder = "/content/drive/MyDrive/output_images_from_pdf_150"
google_drive_folder = "/content/drive/MyDrive/enhanced_output_images_from_pdf_150"

# Create the base output folders if they don't exist
os.makedirs(output_base_folder, exist_ok=True)
os.makedirs(google_drive_folder, exist_ok=True)

# Create a DataFrame to store PSNR and SSIM values
metrics_data = []

# Loop through each PDF in the folder
for root, _, files in os.walk(pdf_folder):
    for pdf_file in files:
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(root, pdf_file)
            pdf = pdfium.PdfDocument(pdf_path)

            # Create a corresponding folder in Google Drive
            relative_path = os.path.relpath(root, pdf_folder)  # Get the relative path
            output_folder = os.path.join(google_drive_folder, relative_path, os.path.splitext(pdf_file)[0])
            os.makedirs(output_folder, exist_ok=True)
            count+=1
            print(count)
            # Loop over pages and render
            for i in range(len(pdf)):
                page = pdf[i]
                image = page.render(scale=4).to_pil()
                image_path = os.path.join(output_folder, f"output_{i:03d}.png")
                image.save(image_path)  # Save the original image

                # Enhance the image
                enhanced_image = enhance_image(image_path)

                # Save the enhanced image in Google Drive, replacing the original
                enhanced_image.save(image_path)

                # Calculate metrics
                #psnr_value, ssim_value = calculate_metrics(image_path, enhanced_image)

                # Store the metrics in the list
                #metrics_data.append({
                #    "Image": os.path.basename(image_path),
                #    "PSNR": psnr_value,
                #    "SSIM": ssim_value
                #})

# Convert the list to a DataFrame
metrics_df = pd.DataFrame(metrics_data)

# Print the DataFrame
print(metrics_df)

# Save the DataFrame to a CSV file
output_csv_path = "/content/drive/MyDrive/psnr_ssim_values.csv"
metrics_df.to_csv(output_csv_path, index=False)

print(f"Metrics saved to {output_csv_path}")
print("Conversion and enhancement completed!")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
Empty DataFrame
Columns: []
Index: []
Metrics saved to /content/drive/MyDrive/psnr_ssim_values.csv
Conversion and enhancement completed!


###10 pdfs conversion


In [None]:
import os
import re
import pandas as pd
from PIL import Image
from google.generativeai.types import HarmCategory, HarmBlockThreshold
# Specify the folder containing the PDF files
pdf_folder = "/content/drive/MyDrive/Medical_Questionnaire_Dataset"
output_base_folder = "/content/drive/MyDrive/output_images_from_pdf_test"
extracted_output_folder = "/content/drive/MyDrive/Extracted_outputs"
# Create the Extracted_output folder if it doesn't exist
os.makedirs(extracted_output_folder, exist_ok=True)

# Create the base output folder if it doesn't exist
os.makedirs(output_base_folder, exist_ok=True)
# Initialize the generative model
vision_model = genai.GenerativeModel('gemini-1.5-flash')
# Loop through each folder in the output_images direcory
for pdf_folder in os.listdir(output_base_folder):
    folder_path = os.path.join(output_base_folder, pdf_folder)
    print(folder_path)
    if os.path.isdir(folder_path):  # Check if it's a directory

        # Initialize a list to hold all data for this PDF
        all_data = []

        # Loop through each image in the folder
        for image_file in sorted(os.listdir(folder_path)):  # Sort to process in order
            if image_file.endswith(".png"):
                image_path = os.path.join(folder_path, image_file)

                # Load the image
                image = Image.open(image_path)

                # Analyze the image using the generative model
                try:
                    response = vision_model.generate_content([
                        "Analyze this image. For each question in this questionaire, if the answer is marked either circled or ticked, provide the question and the selected answer as 1,2,3,4 in a table with two columns: 'Question' along with question number and 'Selected Choice'. Dont give answers of those questions which are not marked, leave them empty. Try to make it as accurate as possible please", image],
                    safety_settings = [
                                {
                                    "category": "HARM_CATEGORY_HARASSMENT",
                                    "threshold": "BLOCK_NONE",
                                },
                                {
                                    "category": "HARM_CATEGORY_HATE_SPEECH",
                                    "threshold": "BLOCK_NONE",
                                },
                                {
                                    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                                    "threshold": "BLOCK_NONE",
                                },
                                {
                                    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                                    "threshold": "BLOCK_NONE",
                                },
                            ]

                    )
                    # Check if the response is valid
                    print(response.candidates)
                    if not response.candidates or not response.text:
                        print(f"No valid content returned for {image_file}")
                        continue

                    # Extract table data from the response text using regex
                    rows = re.findall(r'\|\s*(.*?)\s*\|\s*(.*?)\s*\|', response.text)

                    if not rows:
                        continue

                    df = pd.DataFrame(rows[1:], columns=rows[0])
                    all_data.append(df)
                except Exception as e:
                    print(f"Error processing {image_file}: {e}")

        # Combine all the DataFrames for this PDF into a single DataFrame

        if all_data:
            final_df = pd.concat(all_data, ignore_index=True)

            # Standardize column names: Rename all variations to 'Selected Choice'
            final_df.columns = [re.sub(r'(?i)(Choice|Not at all)', 'Selected Choice', col) for col in final_df.columns]

            # Save the DataFrame to an Excel file with the same name as the PDF folder
            excel_filename = os.path.join(extracted_output_folder, f"{pdf_folder}.xlsx")
            final_df.to_excel(excel_filename, index=False)


print("Processing completed!")


/content/drive/MyDrive/output_images_from_pdf_test/108010 Arm A1 QoL 21.09.22
[content {
  parts {
    text: "| Question | Selected Choice |\n|---|---|\n| 1. Do you have any trouble doing strenuous activities, like carrying a heavy shopping bag or suitcase? | 4 |\n| 2. Do you have any trouble taking a long walk? | 3 |\n| 3. Do you have any trouble taking a short walk outside the house? | 3 |\n| 4. Do you need to stay in a bed or a chair during the day? | 3 |\n| 5. Do you need help with eating, dressing, washing yourself or using the toilet? | 2 |\n| 6. Were you limited in doing either your work or other daily activities? | 4 |\n| 7. Were you limited in pursuing your hobbies or other leisure time activities? | 4 |\n| 8. Were you short of breath? |  |\n| 9. Have you had pain? | 1 |\n| 10. Did you need to rest? | 3 |\n| 11. Have you had trouble sleeping? | 2 |\n| 12. Have you felt weak? | 3 |\n| 13. Have you lacked appetite? | 3 |\n| 14. Have you felt nauseated? | 3 |\n| 15. Have you vomi

In [None]:
import os
import pypdfium2 as pdfium

# Specify the folder containing the PDF files
pdf_folder = "/content/drive/MyDrive/Medical_Questionnaire_Dataset"
output_base_folder = "/content/drive/MyDrive/output_images_from_pdf_test"

# Create the base output folder if it doesn't exist
os.makedirs(output_base_folder, exist_ok=True)

# Loop through each PDF in the folder
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        # Get the PDF file path
        pdf_path = os.path.join(pdf_folder, pdf_file)

        # Load the document
        pdf = pdfium.PdfDocument(pdf_path)

        # Create a folder for the images of this PDF
        output_folder = os.path.join(output_base_folder, os.path.splitext(pdf_file)[0])
        os.makedirs(output_folder, exist_ok=True)

        # Loop over pages and render
        for i in range(len(pdf)):
            page = pdf[i]
            image = page.render(scale=4).to_pil()

            # Save the image to the corresponding folder
            image.save(os.path.join(output_folder, f"output_{i:03d}.png"))

print("Conversion completed!")


Conversion completed!


In [None]:
# Install Google's Gemini Libraries
!pip install -q -U google-generativeai


#Import libraries
import google.generativeai as genai
from google.colab import userdata
from IPython.display import display
from IPython.display import Markdown
import PIL.Image

# configure api key and initialise model
from google.colab import userdata
import os


In [None]:
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = userdata.get('api_key')

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])



In [None]:
import os
import re
import pandas as pd
from PIL import Image
from google.generativeai.types import HarmCategory, HarmBlockThreshold
# Set up the base directories
output_base_folder = "/content/drive/MyDrive/output_images_from_pdf"
extracted_output_folder = "/content/drive/MyDrive/Extracted_outputs_excel_24_09"

# Create the Extracted_output folder if it doesn't exist
os.makedirs(extracted_output_folder, exist_ok=True)

# Initialize the generative model
vision_model = genai.GenerativeModel('gemini-1.5-flash')

# Loop through each folder in the output_images directory
for pdf_folder in os.listdir(output_base_folder):
    folder_path = os.path.join(output_base_folder, pdf_folder)
    if os.path.isdir(folder_path):  # Check if it's a directory

        # Initialize a list to hold all data for this PDF
        all_data = []

        # Loop through each image in the folder
        for image_file in sorted(os.listdir(folder_path)):  # Sort to process in order
            if image_file.endswith(".png"):
                image_path = os.path.join(folder_path, image_file)

                # Load the image
                image = Image.open(image_path)

                # Analyze the image using the generative model
                try:
                  response = vision_model.generate_content([
                      "Analyze this image and provide each question along with its selected choice either circled or ticked in the form of a table. Note: Dont give answers of those questions which are not marked, leave them empty", image],
                      safety_settings = [
                                {
                                    "category": "HARM_CATEGORY_HARASSMENT",
                                    "threshold": "BLOCK_NONE",
                                },
                                {
                                    "category": "HARM_CATEGORY_HATE_SPEECH",
                                    "threshold": "BLOCK_NONE",
                                },
                                {
                                    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                                    "threshold": "BLOCK_NONE",
                                },
                                {
                                    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                                    "threshold": "BLOCK_NONE",
                                },
                            ]

                  )

                  # Check if the response is valid
                  if not response.candidates or not response.text:
                      print(f"No valid content returned for {image_file}")
                      continue

                  # Extract table data from the response text using regex
                  rows = re.findall(r'\|\s*(.*?)\s*\|\s*(.*?)\s*\|', response.text)

                  if not rows:
                      continue

                  df = pd.DataFrame(rows[1:], columns=rows[0])
                  all_data.append(df)
                except Exception as e:
                  print(f"Error processing {image_file}: {e}")

        # Combine all the DataFrames for this PDF into a single DataFrame
        if all_data:
            final_df = pd.concat(all_data, ignore_index=True)

            # Save the DataFrame to an Excel file with the same name as the PDF folder
            excel_filename = os.path.join(extracted_output_folder, f"{pdf_folder}.xlsx")
            final_df.to_excel(excel_filename, index=False)
            print('done')

print("Processing completed!")


done
done
done
done
done
done
Error processing output_000.png: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
Error processing output_000.png: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
Processing completed!


In [None]:
import os
import re
import pandas as pd
from PIL import Image
from google.generativeai.types import HarmCategory, HarmBlockThreshold
# Set up the base directories
output_base_folder = "/content/drive/MyDrive/output_images_from_pdf"
extracted_output_folder = "/content/drive/MyDrive/057100 Arm A1 QoL 04.02.22 20"

# Create the Extracted_output folder if it doesn't exist
os.makedirs(extracted_output_folder, exist_ok=True)

# Initialize the generative model
vision_model = genai.GenerativeModel('gemini-1.5-pro')
pdf_folder = "057100 Arm A1 QoL 04.02.22 20"

folder_path = os.path.join(output_base_folder, pdf_folder)
if os.path.isdir(folder_path):  # Check if it's a directory

    # Initialize a list to hold all data for this PDF
    all_data = []

    # Loop through each image in the folder
    for image_file in sorted(os.listdir(folder_path)):  # Sort to process in order
        if image_file.endswith(".png"):
            image_path = os.path.join(folder_path, image_file)

            # Load the image
            image = Image.open(image_path)

            # Analyze the image using the generative model
            try:
              response = vision_model.generate_content([
                  "Analyze this image and provide each question along with its selected choice either circled or ticked in the form of a table.", image],
                  safety_settings = [
                            {
                                "category": "HARM_CATEGORY_HARASSMENT",
                                "threshold": "BLOCK_NONE",
                            },
                            {
                                "category": "HARM_CATEGORY_HATE_SPEECH",
                                "threshold": "BLOCK_NONE",
                            },
                            {
                                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                                "threshold": "BLOCK_NONE",
                            },
                            {
                                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                                "threshold": "BLOCK_NONE",
                            },
                        ]

              )
              print(response.text)
              # Check if the response is valid
              if not response.candidates or not response.text:
                  print(f"No valid content returned for {image_file}")
                  continue

              # Extract table data from the response text using regex
              rows = re.findall(r'\|\s*(.*?)\s*\|\s*(.*?)\s*\|', response.text)

              if not rows:
                  continue

              df = pd.DataFrame(rows[1:], columns=rows[0])
              all_data.append(df)
            except Exception as e:
              print(f"Error processing {image_file}: {e}")

    # Combine all the DataFrames for this PDF into a single DataFrame
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)

        # Save the DataFrame to an Excel file with the same name as the PDF folder
        excel_filename = os.path.join(extracted_output_folder, f"{pdf_folder}.xlsx")
        final_df.to_excel(excel_filename, index=False)
        print('done')

print("Processing completed!")


## STAMPEDE EORTC QLQ-30 QUALITY OF LIFE FORM Analysis:

| Question | Not at all | A little | Quite a bit | Very much | 
|---|---|---|---|---|
| **1. Do you have any trouble doing strenuous activities, like carrying a heavy shopping bag or suitcase?** | ⓵ | 2 | 3 | 4 |
| **2. Do you have any trouble taking a long walk?** | ⓵ | 2 | 3 | 4 |
| **3. Do you have any trouble taking a short walk outside the house?** | ⓵ | 2 | 3 | 4 |
| **4. Do you need to stay in bed or a chair during the day?** | ⓵ | 2 | 3 | 4 |
| **5. Do you need help with eating, dressing, washing yourself or using the toilet?** | ⓵ | 2 | 3 | 4 |
| **During the past week:** | | | | |
| **6. Were you limited in doing either your work or other daily activities?** | ⓵ | 2 | 3 | 4 |
| **7. Were you limited in pursuing your hobbies or other leisure time activities?** | ⓵ | 2 | 3 | 4 |
| **8. Were you short of breath?** | ⓵ | 2 | 3 | 4 |
| **9. Have you had pain?** | ⓵ | 2 | 3 | 4 |
| **10. Did you need to rest?** | 1 | 2 | ③ | 



Error processing output_004.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
done
Processing completed!
