In [None]:
# pip installations
!pip install PyMuPDF pdfplumber openai==0.28

Collecting PyMuPDF
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m790.5 kB/s[0m eta [36m0:00:00[0m
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownlo

In [None]:
import fitz
import os
import re
import openai
import json
import pickle
import requests
import pdfplumber
import numpy as np
import pandas as pd
from io import BytesIO
from bs4 import BeautifulSoup
from google.colab import drive
from openai import ChatCompletion
from IPython.display import display, HTML
# from dotenv import load_dotenv

In [None]:
# Mount Google Drive
# The PDF input and the CSV/pickle outputs used later in this notebook live
# under /content/drive/MyDrive, so the Drive must be mounted before those cells run.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Prompt Engineering

In [None]:
# Define the prompt
# Template for the single user message sent to the chat model.
# - `{text}` is the only placeholder; classify_text() fills it via PROMPT.format(text=...).
# - Because the template goes through str.format, any literal curly braces added
#   here later (e.g. an example JSON schema) must be doubled ("{{" / "}}").
# NOTE(review): the template ends with "JSON format:_" and no schema follows —
# presumably an example output structure was meant to go here; confirm.
PROMPT = """ You are provided with a document published by the U.S. Department of Agriculture. Your task is to assess the document carefully and extract information related to the U.S. weather forecasts.
Your focus should only be on the first 4 pages of the document, where the forecasts mainly about the U.S. are provided.
Ignore any sections related to international weather forecasts or crop-specific information.

You must follow the key information mentioned in the below section when assessing the content of the document, ensuring all facts are exactly as mentioned in the report.
The output should directly reflect the sections in the document without making any assumptions, generating new data, or using placeholders.

document_context: {text}

*Key Rules*:
  - Read the entire relevant section (first 4 pages) and understand before answering, paying attention to weather conditions.
  - No Inferences or Assumptions: You need to extract only the information exactly as written in the document. Do not infer any new information or generate any assumption.
  - Try to be exact on the locations (States/Counties/Regions/Towns/Cities/Zones) and time (specific dates/weeks/months), keep blank if not mentioned.
  - Do not mention about any other weather conditons aprt from the categories specified below

*Key facts to be extracted from the document*:

1. Weather Forecast and Climate Conditions:
  - Weather conditions such as Heavy Rainfall, Flooding, Drought, Dry Conditions, High Temperatures, Heat Waves, Strong Winds, Wildfires (even unexpected/sudden weather changes that has occured or expected to occur)
  - States/Counties/Regions/Towns/Cities/Zones where Heavy Rainfall and Flooding, Drought and Dry Conditions, High Temperatures, Strong Winds, Heat Waves and Wildfires have occured or expected to occur, along with time periods such as days/weeks/months.

The final output should be in the JSON format:_

"""

In [None]:
def get_classification(prompt):
  """Send `prompt` to the chat model as one user message and return the reply text.

  The request uses the "gpt-4o-mini" model with temperature 0 and returns the
  content of the first choice in the response.
  """
  user_message = {"role": "user", "content": prompt}
  completion = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[user_message],
    temperature=0
  )
  first_choice = completion.choices[0]
  return first_choice.message['content']

In [None]:
def extract_text_from_pdf(pdf_path):
  """Return the text of every page of the PDF at `pdf_path`, concatenated in page order.

  Args:
    pdf_path: Path of the PDF file to open with PyMuPDF (`fitz`).

  Returns:
    A single string holding the extracted text of all pages; empty if the
    document has no pages.
  """
  # Open the document as a context manager so it is closed even when
  # extraction of a page raises; the original left the document open.
  with fitz.open(pdf_path) as doc:
    page_texts = [
      doc.load_page(page_num).get_text()
      for page_num in range(doc.page_count)
    ]
  # Join once instead of growing a string page by page.
  return "".join(page_texts)

def classify_text(text):
  """Insert `text` into the PROMPT template and return the model's reply for it."""
  return get_classification(PROMPT.format(text=text))

In [None]:
def _strip_code_fence(raw_reply):
  """Remove a surrounding Markdown code fence from a model reply, if present."""
  cleaned = raw_reply.strip()
  # Opening fence: accept a bare ``` as well as ```json in any letter case.
  # The original only recognised a lowercase "```json" prefix, so other
  # fence variants were left in place and broke JSON parsing downstream.
  cleaned = re.sub(r"^```(?:json)?", "", cleaned, count=1, flags=re.IGNORECASE)
  if cleaned.endswith("```"):
    cleaned = cleaned[:-3]
  return cleaned.strip()


def main(pdf_path):
  """Extract text from a PDF, send it to the model, and return the cleaned reply.

  Args:
    pdf_path: Path of the PDF file to process.

  Returns:
    The model's reply as a string with surrounding whitespace and any
    enclosing Markdown code fence removed. The string is not parsed or
    validated as JSON here.
  """
  text = extract_text_from_pdf(pdf_path)
  classified_data = classify_text(text)
  return _strip_code_fence(classified_data)

# Run PDF
# Runs the full pipeline on one bulletin stored on the mounted Drive and prints
# the cleaned model reply. `json_result` stays a module-level name because the
# DataFrame conversion cell further down reads it.
# NOTE(review): the folder name "FYP " contains a trailing space — confirm it
# matches the actual Drive folder.
pdf_path = "/content/drive/MyDrive/FYP /AI Alignment/Test/data/US_Weather_Bulletin.pdf"
json_result = main(pdf_path)
print(json_result)

{
  "WeatherForecastAndClimateConditions": {
    "HeavyRainfall": {
      "Locations": [
        {
          "State": "New Mexico",
          "Details": "Locally heavy precipitation sparking flash flooding in parts of eastern New Mexico",
          "Date": "October 19, 2024"
        },
        {
          "State": "Washington",
          "Details": "Significant precipitation fell in western Washington",
          "Date": "October 19, 2024"
        },
        {
          "State": "Utah",
          "Details": "Precipitation totals more than an inch in several locations",
          "Date": "October 17-18, 2024"
        },
        {
          "State": "New Mexico",
          "Details": "Roswell's 5.78-inch daily sum became that city’s wettest day on record",
          "Date": "October 19, 2024"
        },
        {
          "State": "Washington",
          "Details": "Daily-record totals for October 19 in Quillayute (4.68 inches) and Bellingham (1.97 inches)",
          "Date": "October 1

In [None]:
def json_to_combined_dataframe(json_result):
    """Flatten a nested JSON object into a two-column DataFrame.

    Each top-level key becomes a ``Category``. Every leaf value (scalar or
    list item) below it becomes one row whose ``Details`` is the chain of
    nested keys leading to the value, followed by the value itself, joined
    with " > ". Dictionaries inside lists are not expanded; they are
    rendered with their string representation.

    Args:
        json_result: Either a JSON string or an already-parsed mapping.

    Returns:
        A DataFrame with columns ``Category`` and ``Details`` (present even
        when there are no rows), or ``None`` if ``json_result`` is a string
        that is not valid JSON (the decode error is printed).
    """
    # Parse the JSON result into a dictionary if it's in string form
    if isinstance(json_result, str):
        try:
            json_result = json.loads(json_result)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            return None

    combined_data = []

    def add_row(category, subcategory, value, parent_path):
        # Only prepend the parent path when there is one; otherwise values
        # sitting directly under a category got a dangling leading " > ".
        prefix = f"{parent_path} > {subcategory}" if parent_path else subcategory
        combined_data.append({
            "Category": category,
            "Details": f"{prefix} > {value}"
        })

    # Traverse the JSON structure
    def traverse_json(category, subcategory, details, parent_path=""):
        if isinstance(details, list):
            # Dict and non-dict list items are rendered the same way.
            for item in details:
                add_row(category, subcategory, item, parent_path)
        elif isinstance(details, dict):
            new_path = f"{parent_path} > {subcategory}" if parent_path else subcategory
            for key, value in details.items():
                traverse_json(category, key, value, new_path)
        else:
            add_row(category, subcategory, details, parent_path)

    # Iterate over each main category in the JSON
    for category, subcategory_data in json_result.items():
        if isinstance(subcategory_data, dict):
            for subcategory, details in subcategory_data.items():
                traverse_json(category, subcategory, details)
        else:
            combined_data.append({
                "Category": category,
                "Details": f"{subcategory_data}"
            })

    # Convert to pandas DataFrame; fixing the columns keeps the schema stable
    # even when no rows were collected.
    df_combined = pd.DataFrame(combined_data, columns=["Category", "Details"])
    return df_combined


# Flatten the model's JSON reply and persist it next to the source PDF.
df = json_to_combined_dataframe(json_result)
if df is None:
    # json_to_combined_dataframe returns None when the reply is not valid JSON;
    # fail with a clear message instead of an AttributeError on None.
    raise ValueError("Model output could not be parsed as JSON; nothing to display or export.")
display(df)
df.to_csv('/content/drive/MyDrive/FYP /AI Alignment/Test/data/US_weather_report.csv')
df.to_pickle('/content/drive/MyDrive/FYP /AI Alignment/Test/data/US_weather_report.pkl')

Unnamed: 0,Category,Details
0,WeatherForecastAndClimateConditions,HeavyRainfall > Locations > {'State': 'New Mex...
1,WeatherForecastAndClimateConditions,HeavyRainfall > Locations > {'State': 'Washing...
2,WeatherForecastAndClimateConditions,"HeavyRainfall > Locations > {'State': 'Utah', ..."
3,WeatherForecastAndClimateConditions,HeavyRainfall > Locations > {'State': 'New Mex...
4,WeatherForecastAndClimateConditions,HeavyRainfall > Locations > {'State': 'Washing...
5,WeatherForecastAndClimateConditions,"Flooding > Locations > {'State': 'New Mexico',..."
6,WeatherForecastAndClimateConditions,DroughtAndDryConditions > Locations > {'State'...
7,WeatherForecastAndClimateConditions,DroughtAndDryConditions > Locations > {'State'...
8,WeatherForecastAndClimateConditions,DroughtAndDryConditions > Locations > {'State'...
9,WeatherForecastAndClimateConditions,DroughtAndDryConditions > Locations > {'State'...
