### Extracting the relevant data from PDFs and project descriptions


## From PDFs

In [1]:
import os
import re
import pandas as pd
import geopandas as gpd
from transformers import pipeline
from helper_functions import extract_text_with_pymupdf ,extract_planting_date
# Folder holding the project-description PDFs (pd_verra_*.pdf / pd_goldstandard_*.pdf).
pdf_folder = "/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions"


# Extractive question-answering model, pinned to a specific revision.
model_name = "distilbert-base-cased-distilled-squad"
revision = "626af31"
qa_pipeline = pipeline("question-answering", model=model_name, revision=revision, framework="pt")

# Date-related questions asked of every PDF; each question becomes a CSV column.
questions = [
    "What is the planting date?",
    "What is the project start date?"
]

results = []

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of the output CSV differ between runs/machines.
for filename in sorted(os.listdir(pdf_folder)):
    is_project_pdf = filename.endswith(".pdf") and filename.startswith(("pd_verra_", "pd_goldstandard_"))
    if not is_project_pdf:
        continue

    # The project id is the last "_"-separated token of the file name, without extension.
    project_id = filename.split("_")[-1].split(".")[0]
    pdf_path = os.path.join(pdf_folder, filename)
    text = extract_text_with_pymupdf(pdf_path)

    if not text.strip():
        print(f"Skipping {filename}: No text extracted.")
        continue

    project_data = {"Project_ID": project_id}

    # Each question is processed separately so one failure does not lose the others.
    for question in questions:
        try:
            result = qa_pipeline(question=question, context=text)
            project_data[question] = result["answer"]
        except Exception as e:
            print(f"Error processing {filename} for question '{question}': {e}")
            # "Error" is a sentinel value kept in the CSV for failed questions.
            project_data[question] = "Error"

    results.append(project_data)

# Saving the results to a CSV file
df = pd.DataFrame(results)
# NOTE(review): pdf_folder already ends in ".../midsave/project_descriptions", so this
# resolves to ".../midsave/midsave/extracted_project_data.csv" — confirm that the nested
# "midsave" folder is the intended destination.
# normpath() collapses the ".." component, which os.makedirs cannot handle reliably.
output_csv = os.path.normpath(os.path.join(pdf_folder, "../midsave/extracted_project_data.csv"))
# to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv, index=False)

print(f"Extraction completed! Data saved to {output_csv}")

Device set to use mps:0


Error reading /Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12938.pdf with PyMuPDF: Failed to open file '/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12938.pdf'.
Skipping pd_goldstandard_12938.pdf: No text extracted.
Error reading /Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12323.pdf with PyMuPDF: Failed to open file '/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12323.pdf'.
Skipping pd_goldstandard_12323.pdf: No text extracted.
Skipping pd_goldstandard_3260.pdf: No text extracted.
Error reading /Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_3025.pdf with PyMuPDF: Failed to open file '/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_3025.pdf'.
Skipping pd_goldstandard_3025.pdf: No text extracted.
Error reading /Users/angela/Documents/Forest_Moni

In [6]:


# Folder holding the project-description PDFs (pd_verra_*.pdf / pd_goldstandard_*.pdf).
pdf_folder = "/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions"
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
bert_qa_pipeline = pipeline("question-answering", model=model_name)

# Each question becomes a column of the output CSV, so the strings (including the
# double space in the "purpose" question) are kept exactly as they are.
questions = [
    # Species question
    "Which species were planted? Name each mentioned Species please",

    # Purpose with multiple phrasings
    "What is the purpose  of the planting project?",
    "What is the aim of the planting project?",

    # # Community involvement
    # "Was there community involvement?",

    # Area planted
    "What is the total area planted?",

    # Tree count
    "How many trees were planted?"
]

results = []


# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of the output CSV differ between runs/machines.
for filename in sorted(os.listdir(pdf_folder)):
    if filename.endswith(".pdf") and (filename.startswith("pd_verra_") or filename.startswith("pd_goldstandard_")):

        # The project id is the last "_"-separated token of the file name, without extension.
        project_id = filename.split("_")[-1].split(".")[0]
        pdf_path = os.path.join(pdf_folder, filename)
        text = extract_text_with_pymupdf(pdf_path)

        if not text.strip():
            print(f"Skipping {filename}: No text extracted.")
            continue

        project_data = {"Project_ID": project_id}
        for question in questions:
            try:
                # Ask for up to 10 candidate answers, each with a confidence score.
                answers = bert_qa_pipeline(
                    question=question,
                    context=text,
                    top_k=10,
                    handle_impossible_answer=True
                )

                # The pipeline returns a bare dict instead of a list when only one
                # candidate is left; without this, max() would iterate over the dict's
                # keys and a valid answer would be recorded as "Error".
                if isinstance(answers, dict):
                    answers = [answers]

                # Selecting answer with highest confidence score
                best_answer = max(answers, key=lambda x: x['score'])

                # The purpose question may have several valid answers: keep every
                # candidate above the confidence threshold instead of only the best one.
                if question == "What is the purpose  of the planting project?":
                    all_purposes = [ans["answer"] for ans in answers if ans["score"] > 0.5]  # Adjust threshold as needed
                    project_data[question] = "; ".join(all_purposes)
                else:
                    project_data[question] = best_answer["answer"]

            except Exception as e:
                print(f"Error processing {filename} for question '{question}': {e}")
                # "Error" is a sentinel value kept in the CSV for failed questions.
                project_data[question] = "Error"

        results.append(project_data)

df_new2 = pd.DataFrame(results)
output_csv = os.path.join(pdf_folder, "new_improvedbert_extracted_project_data.csv")
df_new2.to_csv(output_csv, index=False)

print(f"Extraction completed! Data saved to {output_csv}")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


Error reading /Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12938.pdf with PyMuPDF: Failed to open file '/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12938.pdf'.
Skipping pd_goldstandard_12938.pdf: No text extracted.
Error reading /Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12323.pdf with PyMuPDF: Failed to open file '/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_12323.pdf'.
Skipping pd_goldstandard_12323.pdf: No text extracted.
Skipping pd_goldstandard_3260.pdf: No text extracted.
Error reading /Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_3025.pdf with PyMuPDF: Failed to open file '/Users/angela/Documents/Forest_Monitoring/midsave/project_descriptions/pd_goldstandard_3025.pdf'.
Skipping pd_goldstandard_3025.pdf: No text extracted.
Error reading /Users/angela/Documents/Forest_Moni

#### Results from manual annotation/text extraction are saved to the file
"/home/idisc02/Downloads/Manually_Filtered_extracted_text_sp_dates.csv"




In [None]:

# Load the manually filtered species/date extraction results.
manual = pd.read_csv(
    "../midsave/Manually_Filtered_extracted_text_sp_dates.csv"
)

# Print a summary of the loaded table.
manual.info()

In [None]:
# Parquet file with the consolidated reforestation projects, read through geopandas.
file_path = "../midsave/newest_consolidated_reforestation_projects_with_cicular.parquet"
reforestation_df = gpd.read_parquet(file_path)

In [None]:
# Merge the manually extracted columns (species, planting date) into the original data.
# Left join: every project is kept, whether or not it has a manual annotation.
# The join key 'Project_ID' duplicates 'project_id_reported', so it is dropped afterwards.
reforestation_project_gdf = (
    reforestation_df
    .merge(
        manual[[
            'Project_ID',
            'Which species were planted? Name each mentioned Species please_y',
            'What is the planting date?',
        ]],
        how='left',
        left_on='project_id_reported',
        right_on='Project_ID',
    )
    .drop(columns=['Project_ID'])
)

reforestation_project_gdf.head()

In [None]:
# Cleaning the data
# Remove embedded newline characters from string values.
reforestation_project_gdf = reforestation_project_gdf.replace('\n', '', regex=True)
# Strip surrounding whitespace element-wise, touching only actual strings.
# The .str accessor would turn every non-string cell of an object column into NaN
# and raise on object columns that hold no strings at all.
reforestation_project_gdf = reforestation_project_gdf.apply(
    lambda col: col.map(lambda value: value.strip() if isinstance(value, str) else value)
    if col.dtype == "object"
    else col
)
# Parse the planting dates; values that cannot be parsed become NaT (errors='coerce').
reforestation_project_gdf['What is the planting date?'] = pd.to_datetime(
    reforestation_project_gdf['What is the planting date?'],
    format='mixed',
    errors='coerce'
)

In [None]:

# Keep only the rows that have a parsed planting date and expose that column
# under the name 'planting_date_derived'.
filtered_gdf = (
    reforestation_project_gdf
    .loc[lambda frame: frame['What is the planting date?'].notna()]
    .rename(columns={"What is the planting date?": "planting_date_derived"})
)


In [None]:
# For projects without PDFs we extract the dates and other variables from the description.
# These are the rows that have no parsed planting date.
# .copy() makes this an independent frame, so columns can be added to it later
# without pandas raising SettingWithCopyWarning.
others_data = reforestation_project_gdf[reforestation_project_gdf['What is the planting date?'].isna()].copy()
others_data.info()

# Extraction from descriptions

In [None]:


# Derive the planting date from each project description with the extract_planting_date helper.
# assign() builds a new frame instead of writing a column into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
others_data = others_data.assign(
    planting_date_derived=others_data["project_description_reported"].apply(extract_planting_date)
)

In [None]:

# Extractive question-answering model applied to the free-text project descriptions.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
bert_qa_pipeline = pipeline("question-answering", model=model_name)


# Each question becomes a column of df_results.
questions = [
    "Which species were planted? Name each mentioned Species please",
    "What is the purpose of the planting project?",
    "What is the aim of the planting project?",
    "What is the total area planted?",
    "How many trees were planted?"
]


results = []


for _, row in others_data.iterrows():
    # The merged 'Project_ID' column was dropped after the merge, so the project
    # identifier has to be read from 'project_id_reported' (the merge key).
    project_id = row["project_id_reported"]
    context = row["project_description_reported"]

    project_data = {"Project_ID": project_id}

    # A missing (NaN) or blank description cannot be queried. Record the same
    # "Error" sentinel a failed question would produce, without calling the model.
    if not isinstance(context, str) or not context.strip():
        print(f"No usable description for Project_ID {project_id}; marking all answers as 'Error'.")
        project_data.update({question: "Error" for question in questions})
        results.append(project_data)
        continue

    for question in questions:
        try:
            # Ask for up to 10 candidate answers, each with a confidence score.
            answers = bert_qa_pipeline(
                question=question,
                context=context,
                top_k=10,
                handle_impossible_answer=True
            )

            # The pipeline returns a bare dict instead of a list when only one
            # candidate is left (likely for short descriptions); without this,
            # max() would iterate over the dict's keys and raise.
            if isinstance(answers, dict):
                answers = [answers]

            # Selecting the answer with the highest confidence score
            best_answer = max(answers, key=lambda x: x['score'])

            # The purpose question may have several valid answers: keep every
            # candidate above the confidence threshold instead of only the best one.
            if question == "What is the purpose of the planting project?":
                all_purposes = [ans["answer"] for ans in answers if ans["score"] > 0.5]  # Adjust threshold as needed
                project_data[question] = "; ".join(all_purposes)
            else:
                project_data[question] = best_answer["answer"]

        except Exception as e:
            print(f"Error processing Project_ID {project_id} for question '{question}': {e}")
            # "Error" is a sentinel value kept in the results for failed questions.
            project_data[question] = "Error"

    results.append(project_data)


df_results = pd.DataFrame(results)
