In [1]:
import fitz
import re
import os
import PyPDF2
import pandas as pd
from datetime import datetime
from wordcloud import WordCloud
import matplotlib.pyplot as plt
#from textblob import TextBlob

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [2]:
# Folder that is scanned for input PDFs; it is passed to process_pdfs_in_folder further down.
input_folder = "./php/casefiles/"
# Output location is currently disabled (nothing in the visible cells writes to it).
#output_folder = "./bps/bps_redacted"

In [3]:
# NOTE(review): `data` is not referenced by any other cell visible in this notebook;
# confirm it is unused elsewhere before removing it.
data = []

In [4]:
# Emotion category -> keyword variants that count as a mention of that emotion.
# The key order defines the order of the per-emotion True/False columns in the output table.
emotion_keywords = {
    "Pain": [
        "pain", "pains", "painful", "hurt", "hurts",
        "hurting", "sore", "soreness", "ache", "discomfort",
    ],
    "Sad": [
        "sad", "sadder", "saddest", "downhearted",
        "heartbroken", "mournful", "grief", "sorrow",
    ],
    "Content": ["contentment", "content", "pleased", "satisfied"],
    "Anger": [
        "anger", "angry", "rage", "enraged",
        "fury", "furious", "fuming",
    ],
    "Shame": ["shame", "ashamed", "guilt", "guilty"],
    "Fear": [
        "fear", "fearful", "scared",
        "scary", "frightened", "terrified",
    ],
    "Joy": ["joy", "joyful", "happy", "cheerful", "joyous"],
    "Anxiety": [
        "anxiety", "anxious", "nervous", "uneasy", "restless",
        "apprehensive", "worry", "stress", "tense", "worried",
    ],
    "Depressed": ["depressed", "depress", "depression"],
    "Alone": ["alone", "lonely", "loneliness", "isolated", "abandoned"],
}

In [5]:
# Support category -> keyword variants that count as a mention of that support.
# The key order defines the order of the per-support True/False columns in the output table.
supports_keywords = {
    "Sleep": [
        "sleep",
        "slept",
    ],
    "Nutrition": [
        "nutrition",
    ],
    "Exercise": [
        "exercise",
        "workout",
        "work out",
    ],
    "Fun": [
        "fun",
    ],
    "Connection": [
        "connection",
    ],
    "Warmth": [
        "warmth",
    ],
    "Water": [
        "water",
    ],
    "Love": [
        "love",
    ],
    "Therapy": [
        "therapy",
    ],
}

In [6]:
# Skill category -> phrases that count as a mention of that skill.
# Phrases are matched as whole words/phrases, so each spelling variant must be listed explicitly.
# The key order defines the order of the per-skill True/False columns in the output table.
skills_keywords = {
    "Mindfulness/Meditation": ["mindful", "mindfulness", "meditate", "meditation"],
    "Distress Tolerance": ["distress", "tolerate", "tolerance"],
    "Opposite Action": ["opposite action", "opposite"],
    "Take My Meds": ["take my meds", "take meds", "take my med", "take med"],
    "Ask For Help": ["ask for help", "ask help", "seek help"],
    # Fixed typo: the second phrase used to read "improveme the moment" and could never
    # match the intended wording "improve the moment".
    "Improve Moment": ["improve moment", "improve the moment"],
    "Parts Work": ["parts work", "part work", "parts works", "part works"],
    "Play The Tape Thru": ["play the tape thru", "play tape thru", "play the tape through", "play tape through"],
    "Values": ["value", "values"],
}


In [7]:
def extract_cravings_rating(text):
    """Return the first "Craving(s)/impulse ...: N/10" score in ``text`` as an int.

    The label is matched case-insensitively and N may be one or two digits.
    Returns None when no such rating is present.
    """
    rating_match = re.search(r'Craving[s]?/impulse.*?:\s*(\d{1,2})/10', text, re.IGNORECASE)
    return int(rating_match.group(1)) if rating_match else None

In [8]:
# Function to check if text contains any variation of each support and return matched words
def check_supports(text, keywords=None):
    """Flag which support categories are mentioned in ``text``.

    Every keyword is searched case-insensitively between word boundaries, so
    "fun" does not match inside "function".

    Args:
        text: Free text of one assessment.
        keywords: Optional mapping of category name -> list of keywords.
            When omitted, the module-level ``supports_keywords`` is used.

    Returns:
        A tuple ``(support_found, matched)``: ``support_found`` maps every
        category to True/False, and ``matched`` is a ", "-joined string of the
        distinct keywords that matched, in keyword-definition order.
    """
    if keywords is None:
        keywords = supports_keywords

    support_found = {}
    support_matched_words = []

    for support, words in keywords.items():
        found = [
            word for word in words
            # re.escape keeps a keyword literal even if it contains regex metacharacters
            if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE)
        ]
        support_found[support] = bool(found)  # True/False if any word matched
        support_matched_words.extend(found)  # Collect matched words

    # dict.fromkeys de-duplicates while keeping order; joining a set() gave an
    # ordering that could change from one interpreter run to the next.
    return support_found, ", ".join(dict.fromkeys(support_matched_words))

In [9]:
# Function to check if text contains any variation of each skill and return matched words
def check_skills(text, keywords=None):
    """Flag which skill categories are mentioned in ``text``.

    Every keyword or phrase is searched case-insensitively between word
    boundaries, so "value" does not match inside "values".

    Args:
        text: Free text of one assessment.
        keywords: Optional mapping of category name -> list of keywords.
            When omitted, the module-level ``skills_keywords`` is used.

    Returns:
        A tuple ``(skills_found, matched)``: ``skills_found`` maps every
        category to True/False, and ``matched`` is a ", "-joined string of the
        distinct keywords that matched, in keyword-definition order.
    """
    if keywords is None:
        keywords = skills_keywords

    skills_found = {}
    skill_matched_words = []

    for skill, words in keywords.items():
        found = [
            word for word in words
            # re.escape keeps a keyword literal even if it contains regex metacharacters
            if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE)
        ]
        skills_found[skill] = bool(found)  # True/False if any word matched
        skill_matched_words.extend(found)  # Collect matched words

    # dict.fromkeys de-duplicates while keeping order; joining a set() gave an
    # ordering that could change from one interpreter run to the next.
    return skills_found, ", ".join(dict.fromkeys(skill_matched_words))

In [10]:
# Function to check if text contains any variation of each emotion and return matched words
def check_emotions(text, keywords=None):
    """Flag which emotion categories are mentioned in ``text``.

    Every keyword is searched case-insensitively between word boundaries, so
    "sad" does not match inside "saddle".

    Args:
        text: Free text of one assessment.
        keywords: Optional mapping of category name -> list of keywords.
            When omitted, the module-level ``emotion_keywords`` is used.

    Returns:
        A tuple ``(emotions_found, matched)``: ``emotions_found`` maps every
        category to True/False, and ``matched`` is a ", "-joined string of the
        distinct keywords that matched, in keyword-definition order.
    """
    if keywords is None:
        keywords = emotion_keywords

    emotions_found = {}
    emotion_matched_words = []

    for emotion, words in keywords.items():
        found = [
            word for word in words
            # re.escape keeps a keyword literal even if it contains regex metacharacters
            if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE)
        ]
        emotions_found[emotion] = bool(found)  # True/False if any word matched
        emotion_matched_words.extend(found)  # Collect matched words

    # dict.fromkeys de-duplicates while keeping order; joining a set() gave an
    # ordering that could change from one interpreter run to the next.
    return emotions_found, ", ".join(dict.fromkeys(emotion_matched_words))

In [11]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Open ``pdf_path`` with fitz and return the text of all pages joined by newlines."""
    page_texts = []
    with fitz.open(pdf_path) as pdf_doc:
        # Pages are visited in document order; each contributes its "text" extraction.
        for page in pdf_doc:
            page_texts.append(page.get_text("text"))
    return "\n".join(page_texts)

# Function to extract PHP Daily Assessments with only the date
def extract_php_assessments(text):
    """Find every "PHP Daily Assessment" entry in ``text``.

    An entry begins at a header line made of the literal "PHP Daily Assessment",
    a dd/dd/dddd-style date, a dd:dd time and a two-letter AM/PM marker. Its body
    is everything after that line up to, but not including, the next line that
    looks like "<word> <word>, ACSW<digits>".

    Returns:
        A list of ``(date, body)`` string tuples; the time of day is not captured.
        The list is empty when nothing matches.
    """
    assessment_re = re.compile(
        r"PHP Daily Assessment (\d{2}/\d{2}/\d{4}) \d{2}:\d{2} [APM]{2}\n(.*?)(?=\n\w+ \w+, ACSW\d+)",
        re.DOTALL,  # let the lazy body capture span multiple lines
    )
    return assessment_re.findall(text)


In [12]:
# Function to process all PDFs in a folder
def process_pdfs_in_folder(folder_path):
    """Build a table with one row per PHP Daily Assessment found in a folder's PDFs.

    Only entries of ``folder_path`` whose name ends in ".pdf" are read. For each
    assessment extracted from a file, the row holds the file's identifier, the
    assessment date, the matched emotion/skill/support words, the craving rating,
    and one True/False flag per emotion, skill and support category.

    Args:
        folder_path: Directory containing the PDF files.

    Returns:
        A pandas DataFrame with the columns described above. It has zero rows
        when no assessment is found.
    """
    all_data = []

    # os.listdir returns names in arbitrary order; sort so the row order is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pdf"):  # Process only PDF files
            continue

        pdf_path = os.path.join(folder_path, filename)
        pdf_text = extract_text_from_pdf(pdf_path)

        # The identifier is the second "_"-separated token of the file name.
        # A name without "_" used to raise IndexError and abort the whole run;
        # fall back to the file name without its extension instead.
        name_parts = filename.split('_')
        if len(name_parts) > 1:
            group_identifier = name_parts[1].replace('.pdf', '')
        else:
            group_identifier = filename.replace('.pdf', '')

        # Extract PHP assessments
        assessments = extract_php_assessments(pdf_text)

        # Append extracted data to the list
        for date, text in assessments:
            crave_rating = extract_cravings_rating(text)  # None when no rating is present
            emotions_found, emotion_matched_words = check_emotions(text)
            skills_found, skill_matched_words = check_skills(text)
            supports_found, support_matched_words = check_supports(text)
            # The flag values are positional: they line up with the column names
            # below because both follow the key order of the keyword dictionaries.
            row_data = [group_identifier, date,
                        emotion_matched_words,
                        skill_matched_words,
                        support_matched_words,
                        crave_rating] + list(emotions_found.values()) + list(skills_found.values()) + list(supports_found.values())
            all_data.append(row_data)

    # df columns
    columns = ["group_identifier",
               "assessment_date",
               "Matched Emotion Words",
               "Match Skill Words",
               "Match Support Words",
               "Craving"] + list(emotion_keywords.keys()) + list(skills_keywords.keys()) + list(supports_keywords.keys())
    # Built once (the frame was previously constructed twice in a row).
    return pd.DataFrame(all_data, columns=columns)

In [13]:
# Run the extraction over every PDF in input_folder.
# The resulting frame has one row per assessment matched in those files.
df_php_assessments = process_pdfs_in_folder(input_folder)



In [14]:
# Display extracted data
#df_php_assessments



In [15]:
# Write the extracted table to extracted_php_assessments.csv in the current working directory,
# without the index column. This runs unconditionally whenever the cell is executed.
df_php_assessments.to_csv("extracted_php_assessments.csv", index=False)