# Qualitative Data Analyzer

This notebook extracts the paragraphs of a PDF that contain a given keyword and uses the OpenAI API to classify each paragraph's tone with respect to an author statement.

In [1]:
import os
import fitz  # PyMuPDF
import csv
import pandas as pd
from openai import OpenAI
from keys import open_ai_api_key

# OpenAI client passed to the helper functions in later cells; the API key is
# imported from the local `keys` module.
client = OpenAI(api_key=open_ai_api_key)

In [2]:
import ipywidgets as widgets
from IPython.display import display

# Text inputs for the analysis; later cells read each widget's `.value`.
author_statement_widget = widgets.Text(
    description='Author Statement:',
    value='gender has an impact on the level of cybersickness experienced by users of immersive technology',
    layout=widgets.Layout(width='80%'),
)
keyword_widget = widgets.Text(description='Keyword:', value='gender')
pdf_path_widget = widgets.Text(description='PDF Path:', value='PDF 1.pdf')

# Render the inputs in the order: PDF path, keyword, author statement.
display(pdf_path_widget, keyword_widget, author_statement_widget)

ModuleNotFoundError: No module named 'ipywidgets'

In [None]:
def extract_paragraphs(text, client):
    """Split raw text into paragraphs using the OpenAI chat API.

    The model is asked to reproduce the text with a ``<P>`` marker after
    every paragraph; the streamed reply is reassembled and split on that
    marker.

    Args:
        text: Raw text to split (the notebook passes one PDF page at a time).
        client: Object exposing ``chat.completions.create`` (an OpenAI client).

    Returns:
        A list of non-empty, whitespace-stripped paragraph strings in the
        order they appear in the reply. Blank input yields an empty list
        without calling the API.
    """
    # Nothing to split: skip the API round-trip for blank input.
    if not text or not text.strip():
        return []

    prompt = (
        "Split the following text into paragraphs. "
        "After each paragraph, output the marker <P>.\n\n"
        + text
    )

    pieces = []
    with client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0
    ) as stream:
        for chunk in stream:
            choices = getattr(chunk, "choices", None)
            if not choices:
                # Some chunks carry no choices; they have no text to collect.
                continue
            content = getattr(choices[0].delta, "content", None)
            if content:
                pieces.append(content)

    # Splitting on the marker also keeps whatever follows the last "<P>", so
    # the final paragraph is not lost when the model omits the closing marker.
    reply = "".join(pieces)
    return [para for para in (part.strip() for part in reply.split("<P>")) if para]

def classify_tone(paragraph, author_statement, client):
    """Ask the OpenAI chat API to classify a paragraph's tone toward a statement.

    Args:
        paragraph: The paragraph text to classify.
        author_statement: The statement the tone is judged against.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).

    Returns:
        The model's reply with surrounding whitespace removed. The prompt asks
        for one of Supportive, Neutral or Opposing followed by an explanation.
        An empty string is returned when the reply carries no text content.
    """
    prompt = (
        f"Classify the tone of the following paragraph with respect to the statement: {author_statement}\n\n"
        "There are three possible tones:\n"
        "1. Supportive: The paragraph affirms or supports the presence of an effect, relationship, or influence of the keyword.\n"
        "2. Neutral: The paragraph mentions the keyword without taking a stance.\n"
        "3. Opposing: The paragraph indicates no effect or contradictory evidence.\n\n"
        f"Paragraph: {paragraph}\n\n"
        "Respond with: Supportive, Neutral, or Opposing, and explain why."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful academic tone analysis assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    # The message content can be None; guard so that case yields "" instead of
    # raising AttributeError on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
# Read the analysis inputs from the widgets.
pdf_path = pdf_path_widget.value
keyword = keyword_widget.value
author_statement = author_statement_widget.value

# Collect paragraphs page by page. The document is opened in a context manager
# so it is closed even if an API call fails part-way through.
all_paragraphs = []
with fitz.open(pdf_path) as doc:
    for page in doc:
        page_text = page.get_text("text")
        if not page_text.strip():
            # Blank page: nothing to split, so skip the API call.
            continue
        all_paragraphs.extend(extract_paragraphs(page_text, client))

# Case-insensitive substring match; lower-case the keyword once, not per paragraph.
keyword_lower = keyword.lower()
keyword_paragraphs = [p for p in all_paragraphs if keyword_lower in p.lower()]
print(f"Found {len(keyword_paragraphs)} paragraphs containing the keyword '{keyword}'.")

# One classification request per matching paragraph, numbered from 1.
results = []
for number, para in enumerate(keyword_paragraphs, start=1):
    results.append({
        "Paragraph #": number,
        "Keyword": keyword.capitalize(),
        "Paragraph": para,
        "Tone": classify_tone(para, author_statement, client),
    })

# Explicit columns keep the table schema stable even when nothing matched.
df = pd.DataFrame(results, columns=["Paragraph #", "Keyword", "Paragraph", "Tone"])
df

In [None]:
# Write the results table to an Excel workbook, omitting the DataFrame index.
output_file = "paragraphs_with_tone.xlsx"
df.to_excel(output_file, index=False)
print(f"Results saved to {output_file}")