In [None]:
import pandas as pd
import numpy as np
import llmcode
from IPython.display import HTML, Markdown, display
import getpass
import os
import html
import lxml
import re
import plotly.graph_objects as go
import plotly.express as px
import textwrap
import random
import math
from collections import Counter, defaultdict

Init the LLMCode library. Set the llm_API variable to "Aalto" to use Aalto's GDPR-safe Azure OpenAI API endpoints that are suitable for processing confidential data. Here, we use the OpenAI API because it is faster and makes this notebook usable for people outside Aalto.

When you run the code, it will ask you to input an appropriate API key.

In [None]:
# Choose the backend: "OpenAI", or "Aalto" for Aalto's Azure OpenAI endpoints
llm_API = "OpenAI"

# For each supported backend: the environment variable that holds the API key,
# and the wording used when asking the user for that key
api_key_settings = {
    "OpenAI": ("OPENAI_API_KEY", "an OpenAI API key"),
    "Aalto": ("AALTO_OPENAI_API_KEY", "an Aalto OpenAI API key"),
}
if llm_API not in api_key_settings:
    # Stop here instead of printing a message and still initialising llmcode
    # with an unsupported API name
    raise ValueError(f"Invalid API type: {llm_API}")

api_key_env_var, api_key_label = api_key_settings[llm_API]
if os.environ.get(api_key_env_var) is None:
    print(f"Please input {api_key_label}")
    # getpass reads the key without echoing it into the notebook output
    os.environ[api_key_env_var] = getpass.getpass()

llmcode.init(API=llm_API)

In [None]:
# Jupyter is already running an asyncio event loop, so the async OpenAI API calls
# need nest_asyncio applied before they can run from inside the notebook
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Choose the GPT model to use; it is passed as gpt_model to the llmcode coding calls
gpt_model = "gpt-4o-mini"

# Coding with LLMs

## Preparing the data

Below, first write the research question you would like to answer by analysing your dataset.

In [None]:
# Passed to the llmcode coding calls and appended to the embedding context string
research_question = "How do people experience games as art?"

Open a Word file containing texts separated by "-----" (five dashes) that are human-coded using comments. You may either provide a path to the file or set file_path to None, in which case the system will prompt you to select the file yourself.

In [None]:
file_path = "test_data/bopp_test_augmented_joel.docx"
df = llmcode.open_docx_and_process_codes(file_path)

# Limit dataset to last coded instance, in case dataset is not fully coded
def contains_coded_pattern(text):
    """Return True if text contains a **highlight**<sup>codes</sup> annotation."""
    # DOTALL lets an annotation span line breaks; non-string cells (e.g. NaN)
    # count as "not coded" instead of raising a TypeError in re.search
    return isinstance(text, str) and bool(
        re.search(r"\*\*.*?<sup>.*?</sup>", text, flags=re.DOTALL)
    )

is_coded = df['coded_text'].apply(contains_coded_pattern)
if not is_coded.any():
    # Without this guard the index maximum below is NaN and the slice fails cryptically
    raise ValueError(f"No coded instances found in {file_path}")
last_coded_index = is_coded[is_coded].index.max()
# .loc slicing is label-based and includes the end label; copy so that the
# column assignments below write to an independent DataFrame
df = df.loc[:last_coded_index].copy()
print(f"Using {len(df)} coded instances")

# Remove leading and trailing whitespace in dataset to simplify analysis
df['text'] = df['text'].str.strip()
df['coded_text'] = df['coded_text'].str.strip()

Let's inspect a sample of the texts, which are annotated using the Markdown format \*\*highlight\*\*\<sup>codes separated by a semicolon\</sup>:

In [None]:
n_samples = 5
# Cap the sample size at the dataset size: DataFrame.sample raises a ValueError
# when asked for more rows than exist (sampling is without replacement)
for _, row in df.sample(min(n_samples, len(df))).iterrows():
    display(Markdown(row.coded_text.replace("\n", "<br/>")))  # Render newlines correctly
    display(Markdown("---"))

Let's inspect the codes in the human-annotated texts:

In [None]:
def print_all_codes(df):
    """Print each code found in df.coded_text with its number of occurrences, in sorted order."""
    code_counts = Counter()
    for coded_text in df.coded_text:
        # parse_codes yields (highlight, code) pairs; only the code is tallied
        code_counts.update(code for _, code in llmcode.parse_codes(coded_text))
    for code, count in sorted(code_counts.items()):
        print(f"{code} ({count})")

# List every code in the human-annotated dataset together with its count
print("\nHUMAN-ANNOTATED CODES:\n")
print_all_codes(df)

## Basic example

Feel free to investigate how changes to the prompt affect the LLM output.

In [None]:
# TODO: this cell only defines the prompt; nothing visible in this notebook sends it
# to the LLM yet.
# The prompt asks the LLM to repeat the text, wrap coded statements in double
# asterisks and append their codes inside <sup></sup> tags.

prompt = """I will give you a game experience description from a qualitative research experiment about experiencing video games as art. 

Please carry out the following task:
- Identify and code statements about the subjective experience, if there are any. 
- Respond by repeating the original text, but highlighting the coded statements by surrounding the statements with double asterisks, as if they were bolded text in a Markdown document.
- Include the associated code(s) immediately after the statement, separated by a semicolon and enclosed in <sup></sup> tags, as if they were superscript text in a Markdown document.
- Ignore other text, e.g., text that only describes the game but not the player's subjective experience.
- Preserve exact formatting of the original text. Do not correct typos or remove unnecessary spaces."""

## Inductive coding using few-shot examples

Below, we first choose a couple of few-shot examples that we use to teach the LLM our coding style. Using a prompt and these examples, we instruct it to code the rest of the dataset. Finally, we compare the LLM-coded texts to human-coded ones.

We use the code_inductively() function from the LLMCode package, which codes a list of texts given a research question and a DataFrame containing few-shot examples. The function prompts the LLM with batches of texts, which is faster than only prompting one text at a time. The function also attempts to repair some common errors that the LLM may make, such as correcting typos in, or omitting non-coded sentences from, the original text. For further analyses, we want the LLM to preserve the exact formatting of the original text.

In [None]:
# Define number of few-shot examples
n_examples = 8

# Ensure that few-shot examples are excluded from the input texts
# (the fixed random_state keeps the selection the same on every run)
few_shot_examples = df.sample(n=n_examples, random_state=1)
# Reset the index so that row i of df_input lines up with position i in the lists of LLM-coded texts
df_input = df.drop(few_shot_examples.index).reset_index(drop=True)
few_shot_examples = few_shot_examples.reset_index(drop=True)

In [None]:
# Perform inductive coding on all texts that are not used as few-shot examples
coded_texts_ind = llmcode.code_inductively(
    texts=df_input.text.tolist(),
    research_question=research_question,
    few_shot_examples=few_shot_examples,
    gpt_model=gpt_model
)

# Compare a sample of LLM codes to human codes
def print_code_sample(df_input, llm_coded_texts, n):
    """Display randomly chosen LLM-coded texts next to their human-coded versions.

    Args:
        df_input: DataFrame whose coded_text column holds the human-coded texts.
        llm_coded_texts: LLM-coded texts, paired positionally with the rows of df_input.
        n: Number of pairs to show; capped at the number of available pairs.
    """
    pairs = list(zip(llm_coded_texts, df_input.coded_text.tolist()))
    # random.sample raises a ValueError when asked for more items than exist
    # (or for a negative count), so clamp the request to the valid range
    sample_size = max(0, min(n, len(pairs)))
    for llm_coded, human_coded in random.sample(pairs, sample_size):
        print("\nLLM CODED:")
        display(Markdown(llm_coded))
        print("\nHUMAN CODED:")
        display(Markdown(human_coded))
        display(Markdown("---"))

# Show 5 randomly chosen LLM-coded texts next to their human-coded versions
print_code_sample(df_input, coded_texts_ind, 5)

In [None]:
# Print all codes in few-shot examples, each with its number of occurrences
print("\nCODES IN FEW SHOT EXAMPLES:\n")
print_all_codes(few_shot_examples)

In [None]:
# Collect, for every code in the LLM output, the highlights it was attached to
code_highlights_ind = llmcode.get_codes_and_highlights(coded_texts_ind)

# List the LLM-generated codes in sorted order, each with its number of highlights
print("\nALL LLM-GENERATED CODES AND THEIR COUNTS:\n")
for code in sorted(code_highlights_ind):
    highlights = code_highlights_ind[code]
    print(f"{code} ({len(highlights)})")

Let's visualise the LLM codes with the help of word embeddings, which capture the meaning of words and can therefore be used to explore code similarities. Word embeddings are typically high dimensional vectors, so we will reduce the dimensionality to 2 in order to plot them in 2d. We create an interactive visualisation of the code embeddings using Plotly, so that you can hover over the plot to reveal the code name and an example of an associated highlight from the texts. The size of each marker corresponds to the number of annotations for that code in the text. Can you tell a difference between the counts of the codes included in the few-shot examples (in blue) and codes generated by the LLM (in red)?

In [None]:
# Human-assigned codes and their highlights in the input texts, used as the comparison baseline
human_code_highlights = llmcode.get_codes_and_highlights(df_input.coded_text)

# Distinct codes that occur in the few-shot examples
few_shot_codes = {
    code
    for coded_text in few_shot_examples.coded_text
    for _, code in llmcode.parse_codes(coded_text)
}

In [None]:
# A context string helps the embedding model disambiguate code labels in this specific context
embedding_context = f", in the context of the research question: {research_question}"
# Embedding model name handed to the llmcode embedding and distance helpers
embedding_model = "text-embedding-3-small"

In [None]:
def prepare_code_vis_df(code_highlights, human_code_highlights, few_shot_codes, embedding_context, embedding_model):
    # Find code embeddings for all codes
    all_codes = set(code_highlights.keys()).union(set(human_code_highlights.keys()))
    df_em = llmcode.get_2d_code_embeddings(list(all_codes), embedding_context, embedding_model)
    
    # Create DataFrame of LLM-generated codes
    df_llm = pd.DataFrame([(c,) for c in code_highlights.keys()], columns=["code"])
    df_llm["code_count"] = df_llm["code"].apply(lambda code: len(code_highlights[code]))
    df_llm["example"] = df_llm["code"].apply(lambda code: code_highlights[code][0])
    df_llm["group"] = df_llm.code.apply(lambda code: "LLM code (few-shot)" if code in few_shot_codes else "LLM code")
    
    # Create DataFrame of human-generated codes
    df_human = pd.DataFrame([(c,) for c in human_code_highlights.keys()], columns=["code"])
    df_human["code_count"] = df_human["code"].apply(lambda code: len(human_code_highlights[code]))
    df_human["example"] = df_human["code"].apply(lambda code: human_code_highlights[code][0])
    df_human["group"] = "Human code"
    
    # Concatenate code DataFrames and merge with embeddings
    df_em_codes = pd.concat([df_llm, df_human])
    df_em_codes = df_em_codes.merge(df_em, on="code", validate="many_to_one")
    return df_em_codes

In [None]:
def visualise_2d_embeddings(df_em):
    """Show an interactive 2D scatter plot of code embeddings.

    Args:
        df_em: DataFrame with the columns code, code_count, example, group,
            code_2d_0 and code_2d_1 (one row per plotted code).

    Marker size is the code count relative to the largest count, colour is the
    group, and the hover label shows the code, its count and an example highlight.
    """
    # Hover label: code and count, then the example wrapped to 60 characters.
    # Plotly's documented line-break tag is <br>.
    hover_texts = [
        f"{code} ({count})<br><br>" + '"' + "<br>".join(textwrap.wrap(example, width=60)) + '"'
        for code, count, example in zip(df_em["code"], df_em["code_count"], df_em["example"])
    ]

    # Build the plotting frame from plain lists: assigning df_em columns to a frame
    # with its own default index would align on index labels and produce NaN
    # coordinates whenever df_em does not carry a default 0..n-1 index
    df_vis = pd.DataFrame({
        "Hover": hover_texts,
        "Size": (df_em["code_count"] / df_em["code_count"].max()).tolist(),
        "x": df_em["code_2d_0"].tolist(),
        "y": df_em["code_2d_1"].tolist(),
        "Color": df_em["group"].tolist(),
    })

    # Plot the codes in 2D
    fig = px.scatter(df_vis,
                     width=1000, height=800,
                     x="x",
                     y="y",
                     size="Size",
                     color="Color",  # Set color categories
                     hover_name="Hover",
                     title="Codes Visualised in 2D")

    fig.show()

In [None]:
# Embed the inductive LLM codes together with the human codes, then plot them
df_em = prepare_code_vis_df(
    code_highlights=code_highlights_ind,
    human_code_highlights=human_code_highlights,
    few_shot_codes=few_shot_codes,
    embedding_context=embedding_context,
    embedding_model=embedding_model,
)
visualise_2d_embeddings(df_em=df_em)

## Inductive coding with code consistency

Below, we run a similar experiment to the above but with code consistency. With this setting, the texts are processed sequentially, to allow the reuse of codes between text instances instead of creating possibly redundant new and only slightly different codes for each text. The system does this by keeping track of a list of previous codes that is added as input to each prompt.

In [None]:
# Perform inductive coding with code consistency
# The second return value is used later as a mapping from code to its description
coded_texts_ind_con, code_descriptions_ind_con = llmcode.code_inductively_with_code_consistency(
    texts=df_input.text.tolist(),
    research_question=research_question,
    few_shot_examples=few_shot_examples,
    gpt_model=gpt_model
)

# Compare a random sample of 5 LLM-coded texts to their human-coded versions
print_code_sample(df_input, coded_texts_ind_con, 5)

Let's again print the list of all generated codes. The above function also generates a description for each code in order to prevent the generation of duplicate codes with the same meaning.

In [None]:
# Parse all codes and highlights in LLM output
code_highlights_ind_con = llmcode.get_codes_and_highlights(coded_texts_ind_con)

# Print all LLM-created codes with their counts and descriptions
print("\nLLM-GENERATED CODES:\n")
for code, highlights in sorted(code_highlights_ind_con.items()):
    # NOTE(review): presumably a code can appear in the coded texts without a
    # generated description; fall back to a placeholder so that one missing
    # entry does not abort the whole listing with a KeyError
    description = code_descriptions_ind_con.get(code, "(no description available)")
    print(f"{code} ({len(highlights)}): {description}")

Let's visualise the codes again. Compare these codes to the codes in the previous example, for example in terms of:
- Redundancy
- Alignment with human codes and few-shot codes
- Coverage of what you see as the most important topics
- Creativity

In [None]:
# Embed the code-consistent LLM codes together with the human codes, then plot them
df_em = prepare_code_vis_df(
    code_highlights=code_highlights_ind_con,
    human_code_highlights=human_code_highlights,
    few_shot_codes=few_shot_codes,
    embedding_context=embedding_context,
    embedding_model=embedding_model,
)
visualise_2d_embeddings(df_em=df_em)

## Deductive coding

In this section, we’ll demonstrate how to apply deductive qualitative coding using an LLM. Deductive coding involves applying predefined codes to a dataset, in this case using human-annotated examples as our codebook.

In [None]:
# Use all human-annotated codes as the codebook, listing each code only once.
# dict.fromkeys removes repeated codes while keeping the order of first appearance;
# without it the codebook holds one entry per code occurrence in the dataset.
codebook = [
    (code,)
    for code in dict.fromkeys(
        code for coded_text in df.coded_text for _, code in llmcode.parse_codes(coded_text)
    )
]

In [None]:
# Deductive coding: code the input texts using the predefined codebook
coded_texts_ded = llmcode.code_deductively(
    texts=df_input.text.tolist(),
    research_question=research_question,
    codebook=codebook,
    few_shot_examples=few_shot_examples,
    gpt_model=gpt_model
)

# Compare a random sample of 5 LLM-coded texts to their human-coded versions
print_code_sample(df_input, coded_texts_ded, 5)

In [None]:
# Collect, for every code in the deductive LLM output, the highlights it was attached to
code_highlights_ded = llmcode.get_codes_and_highlights(coded_texts_ded)

# List the codes in sorted order, each with its number of highlights
print("\nLLM-GENERATED CODES:\n")
for code in sorted(code_highlights_ded):
    highlights = code_highlights_ded[code]
    print(f"{code} ({len(highlights)})")

In [None]:
# Embed the deductive LLM codes together with the human codes, then plot them.
# An empty few_shot_codes list puts every LLM code into the plain "LLM code" group.
df_em = prepare_code_vis_df(
    code_highlights=code_highlights_ded,
    human_code_highlights=human_code_highlights,
    few_shot_codes=[],
    embedding_context=embedding_context,
    embedding_model=embedding_model,
)
visualise_2d_embeddings(df_em=df_em)

# Evaluating LLM coding

## Comparing similarity of LLM- and human-generated codes

Next, we investigate the similarity between LLM and human-generated codes. We will be comparing the results for each of the three approaches to LLM coding carried out above.

In [None]:
# Map each coding method's display name to its list of LLM-coded texts
coded_texts_by_method = {
    "Inductive": coded_texts_ind,
    "Inductive with code consistency": coded_texts_ind_con,
    "Deductive": coded_texts_ded
}

In the relevant text extraction notebook, we evaluated the output based on an IoU measure that calculates the overlap between LLM- and human-highlighted parts of the text. A similar evaluation could be carried out on the segments that are coded using methods in this notebook.

In this section, we evaluate the code labels assigned to the texts. Note that the coded parts of the text may vary between annotators – as measured by IoU – which makes a highlight-level evaluation of codes difficult. In order to address this issue, we compare the codes on a text (as opposed to highlight) level.

In [None]:
def merge_codes(coded_text):
    """Return the distinct codes in coded_text as a single "; "-separated string.

    The codes are sorted so that the same text always yields the same string.
    """
    unique_codes = {code for _, code in llmcode.parse_codes(coded_text)}
    # Joining the set directly would give an order that can change between
    # Python sessions, because string hashing is randomised per process
    return "; ".join(sorted(unique_codes))

def get_llm_and_human_codes_by_text(df_input, coded_texts):
    """
    Prepare a DataFrame containing LLM and human-generated codes for the same texts for comparison by merging all codes for each text

    Args:
        df_input: DataFrame with the columns text and coded_text (human-coded version).
        coded_texts: LLM-coded texts, paired positionally with the rows of df_input.

    Returns:
        DataFrame with the columns text, codes (LLM) and human_codes; a codes entry
        is None when the corresponding coded text is empty or missing.

    Raises:
        ValueError: if there are fewer coded texts than rows in df_input.
    """
    if len(coded_texts) < len(df_input):
        raise ValueError(
            f"Expected at least {len(df_input)} coded texts, got {len(coded_texts)}"
        )

    data = []
    # Pair rows and coded texts by position rather than by index label, so that a
    # DataFrame with a non-default index (e.g. after filtering) is still matched correctly
    for text, human_coded_text, llm_coded_text in zip(
        df_input["text"], df_input["coded_text"], coded_texts
    ):
        # For each text, extract all LLM and human-generated codes
        llm_codes = merge_codes(llm_coded_text) if llm_coded_text else None
        human_codes = merge_codes(human_coded_text) if human_coded_text else None

        data.append((text, llm_codes, human_codes))
    return pd.DataFrame(data, columns=["text", "codes", "human_codes"])

The gpt_human_code_dist() function from the LLMCode package uses text embeddings with a modified [Hausdorff distance measure](https://en.wikipedia.org/wiki/Hausdorff_distance) to evaluate the semantic similarity between the LLM and human-generated codes. Compare the average distance scores for each of the three methods (lower distance = more similar).

In [None]:
# Distance DataFrame returned by llmcode.gpt_human_code_dist, keyed by coding method name
distances_by_method = {}

for method, coded_texts in coded_texts_by_method.items():
    print(f"Method: {method}")
    
    # Combine LLM- and human-generated codes for each text in one DataFrame
    df_codes = get_llm_and_human_codes_by_text(df_input, coded_texts)

    # Calculate semantic distances using the LLMCode package
    df_dist = llmcode.gpt_human_code_dist(df_codes, embedding_context, embedding_model)
    distances_by_method[method] = df_dist

    # Report any texts that couldn't be coded by the LLM, as these are excluded from the mean calculation
    nan_count = np.isnan(df_dist.dist).sum()
    if nan_count > 0:
        print(f"WARNING: Excluding {nan_count} uncoded instances from analysis")

    # Calculate and display the average semantic distance across all (included) texts
    # np.nanmean skips the np.nan distances of non-coded texts
    avg_dist = np.nanmean(df_dist.dist)
    print(f"Average distance: {avg_dist}\n")

The output DataFrame rows are sorted from most similar to least similar. Inspect the DataFrames to observe what kind of texts the different LLM coding methods excel at and struggle with.

In [None]:
# Distance DataFrame for the plain inductive coding run
distances_by_method["Inductive"]

In [None]:
# Distance DataFrame for the inductive coding run with code consistency
distances_by_method["Inductive with code consistency"]

In [None]:
# Distance DataFrame for the deductive coding run
distances_by_method["Deductive"]

In [None]:
# TODO: Table with all results?

## Comparing diversity of LLM- and human-generated codes

Both the IoU and Hausdorff distance metrics measure how well the LLM-generated codes align with human-generated codes. However, given the subjective nature of qualitative coding, one may also be interested in comparing the diversity of useful codes generated by the LLM methods. Some of these might be new codes that do not exist in the human-annotated data. One way to carry out such an analysis is to evaluate the codes visually, using the plots we generated at the end of each coding section.

# Storing the results

Run the following code to store the coded texts of each method (and the code descriptions, if available) as CSV files in the `coding_output` directory.

In [None]:
# Gather whichever coding results exist: locals().get() yields None for a method
# whose variable is not defined, so that method is skipped instead of raising NameError
coded_texts_by_method = {
    "Inductive": locals().get("coded_texts_ind"),
    "Inductive with code consistency": locals().get("coded_texts_ind_con"),
    "Deductive": locals().get("coded_texts_ded")
}

# Create output dir (no error if it already exists)
output_dir = "coding_output"  # TODO: Prompt from user
os.makedirs(output_dir, exist_ok=True)

# Save raw and coded texts for each method
for name, coded_texts in coded_texts_by_method.items():
    if coded_texts is None:
        continue
    data = list(zip(df_input.text.tolist(), coded_texts))
    df_out = pd.DataFrame(data, columns=["text", "coded_text"])
    # Build the file name part outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a syntax error before Python 3.12
    method_slug = name.lower().replace(" ", "_")
    # Use a separate name so that file_path (the input document path) is not overwritten
    output_path = os.path.join(output_dir, f"coded_texts_{method_slug}.csv")
    df_out.to_csv(output_path, index=False)

# Save code descriptions for inductive coding with code consistency, if there are any
code_descriptions = locals().get("code_descriptions_ind_con")
if code_descriptions:
    df_out = pd.DataFrame(list(code_descriptions.items()), columns=["code", "description"])
    output_path = os.path.join(output_dir, "code_descriptions_inductive_with_code_consistency.csv")
    df_out.to_csv(output_path, index=False)