In [None]:
import pandas as pd

# Load the memories table from CSV. The path is relative to the notebook's
# working directory. Later cells read the "User Message" and "user_id" columns
# from this frame and add classification columns to it.
memories = pd.read_csv("../dummy_data/chatgpt_memories_msg_id.csv")
# Sanity check: print (n_rows, n_columns) of the loaded frame.
print(memories.shape)

In [None]:
import re

# ---------------------------------------------------------------------------
# Regex vocabulary for detecting user-initiated memory requests.
#
# All fragments below are plain pattern *strings* (not compiled) so they can be
# composed with f-strings; they are searched case-insensitively by the
# classifier. Only EXCLUSION_RE is pre-compiled.
# ---------------------------------------------------------------------------

# Separator between words: whitespace, or a punctuation mark (, : ; em-dash -)
# optionally followed by whitespace.
WS = r"(?:\s+|[,:;—-]\s*)"
PRON_ASSIST = r"(?:you|u|your)"  # addressing the assistant
PRON_USER = r"(?:my|mine|me)"    # user's info

# Verbs that strongly imply persistent storage
MEM_VERBS_STRONG = r"(?:remember|memorise|memorize|store|save|retain|record|log)"
# Softer phrasing that can be explicit with future cues
MEM_VERBS_SOFT = rf"(?:keep(?:{WS}in{WS}mind)?|keep{WS}track|note|make{WS}a{WS}note)"

# Deletion/stop verbs
DEL_VERBS = rf"(?:forget|erase|delete|remove|clear|purge|wipe|stop{WS}(?:saving|storing|remembering))"

# Preference/default verbs (proxy for memory)
PREF_VERBS = r"(?:set|update|change|make|treat)"
PREF_OBJECTS = r"(?:default(?:s)?|preference(?:s)?|profile|settings?)"

# Future-oriented cues
FUTURE_CUES = rf"(?:from{WS}now{WS}on|going{WS}forward|for{WS}the{WS}future|in{WS}the{WS}future|next{WS}time|future{WS}chat(?:s)?|future{WS}conversation(?:s)?|always|by{WS}default)"

# Directive cues
DIRECTIVE_CUES = rf"(?:please|kindly|can{WS}you|could{WS}you|i'?d{WS}like{WS}you{WS}to|i{WS}want{WS}you{WS}to|make{WS}sure)"

# Noun "memory" references (e.g., add to your memory)
MEM_NOUN = rf"(?:to{WS}your{WS}memory|in{WS}your{WS}memory|into{WS}memory|add{WS}to{WS}memory)"

# Objects asked to be saved: a user pronoun followed by up to five more words,
# e.g., "my phone number", "my preference"
SAVE_OBJECT = rf"(?:{PRON_USER}(?:\s+\w+){{0,5}})"

# ---------- exclusion patterns (no memory intent) ----------
# Any hit here short-circuits classification to "none".
EXCLUSIONS = [
    rf"\bI{WS}remember\b",                              # "I remember…"
    # "I can't / cant / cannot remember…". The previous form `can(?:not|n't)`
    # only matched "cannot" and the misspelling "cann't", so "I can't remember
    # my password" slipped through and was classified as an explicit add.
    rf"\bI{WS}can(?:not|['’]?t){WS}remember\b",
    rf"\bremember{WS}when\b",                           # "remember when…"
    r"\bwhat\s+do\s+you\s+remember\b",                  # meta-questions
    # "your/the memory" followed LATER in the text by how/what/can/do, e.g.
    # "is your memory something I can edit?". Note: a question word that comes
    # BEFORE the mention ("how does your memory work?") is not matched here.
    r"\b(?:your|the)\s+memory\b.*\b(?:how|what|can|do)\b",
]

EXCLUSION_RE = re.compile("|".join(EXCLUSIONS), re.I)

# ---------- explicit add ----------
# 1) strong verb + that/this/it/"my …", optionally followed by a future cue or memory noun
# 2) soft verb that IS followed by a future cue or memory noun
# 3) "add <object> to your memory"
EXPLICIT_ADD_PATTERNS = [
    rf"\b(?:{DIRECTIVE_CUES}{WS})?(?:{MEM_VERBS_STRONG}){WS}(?:that|this|it|{SAVE_OBJECT})\b(?:.*\b(?:{FUTURE_CUES}|{MEM_NOUN})\b)?",
    rf"\b(?:{DIRECTIVE_CUES}{WS})?(?:{MEM_VERBS_SOFT})\b(?:.*\b(?:{FUTURE_CUES}|{MEM_NOUN})\b)",
    rf"\badd{WS}(?:that|this|it|{SAVE_OBJECT}){WS}to{WS}your{WS}memory\b",
]

# ---------- explicit remove ----------
EXPLICIT_REMOVE_PATTERNS = [
    rf"\b(?:{DIRECTIVE_CUES}{WS})?(?:{DEL_VERBS})\b(?:.*\b(?:memory|that|this|it|{SAVE_OBJECT})\b)",
    rf"\b(?:disable|turn off){WS}(?:memory|saving|storing)\b",
    rf"\bplease{WS}forget\b",
]

# ---------- explicit update (defaults/preferences -> persistent state) ----------
EXPLICIT_UPDATE_PATTERNS = [
    rf"\b(?:{DIRECTIVE_CUES}{WS})?(?:{PREF_VERBS}){WS}(?:my{WS})?(?:{PREF_OBJECTS})\b",
    rf"\b(?:{DIRECTIVE_CUES}{WS})?(?:{PREF_VERBS}){WS}(?:{PRON_ASSIST}{WS})?(?:to|as){WS}(?:my{WS})?(?:default|preference)\b",
    rf"\bfrom{WS}now{WS}on\b.*\b(?:use|treat|assume|reply|answer)\b",
]

# ---------- ambiguous (soft cues without future cue) ----------
AMBIGUOUS_PATTERNS = [
    rf"\b(?:{DIRECTIVE_CUES}{WS})?(?:keep{WS}in{WS}mind|note{WS}that)\b(?!.*\b{FUTURE_CUES}\b)",
    rf"\b(?:it'?d|it{WS}would){WS}be{WS}(?:helpful|great){WS}if{WS}you\b.*\b(?:remember|keep{WS}in{WS}mind|note)\b(?!.*\b{FUTURE_CUES}\b)",
    # Fallback: a bare mention of a memory word that no explicit pattern (and
    # no exclusion) caught.
    r"\bmemory\b",
    r"\bremember\b",
    # Accept both spellings, consistent with MEM_VERBS_STRONG.
    r"\bmemori[sz]e\b",
]

# Pattern lists keyed by intent label. Values stay uncompiled strings because
# the classifier passes them to re.search together with the re.I flag.
PATTERNS_BY_INTENT = {
    "explicit_remove": EXPLICIT_REMOVE_PATTERNS,
    "explicit_update": EXPLICIT_UPDATE_PATTERNS,
    "explicit_add": EXPLICIT_ADD_PATTERNS,
    "ambiguous": AMBIGUOUS_PATTERNS,
}


def classify_memory_intent_and_snippet(user_text: str):
    """
    Classify a user message by memory intent and return the text that matched.

    Parameters
    ----------
    user_text : str
        The raw user message. Anything that is not a non-empty string
        (None, "", or a non-string cell such as the NaN pandas produces for
        an empty CSV field) is treated as carrying no intent.

    Returns
    -------
    (intent, snippet) : tuple
        ``intent`` is one of "explicit_remove", "explicit_update",
        "explicit_add", "ambiguous" or "none". ``snippet`` is the matched
        substring, or None when the intent is "none". Matches shorter than
        10 characters are widened with up to 5 whitespace-separated words of
        context on each side.
    """
    # Guard first: re.search raises TypeError on non-string input (e.g. NaN).
    if not isinstance(user_text, str) or not user_text:
        return "none", None
    if EXCLUSION_RE.search(user_text):
        return "none", None

    # Order: removals > updates > adds > ambiguous (first hit wins)
    for intent in ("explicit_remove", "explicit_update", "explicit_add", "ambiguous"):
        for pattern in PATTERNS_BY_INTENT[intent]:
            m = re.search(pattern, user_text, re.I)
            if m is None:
                continue

            snippet = m.group(0)
            if len(snippet) < 10:
                # Very short match: add surrounding words for context. Use the
                # match's own offsets rather than re-searching the lowercased
                # text, which could land on an earlier substring occurrence or
                # drift when lowercasing changes the string length.
                before = user_text[:m.start()].split()[-5:]
                after = user_text[m.end():].split()[:5]
                snippet = " ".join(before + [snippet] + after)
            return intent, snippet
    return "none", None

# Apply to dataframe.
# Classify every message once and build the two result columns in a single
# DataFrame on the original index, instead of constructing one pd.Series per
# row inside .apply (which is much slower on large frames).
intent_pairs = [
    classify_memory_intent_and_snippet(message)
    for message in memories["User Message"]
]
memories[["memory_intent", "memory_intent_snippet"]] = pd.DataFrame(
    intent_pairs,
    index=memories.index,
    columns=["memory_intent", "memory_intent_snippet"],
)
# Cell output: number of messages per intent label.
memories["memory_intent"].value_counts()


In [None]:
import seaborn as sns

# Global look for all figures: seaborn theme first, then matplotlib overrides
sns.set_theme(style="whitegrid", context="paper", font_scale=1.2)
sns.set_palette("colorblind")  # paper palette, colorblind-friendly

# Matplotlib settings, applied in one batch on top of the seaborn theme
import matplotlib as mpl

mpl.rcParams.update({
    # axes text
    'axes.labelweight': 'bold',
    'axes.titlesize': 'large',
    'axes.labelsize': 'x-large',
    # ticks and legend
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
    'legend.fontsize': 'large',
    # resolution: on-screen vs. saved files
    'figure.dpi': 100,
    'savefig.dpi': 300,
    # font stack, tried in order
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif'],
})


In [None]:
from matplotlib import pyplot as plt

# Memory-level distribution: label each row "User" when a memory intent was
# detected in the message (memory_intent != "none") and "ChatGPT" otherwise.
#
# value_counts() orders by frequency, so the bar order (and therefore which
# colour each category gets) would depend on the data. Pin an explicit order
# and a label -> colour mapping so this figure always matches the per-user
# figure (ChatGPT grey, User blue) and a category with zero rows still shows.
initiator_order = ["ChatGPT", "User"]
initiator_colors = {"ChatGPT": "#6E7B8B", "User": "#4C72B0"}

intent_presence_counts = (
    memories["memory_intent"]
    .apply(lambda x: "User" if x != "none" else "ChatGPT")
    .value_counts()
    .reindex(initiator_order, fill_value=0)
)
intent_presence_percent = intent_presence_counts / intent_presence_counts.sum() * 100

# Barplot 1: share of memories per initiator
fig1, ax1 = plt.subplots(figsize=(4, 3))
bars1 = ax1.bar(
    intent_presence_percent.index,
    intent_presence_percent.values,
    color=[initiator_colors[name] for name in intent_presence_percent.index],
)
ax1.set_ylabel("Percentage")
ax1.set_ylim(0, 110)  # headroom above 100% for the two-line bar labels
ax1.tick_params(axis='x', labelsize=16)

# Add percentage and absolute number labels on top of bars
for bar, percent, count in zip(
    bars1,
    intent_presence_percent.values,
    intent_presence_counts.values,
):
    ax1.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 1,
        f"{percent:.1f}%\n(n={count})",
        ha='center', va='bottom', fontsize=12, fontweight='bold'
    )

plt.tight_layout()
plt.savefig("explicit_vs_not.pdf")


# --- Number of users with at least one user-initiated memory ---

# One boolean per user_id: True when any of that user's rows has a
# memory_intent other than "none".
user_has_initiated = (
    memories.groupby("user_id")["memory_intent"]
    .apply(lambda intents: intents.ne("none").any())
)
num_users_total = len(user_has_initiated)
num_users_with_initiated = user_has_initiated.sum()
num_users_not_initiated = num_users_total - num_users_with_initiated

# Bar heights, in the same order as the x labels below
user_group_counts = [num_users_not_initiated, num_users_with_initiated]

fig3, ax3 = plt.subplots(figsize=(4, 3))
bars3 = ax3.bar(
    ["ChatGPT", "User"],
    user_group_counts,
    color=["#6E7B8B", "#4C72B0"],
)
ax3.set_ylabel("Number of Users")
ax3.set_ylim(0, num_users_total + 2)  # leave room for the labels above the bars
ax3.tick_params(axis='x', labelsize=16)

# Annotate every bar with its share of all users and the absolute count
for rect, n_users in zip(bars3, user_group_counts):
    share = n_users / num_users_total * 100
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.5
    ax3.text(
        label_x,
        label_y,
        f"{share:.1f}%\n(n={n_users})",
        ha='center', va='bottom', fontsize=12, fontweight='bold'
    )

plt.tight_layout()
plt.savefig("users_with_initiated_memory.pdf")
