## Installing Dependencies and Imports

In [None]:
# Colab setup: install the third-party packages used in this notebook.
# NOTE(review): no versions are pinned, so results may drift between runs.
!pip install -q kaggle datasets spacy transformers
# Download the large English spaCy pipeline that later cells load by name.
!python -m spacy download en_core_web_lg
!pip install fastcoref -q
!pip install negspacy -q
!pip install flair
!pip install pyvis

In [None]:
import os
import re
import torch
import numpy as np
import pandas as pd
import kagglehub
import spacy
from tqdm.auto import tqdm
from tqdm import tqdm
from negspacy.negation import Negex
from negspacy.termsets import termset
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from fastcoref import spacy_component
from flair.models import SequenceTagger
from flair.data import Sentence
from sklearn.decomposition import PCA
from google.colab import drive
from collections import Counter
from spacy.matcher import Matcher
from datasets import load_dataset
from google.colab import files

## Setting up Kaggle API

In [None]:
# Open the Colab upload widget; the shell lines below expect a file named kaggle.json.
uploaded = files.upload()
# Move the uploaded file into ~/.kaggle and restrict it to owner read/write.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

## Loading Dataset 1 (Hugging Face)

In [None]:
!wget https://huggingface.co/datasets/halilbabacan/autotrain-data-cognitive_distortions/resolve/main/raw/Cognitive_distortions.csv

# Read the downloaded CSV and give its two columns the names used in the rest
# of the notebook.
df1 = pd.read_csv("Cognitive_distortions.csv").rename(
    columns={'Text': 'Patient Question', 'Label': 'Dominant Distortion'}
)

# Insert two all-NaN placeholder columns at positions 1 and 3.
for position, placeholder in ((1, "Distorted part"), (3, "Secondary Distortion (Optional)")):
    df1.insert(position, placeholder, np.nan)

df1.head()

In [None]:
# Print the distinct values of each annotation column of df1, or a notice when
# the column is absent.
annotation_columns = ["Distorted part", "Dominant Distortion", "Secondary Distortion (Optional)"]
for col in annotation_columns:
  print(f"Unique values in '{col}':")
  print(df1[col].unique() if col in df1.columns else f"Column '{col}' not found in df1.")
  print("-" * 30)

## Load Dataset 2 (Kaggle)

In [None]:
# Download the Kaggle dataset through kagglehub and build the path of the
# annotated CSV inside the returned directory.
multiclass_dataset_path = kagglehub.dataset_download(
    "sagarikashreevastava/cognitive-distortion-detetction-dataset"
)
multiclass_dataset_file_path = "/".join([multiclass_dataset_path, "Annotated_data.csv"])
print("Path to dataset files:", multiclass_dataset_path)

### Cleaning & Structuring df2

In [None]:
# Load the annotated CSV and drop its identifier column in one expression.
df2 = pd.read_csv(multiclass_dataset_file_path).drop(columns='Id_Number')
df2

## Concatenate into df3

In [None]:
# Stack df1 and df2 row-wise with a fresh integer index; the bare `df3`
# expression below renders the combined frame as the cell output.
df3 = pd.concat([df1, df2], ignore_index=True)
df3

In [None]:
# Print the distinct values of each annotation column of the combined frame.
for col in ["Distorted part", "Dominant Distortion", "Secondary Distortion (Optional)"]:
  print(f"Unique values in '{col}':")
  if col in df3.columns:
    print(df3[col].unique())
  else:
    # This loop inspects df3, so the message must name df3 (it used to say df1).
    print(f"Column '{col}' not found in df3.")
  print("-" * 30)

In [None]:
# Rebuild df3 from df1 (without its two placeholder columns) and df2, rename
# the question column to "text" and expose the row position as an "id" column.
df3 = (
    pd.concat(
        [df1.drop(columns=["Distorted part", "Secondary Distortion (Optional)"]), df2],
        ignore_index=True,
    )
    .rename(columns={"Patient Question": "text"})
    .reset_index()
    .rename(columns={"index": "id"})
)

print(" Dataset 1 and Dataset 2 loaded and combined into df3.")
print(df3.head())

In [None]:
# Rename the generic 'Distortion' label in the 'Dominant Distortion' column of
# df3 to 'General Distortion'. Only cells equal to 'Distortion' are replaced.

df3['Dominant Distortion'] = df3['Dominant Distortion'].replace('Distortion', 'General Distortion')

In [None]:
# Show the distinct values of the 'Dominant Distortion' column of df3.

print(df3['Dominant Distortion'].unique())

#### Installing and Loading fastcoref


In [None]:
# Ask PyTorch whether a CUDA device is visible and report the outcome.
gpu_available = torch.cuda.is_available()

print(" GPU is available" if gpu_available else "CPU")

#### Applying Coref Resolution to Texts

In [None]:
from fastcoref import FCoref
import spacy

# Load spaCy's lg model (for dependency parsing/NER)
nlp = spacy.load("en_core_web_lg")

# Use the GPU only when one is attached; a hard-coded 'cuda' raises on
# CPU-only runtimes.
coref_device = "cuda" if torch.cuda.is_available() else "cpu"

# FCoref.predict() returns CorefResult objects whose `.text` attribute is the
# *input* text, so copying it into "resolved_text" left the text unresolved.
# The fastcoref spaCy component (registered by the notebook's
# `from fastcoref import spacy_component` import) rewrites mentions and exposes
# the result as `doc._.resolved_text` when `resolve_text` is enabled.
nlp_coref = spacy.load("en_core_web_lg", exclude=["parser", "lemmatizer", "ner", "textcat"])
nlp_coref.add_pipe("fastcoref", config={"device": coref_device})

# Process data
resolved_texts = [
    doc._.resolved_text
    for doc in nlp_coref.pipe(
        df3["text"].tolist(),
        component_cfg={"fastcoref": {"resolve_text": True}},
    )
]
df3["resolved_text"] = resolved_texts

docs = list(nlp.pipe(df3["resolved_text"], batch_size=16))

In [None]:
# Plain Python list holding the values of df3's "resolved_text" column, in row order.
texts = df3["resolved_text"].tolist()

#### Manually Verifying Coref Accuracy

In [None]:
# Draw 10 random rows (unseeded, so the selection changes between runs) and
# show the original and resolved text side by side for manual inspection.
sample_check = df3[["text", "resolved_text"]].sample(10)
display(sample_check)

# Handling Negation in Relationship Triples

In [None]:
import spacy
from tqdm import tqdm
import pandas as pd

# Initialize negation detection pipeline
nlp_neg = spacy.load("en_core_web_lg", exclude=["parser", "coref"])
nlp_neg.add_pipe("sentencizer")
nlp_neg.add_pipe("negex", last=True)

# Process texts in batches
neg_results_list = []
batch_size = 8
texts = df3["resolved_text"].tolist()

for doc in tqdm(nlp_neg.pipe(texts, batch_size=batch_size),
                total=len(texts),
                desc="Detecting negations"):
    # Entities that negex flagged as negated, as (text, label, flag) tuples.
    negated_entities = [(ent.text, ent.label_, ent._.negex)
                        for ent in doc.ents if ent._.negex]
    neg_results_list.append({
        "negated_entities": negated_entities,
        # True exactly when at least one entity was flagged.
        "has_negation": bool(negated_entities),
    })

# Add results to DataFrame.
# The columns are assigned by position instead of pd.concat(axis=1): concat
# appended a second pair of identically named columns every time the cell was
# re-run, and aligned rows on index labels rather than on processing order.
neg_results_df = pd.DataFrame(neg_results_list, columns=["negated_entities", "has_negation"])
for neg_column in neg_results_df.columns:
    df3[neg_column] = neg_results_df[neg_column].to_numpy()

# Extracting triples from text column





In [None]:
import spacy
from flair.models import SequenceTagger
from flair.data import Sentence
import pandas as pd

# Initialize models
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('sentencizer')
# Move the tagger to the GPU only when one is attached; the previous
# hard-coded .to('cuda') raised on CPU-only runtimes.
flair_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
flair_tagger = SequenceTagger.load("flair/ner-english").to(flair_device)

# Verb lemmas treated as expressing a social relation between two parties.
SOCIAL_RELATION_VERBS = {
    "like", "love", "hate", "trust", "distrust", "fear", "admire", "resent",
    "blame", "support", "oppose", "befriend", "avoid", "confide", "believe",
    "doubt", "respect", "despise", "envy", "forgive", "help", "betray", "accept",
    "teach", "raise", "parent", "marry", "date", "partner", "understand",
    "open", "connect", "relate", "abuse", "neglect", "criticize", "accuse",
    "reject", "abandon", "protect", "depend", "rely", "dislike", "miss",
    "care", "cherish"
}

# Hand-written sentences used to eyeball the extraction logic.
sample_texts = [
    "John trusts Mary but fears hospitals. He is the son of Robert.",
    "Sarah supports Emily and works at Google.",
    "The patient denies having any pain or discomfort."
]

def debug_relationship_extraction(text):
    """Print a diagnostic trace of relationship extraction for one text.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and tagged
    with ``flair_tagger``. The function then prints every token with its POS
    tag and lemma, every Flair entity span, and, for each verb whose lemma is
    in ``SOCIAL_RELATION_VERBS``, the subjects and objects returned by
    ``extract_arguments``. Nothing is returned.
    """
    doc = nlp(text)
    flair_sent = Sentence(text)
    flair_tagger.predict([flair_sent])

    print(f"\nProcessing text: {text}")

    print("\nTokens and POS tags:")
    for tok in doc:
        print(f"{tok.text} ({tok.pos_}, lemma: {tok.lemma_})")

    print("\nFlair entities:")
    for span in flair_sent.get_spans('ner'):
        print(f"{span.text} ({span.tag})")

    print("\nPotential relationships:")
    for sentence in doc.sents:
        for tok in sentence:
            # Guard clause: only verbs from the social-relation list are reported.
            if tok.pos_ != "VERB" or tok.lemma_ not in SOCIAL_RELATION_VERBS:
                continue
            print(f"\nFound relationship verb: {tok.text} (lemma: {tok.lemma_})")
            subjects = extract_arguments(tok, sentence, "subj")
            objects = extract_arguments(tok, sentence, "obj")
            print(f"Subjects: {subjects}")
            print(f"Objects: {objects}")

def extract_arguments(token, sent, arg_type):
    """Collect the texts of a verb's subject or object dependents.

    Args:
        token: The verb token; its ``children`` are inspected.
        sent: The sentence containing ``token`` (unused, kept for callers).
        arg_type: ``"subj"`` to collect ``nsubj``/``nsubjpass`` children; any
            other value collects ``dobj``/``pobj`` children plus the ``pobj``
            of every ``prep`` child (e.g. "relies on *Emily*").

    Returns:
        A list of unique argument texts in order of first appearance.
    """
    want_subjects = arg_type == "subj"
    deps = {"nsubj", "nsubjpass"} if want_subjects else {"dobj", "pobj"}

    # A dict is used as an insertion-ordered set so the output order is
    # deterministic (a plain set made it vary between runs).
    targets = {}
    for child in token.children:
        if child.dep_ in deps:
            targets[child.text] = None
        elif child.dep_ == "prep" and not want_subjects:
            # Prepositional objects are objects only; this branch previously
            # also ran for arg_type == "subj" and reported them as subjects.
            for pobj in child.children:
                if pobj.dep_ == "pobj":
                    targets[pobj.text] = None

    return list(targets)

# Run the debug printer once for every entry of sample_texts; all output goes
# to stdout and nothing is stored.
for text in sample_texts:
    debug_relationship_extraction(text)

In [None]:
from tqdm.auto import tqdm
import pandas as pd
from fastcoref import FCoref
import spacy
from flair.models import SequenceTagger
from flair.data import Sentence
from spacy.matcher import Matcher

tqdm.pandas()

nlp = spacy.load("en_core_web_lg")

# df3["resolved_text"] is already produced by the "Applying Coref Resolution to
# Texts" cell. Running a second, identical coreference pass here doubled the
# runtime and crashed on CPU-only runtimes because of the hard-coded
# device='cuda', so the existing column is reused instead of being recomputed.
if "resolved_text" not in df3.columns:
    raise RuntimeError(
        "df3['resolved_text'] is missing; run the coreference-resolution cell first."
    )

# Define the RelationshipExtractor class
class RelationshipExtractor:
    """Extract subject/relation/object triples around social-relation verbs.

    A spaCy ``Matcher`` finds verbs whose lemma is in
    ``SOCIAL_RELATION_VERBS``; the dependency children of each matched verb
    supply the subjects and objects, and every subject/object pair becomes one
    triple dictionary.
    """

    def __init__(self):
        """Load the spaCy pipeline, the Flair tagger and the verb matcher."""
        self.nlp = spacy.load("en_core_web_lg")
        self.nlp.add_pipe('sentencizer')
        # Use the GPU only when one is attached; a hard-coded 'cuda' raises on
        # CPU-only runtimes.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): the tagger is loaded but no method of this class uses it.
        self.flair_tagger = SequenceTagger.load("flair/ner-english").to(device)
        self.matcher = Matcher(self.nlp.vocab)

        self.SOCIAL_RELATION_VERBS = {
            "like", "love", "hate", "trust", "distrust", "fear", "admire", "resent",
            "blame", "support", "oppose", "befriend", "avoid", "confide", "believe",
            "doubt", "respect", "despise", "envy", "forgive", "help", "betray", "accept",
            "teach", "raise", "parent", "marry", "date", "partner", "understand",
            "open", "connect", "relate", "abuse", "neglect", "criticize", "accuse",
            "reject", "abandon", "protect", "depend", "rely", "dislike", "miss",
            "care", "cherish"
        }

        # Add a pattern for verbs that are social relation verbs
        verb_pattern = [{"POS": "VERB", "LEMMA": {"IN": list(self.SOCIAL_RELATION_VERBS)}}]
        self.matcher.add("SOCIAL_VERB", [verb_pattern])

    def extract_relationships(self, texts):
        """Return one list of triple dicts per input text.

        Args:
            texts: Iterable of strings to analyse.

        Returns:
            A list with the same length and order as ``texts``; each element
            is the (possibly empty) list of triples found in that text.
        """
        all_relationships = []
        # Docs are consumed as they stream out of nlp.pipe instead of being
        # collected in a list first, so only one batch is alive at a time.
        for doc in self.nlp.pipe(texts, disable=["ner", "textcat"]):
            relationships_in_doc = []
            for sent in doc.sents:
                # Find potential relationship verbs using the matcher
                for _match_id, start, end in self.matcher(sent):
                    # The pattern matches a single token: the verb itself.
                    verb_token = sent[start:end][0]
                    relationships_in_doc.extend(self._extract_from_verb(verb_token, sent))
            all_relationships.append(relationships_in_doc)
        return all_relationships

    def _extract_from_verb(self, verb_token, sent):
        """Build one triple per (subject, object) pair of ``verb_token``.

        Each triple carries the keys ``subject``, ``relation`` (verb lemma),
        ``object``, ``context`` (sentence text) and ``negated``.
        """
        subjects = self._find_subjects(verb_token, sent)
        objects = self._find_objects(verb_token, sent)
        # The flattening cell reads rel.get("negated", False), but the key was
        # never produced. A "neg" dependent ("not", "n't", "never") on the
        # verb marks the relation as negated.
        negated = any(child.dep_ == "neg" for child in verb_token.children)

        return [
            {
                "subject": subj.text,
                "relation": verb_token.lemma_,
                "object": obj.text,
                "context": sent.text,
                "negated": negated,
            }
            for subj in subjects
            for obj in objects
        ]

    @staticmethod
    def _with_conjuncts(heads):
        """Return ``heads`` followed by their right-hand ``conj`` dependents.

        The list is scanned while it grows so that chained conjunctions
        ("A, B and C") are followed transitively.
        """
        expanded = list(heads)
        position = 0
        while position < len(expanded):
            expanded.extend(
                right for right in expanded[position].rights if right.dep_ == "conj"
            )
            position += 1
        return expanded

    def _find_subjects(self, verb_token, sent):
        """Return the verb's ``nsubj``/``nsubjpass`` children plus their conjuncts."""
        subjects = [
            child for child in verb_token.children
            if child.dep_ in ("nsubj", "nsubjpass")
        ]
        # Handle coordinating conjunctions (e.g., "John and Mary like pizza")
        return self._with_conjuncts(subjects)

    def _find_objects(self, verb_token, sent):
        """Return direct objects, prepositional objects and their conjuncts."""
        objects = []
        for child in verb_token.children:
            if child.dep_ in ("dobj", "pobj"):
                objects.append(child)
            # Handle prepositional phrases
            elif child.dep_ == "prep":
                objects.extend(
                    grand_child for grand_child in child.children
                    if grand_child.dep_ == "pobj"
                )

        # Handle coordinating conjunctions (e.g., "John likes pizza and pasta")
        return self._with_conjuncts(objects)


extractor = RelationshipExtractor()

# Extract relationships from the resolved text.
# The previous lambda called extract_relationships() twice per row (once for
# the truthiness test, once for the value) and handed spaCy a single text per
# call. extract_relationships() already returns one list per input text, so
# the texts are passed in chunks and each one is parsed exactly once.
EXTRACTION_CHUNK_SIZE = 256
texts_to_extract = df3["resolved_text"].tolist()
relationships_per_text = []
for chunk_start in tqdm(range(0, len(texts_to_extract), EXTRACTION_CHUNK_SIZE),
                        desc="Extracting relationships"):
    chunk = texts_to_extract[chunk_start:chunk_start + EXTRACTION_CHUNK_SIZE]
    relationships_per_text.extend(extractor.extract_relationships(chunk))

df3["relationships"] = relationships_per_text

## Flattening and Integration

In [None]:
# Collect one flat record per extracted triple.
detailed_records = []

# Walk the needed df3 columns in lockstep, one tuple per row.
row_values = zip(
    df3["id"],
    df3["Dominant Distortion"],
    df3["Secondary Distortion (Optional)"],
    df3["relationships"],
)
for text_id, dominant_distortion, secondary_distortion, relationships in row_values:
    # Rows whose "relationships" cell is not a list contribute nothing.
    if not isinstance(relationships, list):
        continue
    for rel in relationships:
        subj = rel.get("subject")
        rel_verb = rel.get("relation")
        obj = rel.get("object")
        # Skip triples with a missing or empty component.
        if not (subj and rel_verb and obj):
            continue
        detailed_records.append({
            "text_id": text_id,
            "context": rel.get("context"),
            "subject": subj,
            "relation": rel_verb,
            "object": obj,
            "negated": rel.get("negated", False),
            "dominant_distortion": dominant_distortion,
            "secondary_distortion": secondary_distortion
        })

# Build the frame and, when it has rows, expose the row position as "id".
triples_df = pd.DataFrame(detailed_records)
if not triples_df.empty:
    triples_df = triples_df.reset_index().rename(columns={"index": "id"})

# Preview
print(" Extracted all relationships into triples_df")
display(triples_df.head())

## Combined Solution (Summary + Detailed Triples)

In [None]:
# 1. Create Summary DataFrame (id, text, list-of-relationships)
summary_df = df3[["id", "text"]].copy()
summary_df["list-of-relationships"] = df3["relationships"]

# 2. Create the compact (relation, subject, object) view.
# This view used to be stored in `triples_df`, overwriting the flattened frame
# built in the previous cell. That removed the `relation` column which the
# plotting and clustering cells below read, so a top-to-bottom run failed with
# a KeyError. The compact view now has its own name and `triples_df` is left
# untouched.
compact_records = []
for _, row in df3.iterrows():
    relationships = row["relationships"]
    # Same guard as the flattening cell: ignore cells that are not lists.
    if not isinstance(relationships, list):
        continue
    for relationship in relationships:
        subj = relationship.get('subject')
        rel = relationship.get('relation')
        obj = relationship.get('object')

        if subj and rel and obj:
            compact_records.append({
                "text_id": row["id"],
                "context": row["text"],  # Using full text as context
                "relationship": (rel, subj, obj)
            })

triples_compact_df = pd.DataFrame(compact_records)
if not triples_compact_df.empty:
    triples_compact_df = triples_compact_df.reset_index().rename(columns={"index": "id"})

print("Summary DataFrame:")
display(summary_df.head())

print("\nTriples DataFrame:")
display(triples_compact_df.head())

In [None]:
def _count_relationships(cell):
    """Return the number of triples in ``cell``; non-list cells count as zero."""
    if isinstance(cell, list):
        return len(cell)
    return 0

# One count per df3 row.
df3["n_relationships"] = df3["relationships"].map(_count_relationships)

### Plot Bar Chart of the Most Frequent Relations

In [None]:
# Tally every relation, order the tallies from most to least frequent and keep
# the first 20 for plotting.
relation_counts = triples_df['relation'].value_counts()
sorted_relation_counts = relation_counts.sort_values(ascending=False)
top_20_relations = sorted_relation_counts.head(20)

# Horizontal bar chart drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=top_20_relations.values,
    y=top_20_relations.index,
    orient='h',
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 20 Most Frequent Relations in triples_df', fontsize=16)
ax.set_xlabel('Number of Occurrences', fontsize=12)
ax.set_ylabel('Relation', fontsize=12)
fig.tight_layout()
plt.show()

###  Print Sample Texts Based on Triple Count

In [None]:
def _display_examples(mask, columns, n=3):
    """Display up to ``n`` random df3 rows selected by the boolean ``mask``.

    ``DataFrame.sample(3)`` raises a ValueError when fewer than three rows
    match, so the sample size is capped at the number of matching rows and an
    empty selection prints a notice instead.
    """
    subset = df3.loc[mask, columns]
    if subset.empty:
        print("(no matching rows)")
        return
    display(subset.sample(min(n, len(subset))))

# No triples extracted
print("\n No triples extracted:")
_display_examples(df3["n_relationships"] == 0, ["text", "resolved_text"])

# Exactly 1 triple
print("\n Exactly 1 triple:")
_display_examples(df3["n_relationships"] == 1, ["text", "resolved_text", "relationships"])

# Two or more triples
print("\n Multiple triples:")
_display_examples(df3["n_relationships"] >= 2, ["text", "resolved_text", "relationships"])


# Clustering Relationships by Verb Type (Using Embeddings + Clustering)

In [None]:
# Copy the relation into a helper column, keeping only string values
# (anything else becomes None).
# The column is assigned under its final name directly. The earlier
# create-'verb'-then-rename approach added a second 'relation_text' column
# whenever the cell was re-run, which broke the .unique() call below.
triples_df["relation_text"] = triples_df["relation"].apply(lambda x: x if isinstance(x, str) else None)

# Sorted list of the distinct relation verbs.
unique_verbs = sorted(triples_df["relation_text"].dropna().unique())
print(f"Found {len(unique_verbs)} unique verbs:", unique_verbs)

In [None]:
# The helper column is stored as 'relation_text', so there is no 'verb' column
# at this point and the former rename({'verb': 'relation'}) did nothing. Had
# 'verb' existed, it would have produced a second column named 'relation'.
# Only the preview is kept.
print(triples_df.head())

#### Convert Verbs to Vectors Using Pretrained Model


In [None]:
# Run each unique verb through en_core_web_lg and stack the resulting
# Doc.vector arrays, one row per verb in the order of unique_verbs.
nlp_lg = spacy.load("en_core_web_lg")
verb_docs = [nlp_lg(verb) for verb in unique_verbs]
verb_vectors = np.array([verb_doc.vector for verb_doc in verb_docs])

#### Apply Clustering (KMeans)


In [None]:
# Fit KMeans with a fixed seed and read the per-verb cluster labels.
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit(verb_vectors).labels_

# Map verbs to cluster IDs (plain Python ints)
verb_cluster_map = dict(zip(unique_verbs, (int(label) for label in clusters)))

#### Visualizing Clusters

In [None]:
# Project the verb vectors onto their first two principal components.
pca = PCA(n_components=2)
verb_vec_2d = pca.fit_transform(verb_vectors)
pc1, pc2 = verb_vec_2d[:, 0], verb_vec_2d[:, 1]

# Scatter plot coloured by cluster, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=pc1, y=pc2, hue=clusters, palette="tab10", s=100, ax=ax)

# Label every point with its verb, nudged slightly to the right.
for verb, x_pos, y_pos in zip(unique_verbs, pc1, pc2):
    ax.text(x_pos + 0.01, y_pos, verb, fontsize=9)

ax.set_title("relation Clustering with KMeans + PCA")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.grid(True)
plt.show()

#### Mapping each verb to its cluster and listing the verbs in each cluster

In [None]:
# Invert verb_cluster_map: cluster id -> verbs assigned to it, keeping the
# order in which verbs appear in the map.
cluster_to_verbs = {}
for verb, cluster_id in verb_cluster_map.items():
    cluster_to_verbs.setdefault(cluster_id, []).append(verb)

# Print the list of verbs for each cluster
print("\nRelations assigned to each cluster:")
for cluster_id in cluster_to_verbs:
    print(f"Cluster {cluster_id}: {cluster_to_verbs[cluster_id]}")

#### mapping clusters to semantic categories

In [None]:
# Define manual labels based on cluster inspection.
# NOTE(review): the ids come from the KMeans run above; re-check the labels
# whenever the clustering inputs or parameters change.
relation_cluster_labels = {
    0: "Supportive Trust",
    1: "Emotional Evaluation",
    2: "Harmful Actions",
    3: "Social Tension",
    4: "Bond Alteration"
}
# Add the cluster id and its label to each row in triples_df.
triples_df["relation_cluster"] = triples_df["relation"].map(verb_cluster_map)
# The lookup previously referenced `verb_cluster_labels`, a name that is never
# defined (NameError); the dictionary above is `relation_cluster_labels`.
triples_df["relation_cluster_label"] = triples_df["relation_cluster"].map(relation_cluster_labels)

In [None]:
# Render the first rows of triples_df as the cell output.
triples_df.head()

In [None]:
# Look up both distortion labels of each triple's source text via df3's id.
for distortion_column in ('Dominant Distortion', 'Secondary Distortion (Optional)'):
    labels_by_id = df3.set_index('id')[distortion_column]
    triples_df[distortion_column] = triples_df['text_id'].map(labels_by_id)

# Display the updated triples_df
print("Triples DataFrame with Distortion columns:")
display(triples_df.head())
print(len(triples_df))

### Saving the result

In [None]:
# Save triples_df as CSV in the "My Research" folder of the mounted Google Drive.
drive.mount('/content/drive')
research_dir = '/content/drive/My Drive/My Research'
# to_csv() does not create missing directories, so make sure the folder exists.
os.makedirs(research_dir, exist_ok=True)
triples_df.to_csv(os.path.join(research_dir, 'triples_df.csv'), index=False)
print("triples_df saved to Google Drive.")

In [None]:
# 1. Install required tools
!pip install -q nbstripout jq

# 2. Set correct paths (CHANGE THESE IF NEEDED)
input_path = "/content/drive/MyDrive/Colab Notebooks/Clustering_Relation_Verbs.ipynb"
output_path = "/content/cleaned_notebook.ipynb"
# Working copy inside /content, derived from input_path so that editing
# input_path alone is enough (the file name used to be hard-coded a second
# time in the nbconvert command).
local_copy_path = os.path.join("/content", os.path.basename(input_path))

# 3. Clean the notebook (3-step process)
# Step 1: Copy to Colab's working directory
!cp "{input_path}" "{local_copy_path}"

# Step 2: Clean outputs and widgets
!jupyter nbconvert --ClearOutputPreprocessor.enabled=True --to notebook --output temp.ipynb "{local_copy_path}"
!nbstripout temp.ipynb

# Step 3: Move to final location
!mv temp.ipynb "{output_path}"

# 4. Verify cleaning worked
!jq .metadata.widgets "{output_path}"  # Should return "null"

# 5. Download cleaned file
from google.colab import files
files.download(output_path)