## 1. Load dependencies and data

We are using the following libraries:
- `pandas` for data manipulation
- `spacy` for natural language processing
- `nltk` for natural language processing
- `scipy` for sparse matrix handling
- `sklearn` for dimensionality reduction
- `sentence_transformers` for sentence embeddings

In [2]:
import pandas as pd
import spacy
import nltk

# Fetch the NLTK stop-word corpus (a no-op when it is already present) and
# keep the English stop words in a set for fast membership checks.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

# spaCy pipeline used to parse every explanation.
nlp = spacy.load("en_core_web_trf")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gyevn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the explanations table; with orient='index' every top-level JSON key becomes one row.
explanations = pd.read_json('explanations.json', orient='index')
# Parse every explanation text with the spaCy pipeline, keeping the row order.
docs = [parsed for parsed in nlp.pipe(explanations['Text'])]
explanations.head()

Unnamed: 0,PID,Text,QuestionType,SID
1,55b6a2e5fdf99b350d57360e,It was too far over to the left and was blocke...,Teleological,1
2,5f6f88cdb1ca47522ca4eef3,It needed to exit.,Teleological,1
3,5eb4442bcaffba2864f4c554,The car needed to get over so he moved quickly...,Teleological,1
4,60965962e523358592059dcb,"It wasn't in the correct lane, didn't get the ...",Teleological,1
5,5a440abd1ba496000102a238,It was in the wrong lane and had to make a qui...,Teleological,1


## 2. Calculate word and sentence level statistics

This includes the number of sentences, tokens, characters, and words, the minimum, mean, and maximum depth of the dependency trees, and the mean and maximum dependency length.

In [4]:
def to_nltk_tree(node) -> nltk.Tree:
    """Convert the dependency subtree rooted at ``node`` into an ``nltk.Tree``.

    Every node is labelled ``"<token text> (<dependency relation>)"``.
    A token without children is returned as that plain label string rather
    than as a ``Tree``, so callers must be prepared to receive a ``str``.
    """
    text_with_relation = f"{node.orth_} ({node.dep_})"
    has_children = node.n_lefts + node.n_rights > 0
    if not has_children:
        return text_with_relation
    subtrees = [to_nltk_tree(child) for child in node.children]
    return nltk.Tree(text_with_relation, subtrees)
    
def dependency_lengths(node):
    """Return the token distance of every dependency arc below ``node``.

    For each non-punctuation child the absolute difference between the head's
    and the child's token index is recorded, followed by the lengths found in
    that child's own subtree (pre-order). Punctuation children are skipped
    together with everything underneath them. A childless node yields ``[]``.
    """
    has_children = node.n_lefts + node.n_rights > 0
    if not has_children:
        return []

    arc_lengths = []
    for dependent in node.children:
        if dependent.is_punct:
            continue
        arc_lengths.append(abs(node.i - dependent.i))
        arc_lengths += dependency_lengths(dependent)
    return arc_lengths


In [5]:
from collections import Counter

output = []
embedding = {}  # not used by any visible cell; kept in case a later cell relies on it

# Placeholder for statistics that are undefined for a document, e.g. a one-word
# explanation has no multi-token dependency tree and no dependency arcs.
missing = float("nan")

for i, doc in enumerate(docs):
    # Materialise the sentence spans once; they are reused for every statistic below.
    sentences = list(doc.sents)

    dependency_trees = []
    dep_lens = []
    for sent in sentences:
        # Dependency tree height: single-token sentences come back as a plain
        # string (no tree) and are left out of the depth statistics.
        tree = to_nltk_tree(sent.root)
        if not isinstance(tree, str):
            dependency_trees.append(tree)
        # Dependency arc lengths for the dependency length statistics.
        dep_lens.extend(dependency_lengths(sent.root))
    tree_heights = [tree.height() for tree in dependency_trees]

    # Alphabetic, non-stop-word lemmas for the per-document word counts.
    lemmas = [token.lemma_ for token in doc if token.is_alpha and token.lemma_ not in stop_words]

    output.append({
        'NumSentences': len(sentences),
        'NumTokens': len(doc),
        'NumChars': sum(len(sent.text) for sent in sentences),
        "NumWords": sum(len(sent.text.split()) for sent in sentences),
        # Guard the aggregates: min()/max() and the mean would raise on empty input.
        "MinDepDepth": min(tree_heights) if tree_heights else missing,
        "MeanDepDepth": sum(tree_heights) / len(tree_heights) if tree_heights else missing,
        "MaxDepDepth": max(tree_heights) if tree_heights else missing,
        "MeanDepLength": sum(dep_lens) / len(dep_lens) if dep_lens else missing,
        "MaxDepLength": max(dep_lens) if dep_lens else missing,
        "DependencyTrees": [" ".join(str(tree).split()) for tree in dependency_trees],
        # `docs` was built from explanations['Text'] in row order, so look the
        # type up by position instead of assuming the index labels are 1..N.
        "Type": explanations['QuestionType'].iloc[i],
        "Text": doc.text,
        "WordCounts": dict(Counter(lemmas))
    })


Unnamed: 0,NumSentences,NumTokens,NumChars,NumWords,MinDepDepth,MeanDepDepth,MaxDepDepth,MeanDepLength,MaxDepLength,DependencyTrees,Type,Text,WordCounts
1,2,37,162,35,5,5.0,5,2.484848,10,[(was (ROOT) It (nsubj) (to (prep) (over (advm...,Teleological,It was too far over to the left and was blocke...,"{'far': 1, 'left': 1, 'block': 1, 'two': 1, 'w..."
2,1,5,18,4,3,3.0,3,1.333333,2,[(needed (ROOT) It (nsubj) (exit (xcomp) to (a...,Teleological,It needed to exit.,"{'need': 1, 'exit': 1}"
3,2,30,133,28,4,5.5,7,1.846154,6,[(needed (ROOT) (car (nsubj) The (det)) (get (...,Teleological,The car needed to get over so he moved quickly...,"{'car': 2, 'need': 1, 'get': 1, 'move': 1, 'qu..."
4,1,22,97,18,7,7.0,7,2.263158,9,[(was (ROOT) It (nsubj) n't (neg) (in (prep) (...,Teleological,"It wasn't in the correct lane, didn't get the ...","{'correct': 2, 'lane': 2, 'get': 2, 'prompt': ..."
5,1,17,69,16,5,5.0,5,2.6,6,[(was (ROOT) It (nsubj) (in (prep) (lane (pobj...,Teleological,It was in the wrong lane and had to make a qui...,"{'wrong': 1, 'lane': 2, 'make': 1, 'quick': 1,..."


In [17]:
from src.get_anthroscore import get_text_score
import tqdm

import os, sys

class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    ``sys.stdout`` is pointed at ``os.devnull`` on entry and restored on exit.
    Only ``sys.stdout`` is touched, so anything written to ``sys.stderr``
    stays visible. Exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own handle so __exit__ closes exactly the file opened here,
        # even if the wrapped code replaces sys.stdout itself.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            self._devnull.close()
        finally:
            # Always restore the original stream, even if closing fails.
            sys.stdout = self._original_stdout

# Get the AnthroScore for each agent-related term in the explanation
entities = ["car", "self-driving car", "vehicle", "agent", "it"]

# On a fresh kernel `output` is still the list of per-document dicts built by the
# feature loop, which supports neither output["Text"] nor column assignment.
# Build the DataFrame here; if `output` already is a DataFrame with this index the
# call leaves it unchanged, so the cell can be re-run safely.
output = pd.DataFrame(output, index=explanations.index)

# Silence whatever get_text_score prints to stdout; the tqdm bar is still shown.
with HiddenPrints():
    scores = [get_text_score(text, entities) for text in tqdm.tqdm(output["Text"])]
output["AnthroScore"] = scores

100%|██████████| 1308/1308 [06:06<00:00,  3.57it/s]


In [18]:
# Align the collected features with the explanations' index and persist them,
# one JSON object per index label.
output = pd.DataFrame(data=output, index=explanations.index)
output.to_json(path_or_buf="linguistic_features.json", orient="index", indent=4)
output.head(n=10)

Unnamed: 0,NumSentences,NumTokens,NumChars,NumWords,MinDepDepth,MeanDepDepth,MaxDepDepth,MeanDepLength,MaxDepLength,DependencyTrees,Type,Text,WordCounts,Scores,AnthroScore
1,2,37,162,35,5,5.0,5,2.484848,10,[(was (ROOT) It (nsubj) (to (prep) (over (advm...,Teleological,It was too far over to the left and was blocke...,"{'far': 1, 'left': 1, 'block': 1, 'two': 1, 'w...",1,-1.119358
2,1,5,18,4,3,3.0,3,1.333333,2,[(needed (ROOT) It (nsubj) (exit (xcomp) to (a...,Teleological,It needed to exit.,"{'need': 1, 'exit': 1}",1,3.246491
3,2,30,133,28,4,5.5,7,1.846154,6,[(needed (ROOT) (car (nsubj) The (det)) (get (...,Teleological,The car needed to get over so he moved quickly...,"{'car': 2, 'need': 1, 'get': 1, 'move': 1, 'qu...",1,5.815513
4,1,22,97,18,7,7.0,7,2.263158,9,[(was (ROOT) It (nsubj) n't (neg) (in (prep) (...,Teleological,"It wasn't in the correct lane, didn't get the ...","{'correct': 2, 'lane': 2, 'get': 2, 'prompt': ...",1,4.585336
5,1,17,69,16,5,5.0,5,2.6,6,[(was (ROOT) It (nsubj) (in (prep) (lane (pobj...,Teleological,It was in the wrong lane and had to make a qui...,"{'wrong': 1, 'lane': 2, 'make': 1, 'quick': 1,...",1,6.09778
6,1,9,32,7,4,4.0,4,1.714286,3,[(want (ROOT) It (nsubj) did (aux) n't (neg) (...,Teleological,It didn't want to miss the exit.,"{'want': 1, 'miss': 1, 'exit': 1}",1,6.189758
7,1,24,120,21,6,6.0,6,2.666667,7,[(determined (ROOT) (car (nsubj) The (det) blu...,Teleological,The blue self-driving car determined that the ...,"{'blue': 1, 'self': 1, 'drive': 1, 'car': 2, '...",1,-1.034934
8,1,36,155,35,10,10.0,10,2.264706,9,[(needs (ROOT) It (nsubj) (get (xcomp) to (aux...,Teleological,It needs to get to the offramp and its program...,"{'need': 1, 'get': 2, 'offramp': 1, 'programmi...",1,-2.595084
9,1,20,95,19,6,6.0,6,2.333333,8,[(took (ROOT) It (nsubj) (actions (dobj) these...,Teleological,It took these actions because the exit ramp wa...,"{'take': 1, 'action': 1, 'exit': 1, 'ramp': 1,...",1,2.295882
10,2,59,276,55,6,6.5,7,2.537037,16,[(took (ROOT) (car (nsubj) The (det) blue (amo...,Teleological,The blue car took these actions because it wan...,"{'blue': 1, 'car': 2, 'take': 1, 'action': 1, ...",1,-0.850519


## 3. Visualize sentence-level sparse and dense embeddings

For sparse embeddings, we use a TF-IDF vectorizer to create a sparse matrix of the text data. We then reduce the dimensionality of the matrix using TruncatedSVD (or PCA when the matrix has been converted to a dense array) followed by UMAP, and plot the data colored by the question type.

For dense embeddings, we use the `sentence_transformers` library to create sentence embeddings and reduce the dimensionality of the embeddings using PCA and UMAP. We then plot the data colored by the question type.

You can hover over the points to see the text of the explanation.

In [52]:
from scipy.sparse import issparse
from umap import UMAP
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sentence_transformers import SentenceTransformer

%matplotlib qt
import matplotlib.pyplot as plt
import plotly.express as px

def create_embedding(X, dense=False, random_state=None):
    """Embed texts with Sentence-BERT, or project a feature matrix down to 2D.

    Parameters
    ----------
    X : str, sequence of str, or matrix
        With ``dense=True``: the text(s) to encode. With ``dense=False``: a
        sparse or dense feature matrix with one row per sample.
    dense : bool, default False
        ``True`` encodes ``X`` with the ``all-MiniLM-L6-v2`` sentence model and
        returns the sentence embeddings without further reduction.
        ``False`` reduces ``X`` to 25 components (TruncatedSVD for sparse input,
        PCA otherwise) and then to 2 dimensions with UMAP.
    random_state : int or None, default None
        Seed forwarded to TruncatedSVD/PCA and UMAP. ``None`` keeps the
        previous unseeded behaviour; pass an integer for reproducible layouts.

    Returns
    -------
    The sentence embeddings (``dense=True``) or the 2D UMAP coordinates
    (``dense=False``).
    """
    if dense:
        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
        # Hand the encoder a plain list: a pandas Series whose index does not
        # start at 0 would otherwise be looked up by label, not by position.
        sentences = X if isinstance(X, str) else list(X)
        return sentence_model.encode(sentences)

    if issparse(X):
        reducer = TruncatedSVD(n_components=25, random_state=random_state)
    else:
        reducer = PCA(n_components=25, random_state=random_state)
    X_reduced = reducer.fit_transform(X)

    umap_model = UMAP(
        n_neighbors=15,
        n_components=2,
        min_dist=0.0,
        metric='cosine',
        random_state=random_state,
    )
    return umap_model.fit_transform(X_reduced)

In [53]:
# Build a TF-IDF document-term matrix from the explanation texts
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(output["Text"])

# Project the matrix to 2D; it is converted to a dense array first, so
# create_embedding takes its PCA branch rather than TruncatedSVD.
X_embedded = create_embedding(X.toarray())

# Scatter plot colored by the question type, with the explanation text on hover
fig = px.scatter(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    color=output['Type'],
    hover_data={"Text": output["Text"]},
)
fig.show()

In [55]:
# Build a sparse document-term matrix of raw term counts
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(output["Text"])
# Sum over all documents: a single-row frame with one column per vocabulary term
word_counts = pd.DataFrame(X.sum(axis=0), columns=vectorizer.get_feature_names_out())

# Draw a word cloud sized by the corpus-wide term frequencies
from wordcloud import WordCloud
term_frequencies = word_counts.iloc[0].to_dict()
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(term_frequencies)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Project the term counts to 2D and plot them colored by the question type
X_counts_embedded = create_embedding(X)
counts_fig = px.scatter(
    x=X_counts_embedded[:, 0],
    y=X_counts_embedded[:, 1],
    color=output['Type'],
    hover_data={"Text": output["Text"]},
)
counts_fig.show()

In [56]:
# Encode the texts with sentence-bert, then project those embeddings to 2D
embedded = create_embedding(create_embedding(output["Text"], dense=True), dense=False)

# Scatter plot colored by the question type, with the explanation text on hover
fig = px.scatter(
    x=embedded[:, 0],
    y=embedded[:, 1],
    color=output['Type'],
    hover_data={"Text": output["Text"]},
)
fig.show()

## 4. Perform sentence embedding for parse trees to compare similarities

This is done by taking the flattened tree structure of each dependency parse tree and running it through sentence-bert then applying the same dimensionality reduction and plotting as before.

In [57]:
# Each "DependencyTrees" entry is a list holding one flattened tree string per
# sentence, while the sentence encoder expects one string per input. Join the
# sentence trees so every explanation is encoded as a single text.
tree_texts = [" ".join(trees) for trees in output["DependencyTrees"]]

embedded = create_embedding(tree_texts, dense=True)
embedded = create_embedding(embedded, dense=False)
fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], color=output['Type'], hover_data={"Text": output["Text"]})
fig.show()

## 5. Topic analysis with BERTopic

In [63]:
from bertopic import BERTopic
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer()
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Use a plain list of strings for the documents: the encoder and BERTopic are
# documented to take lists of strings, and a list is always indexed by position,
# unlike a pandas Series whose index labels need not start at 0.
topic_docs = output["Text"].tolist()
embeddings = sentence_model.encode(topic_docs, show_progress_bar=False)

# Train BERTopic on the precomputed embeddings
topic_model = BERTopic(ctfidf_model=ctfidf_model).fit(topic_docs, embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(topic_docs, reduced_embeddings=reduced_embeddings)

In [64]:
# Compute the topic hierarchy from the fitted model, then render it
hierarchical_topics = topic_model.hierarchical_topics(output["Text"])
hierarchy_fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
hierarchy_fig.show()

100%|██████████| 28/28 [00:00<00:00, 285.55it/s]
