In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer
from scipy.sparse import csr_matrix, lil_matrix

## Data

In [None]:
# Source of the CSV:
# https://github.com/SJD1882/IGR204-DataViz-Dream-Bank-Project/blob/master/notebooks/Dream_Bank_Data_Preprocessing.ipynb
df_dreams = pd.read_csv(
    "../data/dreams/dreams_syuzhet_df.csv",
)
# Preview the first rows as the cell output.
df_dreams.head()

### Dataviz

In [None]:
# Bar chart of the number of rows per distinct value of the `dreamer` column.
fig, ax = plt.subplots(figsize=(12, 6))
dreamer_counts = df_dreams["dreamer"].value_counts()
dreamer_counts.plot(kind="bar", ax=ax)
ax.set_xlabel("Dreamer")
ax.set_ylabel("Count")
ax.set_title("Dreamer Count")
plt.show()

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Histogram of per-row word counts of `text_cleaned`. Each value goes through
# str() first, so a non-string cell is counted via its string representation.
word_counts = df_dreams["text_cleaned"].apply(_count_words)
ax = word_counts.plot(kind="hist", bins=50, figsize=(10, 5))
ax.set_xlabel("Number of Words in text_cleaned")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Word Counts in text_cleaned")
plt.show()


## Model

In [None]:
# Load the tokenizer registered under the "distilbert-base-uncased" checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
    # NOTE(review): presumably intended to lift any length cap on encoded
    # texts -- confirm how the tokenizer treats this keyword.
    max_length=None,
)

# Staged

In [None]:
def create_wordByDocMatrix(l_inputs, vocab_size, verbose=False):
    """Build a binary word-by-document occurrence matrix.

    Parameters
    ----------
    l_inputs : sequence of iterables of int
        One iterable of token ids per document.
    vocab_size : int
        Number of rows of the result; every token id must lie in
        ``[0, vocab_size)``.
    verbose : bool, default False
        When true, print a progress line after each document.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix of shape ``(vocab_size, len(l_inputs))`` whose entry
        ``[t, d]`` is 1 when token id ``t`` occurs at least once in document
        ``d`` and 0 otherwise.

    Raises
    ------
    IndexError
        If a token id falls outside ``[0, vocab_size)``.
    """
    n_docs = len(l_inputs)
    rows, cols = [], []
    for doc_idx, token_ids in enumerate(l_inputs):
        # Only presence matters, so repeated ids within a document collapse here
        # instead of being counted and thresholded afterwards.
        present = sorted(set(token_ids))
        rows.extend(present)
        cols.extend([doc_idx] * len(present))
        if verbose:
            print(f"Processed {doc_idx} texts")

    rows = np.asarray(rows, dtype=np.int64)
    cols = np.asarray(cols, dtype=np.int64)
    if rows.size and (rows.min() < 0 or rows.max() >= vocab_size):
        raise IndexError(
            f"token id out of range for vocab_size={vocab_size}: "
            f"min={rows.min()}, max={rows.max()}"
        )

    # Assemble the sparse matrix straight from coordinates: no dense
    # (vocab_size x n_docs) buffer and no per-element sparse updates.
    data = np.ones(rows.shape[0], dtype=int)
    return csr_matrix((data, (rows, cols)), shape=(vocab_size, n_docs))

def create_wordByWordAdjacency(matrix_wbd, k=0):
    """Turn a word-by-document matrix into a row-normalised co-occurrence matrix.

    The co-occurrence matrix ``W = matrix_wbd @ matrix_wbd.T`` is computed in
    float32 and each row ``i`` with a non-zero diagonal is multiplied by
    ``1 / (W[i, i] + k)``. Rows whose diagonal is zero are zeroed out.

    Parameters
    ----------
    matrix_wbd : scipy sparse matrix
        Word-by-document matrix (rows are words).
    k : number, default 0
        Constant added to the diagonal before inversion.

    Returns
    -------
    scipy sparse matrix
        The scaled word-by-word matrix. The fraction of non-zero cells is
        printed as a side effect.
    """
    matrix_wbw = (matrix_wbd @ matrix_wbd.T).astype(np.float32)

    # diagonal() on a sparse matrix returns a fresh array, so it can be edited in place.
    inv_diag = matrix_wbw.diagonal()
    has_diag = inv_diag != 0
    inv_diag[has_diag] = 1 / (inv_diag[has_diag] + k)

    # Build the diagonal scaling matrix sparsely: going through np.diag would
    # allocate a dense (n_words x n_words) array first.
    kept = np.flatnonzero(inv_diag)
    n_words = matrix_wbw.shape[0]
    scaling = csr_matrix((inv_diag[kept], (kept, kept)), shape=(n_words, n_words))
    adjacency = scaling @ matrix_wbw

    n_cells = adjacency.shape[0] * adjacency.shape[1]
    # Guard the ratio so an empty vocabulary reports nan instead of dividing by zero.
    sparsity = adjacency.count_nonzero() / n_cells if n_cells else float("nan")
    print(f"Created adjacency with sparsity: {sparsity}")
    return adjacency

def preprocess_texts(l_texts, tokenizer, k=0):
    """Tokenise texts and return their word-by-word adjacency matrix.

    Each text is encoded with ``tokenizer`` (no truncation, no padding), the
    resulting id lists are turned into a word-by-document matrix with
    ``tokenizer.vocab_size`` rows, and that matrix is passed to
    ``create_wordByWordAdjacency`` together with ``k``.
    """
    token_ids_per_text = []
    for text in l_texts:
        encoded = tokenizer(text, truncation=False, padding=False)
        # Drop the first and last id of every encoding -- presumably the
        # special start/end tokens added by the tokenizer; confirm.
        token_ids_per_text.append(encoded["input_ids"][1:-1])

    word_by_doc = create_wordByDocMatrix(token_ids_per_text, tokenizer.vocab_size)
    return create_wordByWordAdjacency(word_by_doc, k=k)

def convert_toEdgeIndex(adjacency):
    """Return the (row, column) index of every stored entry of a sparse matrix.

    Parameters
    ----------
    adjacency : scipy sparse matrix
        Converted to CSR if it is not already in that format.

    Returns
    -------
    src : numpy.ndarray
        Row index of each stored entry, in CSR storage order.
    dst : numpy.ndarray
        Column index of each stored entry (the CSR ``indices`` array), aligned
        with ``src`` and with ``adjacency.data``.
    """
    adjacency = adjacency.tocsr()
    # indptr[i + 1] - indptr[i] is the number of stored entries in row i, so
    # repeating each row index that many times expands the row pointers in a
    # single vectorised pass instead of searching indptr once per entry.
    entries_per_row = np.diff(adjacency.indptr)
    src = np.repeat(np.arange(adjacency.shape[0]), entries_per_row)
    dst = adjacency.indices
    return src, dst
 

# unit tests
# Toy corpus used to sanity-check the adjacency pipeline and the edge-index conversion.
l_texts = ["chicken rice", "chicken chicken", "rice noodles", "noodles soup", "noodles rice"]

adjacency = preprocess_texts(l_texts, tokenizer, k=0)

# Keep only the vocabulary ids whose diagonal entry is non-zero, and show which
# ids / tokens those are.
mask = adjacency.diagonal() != 0
used_ids = np.nonzero(mask)
print(used_ids)
print(tokenizer.decode(used_ids[0].tolist()))
adjacency = adjacency[mask, :][:, mask]
dense_adjacency = adjacency.toarray()
print(dense_adjacency)

# Round trip: rebuilding the dense matrix from the (src, dst) edge list must
# reproduce the original matrix exactly.
M = np.zeros(dense_adjacency.shape, dtype=dense_adjacency.dtype)
src, dst = convert_toEdgeIndex(adjacency)
for row, col in zip(src, dst):
    M[row, col] = adjacency[row, col]
assert np.array_equal(M, dense_adjacency)

In [None]:
# Restrict the corpus to the rows whose `dreamer` is "dorothea".
# NOTE: this overwrites `df_dreams`, so cells run after this one only see that subset.
is_dorothea = df_dreams["dreamer"] == "dorothea"
df_dreams = df_dreams.loc[is_dorothea].reset_index(drop=True)

# Adjacency over the subset, with k=1 added to the diagonal before normalisation.
l_texts = df_dreams["text_cleaned"].tolist()
adjacency = preprocess_texts(l_texts, tokenizer, k=1)

In [None]:
# Histogram of the non-zero off-diagonal adjacency weights.
# Diagonal entries are excluded by position (row == column), not by value:
# the adjacency here is built with k=1, so a diagonal entry equals n / (n + 1)
# and never 1, and a value test such as `!= 1` would leave the diagonal in.
# With k=0 the same value test would also drop off-diagonal weights equal to 1.
adjacency_coo = adjacency.tocoo()
is_off_diagonal = adjacency_coo.row != adjacency_coo.col
off_diag_values = adjacency_coo.data[is_off_diagonal & (adjacency_coo.data != 0)]

plt.figure(figsize=(10, 5))
plt.hist(off_diag_values, bins=50, color='skyblue', edgecolor='black')
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.title("Distribution of Off-Diagonal Values in Adjacency Matrix")
plt.show()

In [None]:
# Edge list of the adjacency, ordered by descending weight, with self-loops removed.
src, dst = convert_toEdgeIndex(adjacency)
descending = np.flip(np.argsort(adjacency.data))
# Filter the ordering itself so all three arrays are indexed once, consistently.
not_self_loop = src[descending] != dst[descending]
keep = descending[not_self_loop]
src, dst, data = src[keep], dst[keep], adjacency.data[keep]
src, dst, data

In [None]:
# Print up to `max_pairs` of the highest-weighted (source, target) token pairs.
# A pair is skipped when the target is in the exclusion set or has at most two
# characters, or when either decoded token contains "#".
max_pairs = 1000
excluded_targets = {
    "go", "ask", "say", "one", "find", "get", "see", "come", "back", "take",
    "know", "think", "want", "tell", "look", "make", "like", "time", "day",
    "way", "thing", "room", "house", "bed", "sleep", "night", "door", "walk",
    "people",
}

n_printed = 0
for src_id, dst_id, weight in zip(src, dst, data):
    if n_printed >= max_pairs:
        break
    src_word = tokenizer.decode([src_id])
    dst_word = tokenizer.decode([dst_id])
    if dst_word in excluded_targets or len(dst_word) <= 2:
        continue
    if "#" in dst_word or "#" in src_word:
        continue
    print(src_word, "-", dst_word, ":", weight)
    n_printed += 1

In [None]:
def _mentions_mountain_and_mother(text):
    """Return True when both "mountain" and "mother" occur in ``text``, ignoring case."""
    lowered = text.lower()
    return "mountain" in lowered and "mother" in lowered


# Print the `content` of every row whose `text_cleaned` contains both words.
mask = df_dreams["text_cleaned"].apply(_mentions_mountain_and_mother)
for dream_text in df_dreams.loc[mask, "content"].values:
    print(dream_text)