# Power & Language - Topic Model experiments

- [Data @OSF](https://osf.io/dwnxt/?view_only=e75faa4f54244361aa198e257b4fecf9)

## LWIC (write and load)

In [1]:
fn_lwic = "LIWC_German/LIWC_German.dic"

In [2]:
def load_lwic(fn):
    in_header = True
    categories = dict()
    word2cats = dict()
    cat2words = dict()
    with open(fn, "r") as fp:
        fp.readline()
        for line in fp:
            if line.startswith("%"):
                in_header = False
                continue

            if in_header:
                num, name = line.rstrip().split("\t")
                categories[int(num)] = name

            else:
                word, *cats = line.rstrip().split("\t")
                cats = list(map(int, cats))
                word2cats[word] = cats
                for cat in cats:
                    try:
                        cat2words[cat].append(word)
                    except KeyError:
                        cat2words[cat] = [word]

    return categories, word2cats, cat2words

In [3]:
categories, word2cats, cat2words = load_lwic(fn_lwic)
categories

{1: 'Pronoun',
 2: 'I',
 3: 'We',
 4: 'Self',
 5: 'You',
 6: 'Other',
 7: 'Negate',
 8: 'Assent',
 9: 'Article',
 10: 'Preps',
 11: 'Numbers',
 12: 'Affect',
 13: 'Positiveemotion',
 14: 'Positivefeeling',
 15: 'Optimism',
 16: 'Negativeemotion',
 17: 'Anxiety',
 18: 'Anger',
 19: 'Sad',
 20: 'Cognitivemechanism',
 21: 'Cause',
 22: 'Insight',
 23: 'Discrepancy',
 24: 'Inhibition',
 25: 'Tentative',
 26: 'Certain',
 31: 'Social',
 32: 'Communication',
 33: 'Otherreference',
 34: 'Friends',
 35: 'Family',
 36: 'Humans',
 37: 'Time',
 38: 'Past',
 39: 'Present',
 40: 'Future',
 41: 'Space',
 42: 'Up',
 43: 'Down',
 44: 'Incl',
 45: 'Excl',
 46: 'Motion',
 47: 'Occup',
 48: 'School',
 49: 'Job',
 50: 'Achieve',
 51: 'Leisure',
 52: 'Home',
 53: 'Sports',
 54: 'TV',
 55: 'Music',
 56: 'Money',
 57: 'Metaph',
 58: 'Relig',
 59: 'Death',
 60: 'Physical',
 61: 'Body',
 62: 'Sex',
 63: 'Eat',
 64: 'Sleep',
 65: 'Grooming',
 66: 'Swear',
 67: 'Nonfluency',
 68: 'Fillers'}

## Setup

In [4]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

### Download data

In [None]:
#import fasttext.util
#fasttext.util.download_model('de') #, if_exists='ignore')  # German

In [None]:
! python -m spacy download de_core_news_sm
#! python -m spacy download de_dep_news_trf

In [None]:
import nltk
nltk.download('stopwords')

## Env Imports

In [None]:
import re

import numpy as np
import pandas as pd

import nltk
import spacy

In [None]:
import spacy
    
import de_core_news_sm
#import de_dep_news_trf

nlp = de_core_news_sm.load()
#nlp = de_dep_news_trf.load()

#nlp = spacy.load("de_core_news_sm")
#nlp = spacy.load("de_dep_news_trf")

In [None]:
import fasttext

ft = fasttext.load_model('cc.de.300.bin')

## Load study data

In [10]:
def load_study1():
    df_study1 = pd.read_excel("Study 1/Data Study 1.xlsx")
    #print(df_study1.columns.to_list())

    # just keep useful columns
    df_study1 = df_study1[[
        # id
        "ID",
        # raw text
        "SourceB",
        # other meta
        "Alter", "Geschlecht",
        # self-evaluation (mean)
        "Power_mean", "Dom_mean", "Pres_mean",
        # outside-evaluation (mean)
        "Power_F", "Dom_F", "Pres_F"
    ]]

    # rename columns
    df_study1.rename(columns={
        "SourceB": "text", "Alter": "age", "Geschlecht": "gender",
        "Power_mean": "power", "Dom_mean": "dominance", "Pres_mean": "prestige",
        "Power_F": "power_f", "Dom_F": "dominance_f", "Pres_F": "prestige_f",
    }, inplace=True)
    
    return df_study1


def load_study2():
    df_study2 = pd.read_excel("Study 2/Data Study 2.xlsx")
    #print(df_study2.columns.to_list())

    # just keep useful columns
    df_study2 = df_study2[[
        # id
        "ID",
        # raw text
        "SourceA",
        # other meta
        "Alter", "Geschlecht",
        # self-evaluation (mean)
        "Power_means", "Dominanz_means", "Prestige_means",
        # outside-evaluation (mean)
        "Power_Fremdgesamt_means", "Dominanz_Fremdgesamt_means", "Prestige_Fremdgesamt_means",
        # WP?
        #"WP_means",, "WP_Fremdgesamt_means",
    ]]

    # rename columns
    df_study2.rename(columns={
        "SourceA": "text", "Alter": "age", "Geschlecht": "gender",
        "Power_means": "power", "Dominanz_means": "dominance", "Prestige_means": "prestige",
        "Power_Fremdgesamt_means": "power_f", "Dominanz_Fremdgesamt_means": "dominance_f", "Prestige_Fremdgesamt_means": "prestige_f",
        "WP_means": "wp", "WP_Fremdgesamt_means": "wp_f",
    }, inplace=True)

    return df_study2

In [None]:
#df_study1 = load_study1()
#df_study1.head()

In [None]:
#df_study2 = load_study2()
#df_study2.head()

## Cleanup data

In [11]:
def nlpize(df, nlp_fn):
    return df.map(nlp_fn)


def clean(df, stopwords=False, alpha=False, punctuation=True):
    # filter out stopwords
    if stopwords:
        df = df.map(lambda doc: list(filter(lambda tok: not tok.is_stop, doc)))

    # filter alphanumerical
    if alpha:
        df = df.map(lambda doc: list(filter(lambda tok: tok.is_alpha, doc)))

    # filter out punctuation
    if punctuation:
        df = df.map(lambda doc: list(filter(lambda tok: tok.pos_ not in ("PUNCT"), doc)))

    return df


def remove_punct(df):
    return df.map(lambda x: re.sub(r"[,\.!?]", '', x))


def lowercase_text(df):
    return df.map(lambda x: x.lower())


def get_text_by_pos(df, pos_list=("NOUN",), lemma=False, join=True):
    # filter each token by correct pos tag
    if pos_list:
      df = df.map(lambda x: list(filter(lambda tok: tok.pos_ in pos_list, x)))

    # convert tokens back to strings
    #df = df.map(lambda x: " ".join(map(str, x)))
    if lemma:
        df = df.map(lambda x: " ".join(map(lambda tok: tok.lemma_, x)))
    else:
        df = df.map(lambda x: " ".join(map(lambda tok: tok.text, x)))
    
    # concat to single text
    if not join:
        return df

    return ','.join(df.values.tolist())

## Analyze

### Load and preprocess data

In [12]:
# load data
df_study1 = load_study1()
df_study2 = load_study2()

In [13]:
%%time
#%%script false --no-raise-error
# tokenize, postag, ...
df_study1["text_spacy_doc"] = nlpize(df_study1["text"], nlp)
df_study2["text_spacy_doc"] = nlpize(df_study2["text"], nlp)

CPU times: user 10.9 s, sys: 0 ns, total: 10.9 s
Wall time: 25.3 s


In [14]:
%%time
# remove punctuation
do_clean_stopwords = True
df_study1["text_spacy_doc_filtered"] = clean(df_study1["text_spacy_doc"], stopwords=do_clean_stopwords, alpha=False, punctuation=True)
df_study2["text_spacy_doc_filtered"] = clean(df_study2["text_spacy_doc"], stopwords=do_clean_stopwords, alpha=False, punctuation=True)

# take raw text `tok.text` instead of lemma `tok.lemma_`
df_study1["tokens"] = df_study1["text_spacy_doc_filtered"].map(lambda doc: list(map(lambda tok: tok.text, doc)))
df_study2["tokens"] = df_study2["text_spacy_doc_filtered"].map(lambda doc: list(map(lambda tok: tok.text, doc)))
# convert to plain string
df_study1["tokens"] = df_study1["tokens"].map(lambda doc: list(map(str, doc)))
df_study2["tokens"] = df_study2["tokens"].map(lambda doc: list(map(str, doc)))

# concat both studies
#docs = pd.concat([df_study1["tokens"], df_study2["tokens"]], ignore_index=True)
#docs_raw = docs.map(lambda doc: " ".join(doc))

CPU times: user 47.3 ms, sys: 0 ns, total: 47.3 ms
Wall time: 37.1 ms


### Generate lwic cluster embeddings

In [15]:
categories, word2cats, cat2words = load_lwic(fn_lwic)

In [16]:
%%time
# generate mean vectors based on category words
cat2emb = dict()

for cat, words in cat2words.items():
    vectors = list()
    weights = [1.0] * len(words)
    for word, weight in zip(words, weights):
        if word.endswith("*"):
            word = word[:-1]
        emb = ft.get_word_vector(word)
        #vectors.append([word, weight, emb])
        vectors.append(emb)
    cat_emb = np.array(vectors).mean(axis=0, dtype=np.float64)
    cat2emb[cat] = cat_emb

CPU times: user 38.1 s, sys: 0 ns, total: 38.1 s
Wall time: 38.1 s


In [17]:
import numpy as np
from gensim import matutils


def compute_most_similar(vector_1, vectors_all, topn=10):
    """Compute most similar for `vector_1` to `vectors_all`. Normalizes the
    vectors and computes the cosine (dot product).
    Return the `topn` best matches."""
    # from most_similar(..) L491 (Word2Vec gensim)
    # assumes vectors_all is normalized
    # mean = matutils.unitvec(np.array([vector_1]).mean(axis=0)).astype(np.float32)
    # distances = np.dot(vectors_all[0:len(vectors_all)], mean)

    # from cosine_similarities(..) L883 (Word2Vec gensim)
    # computes norms on the fly and normalizes (L2) both inputs
    norm = np.linalg.norm(vector_1)
    all_norms = np.linalg.norm(vectors_all, axis=1)
    dot_products = np.dot(vectors_all, vector_1)
    distances = dot_products / (norm * all_norms)

    # from most_similar
    best = matutils.argsort(distances, topn=topn + len(vectors_all), reverse=True)
    result = [(sim_idx, float(distances[sim_idx])) for sim_idx in best]
    return result[:topn]


def compute_most_similar_labeled_base(vector, vectors_all, labels, topn=10):
    """Transform most similar result from indices to vector labels
    (i. e. words)."""
    results = compute_most_similar(vector, vectors_all, topn=topn)
    results = [(labels[i], sim) for i, sim in results]
    return results

In [18]:
from collections import Counter

num_vec_sims = 5
cat_labels, cat_embs = zip(*cat2emb.items())

num_found = num_inferred = 0

def map_tokens2category(tokens):
    global num_found, num_inferred
    label_cnt = Counter()
    for word in tokens:
        word = word.lower()
        if word in word2cats:
            label_cnt.update(word2cats[word])
            num_found += 1
            continue

        emb = ft.get_word_vector(word)
        num_inferred += 1
        labels = compute_most_similar_labeled_base(emb, list(cat_embs), cat_labels, topn=num_vec_sims)
        label_cnt.update([l for l, _ in labels])

    top_labels = label_cnt.most_common(10)
    top_labels = sorted(top_labels, key=lambda x: (x[1], x[0]))
    top_labels = [(categories[ln], cnt) for ln, cnt in top_labels]
    #top_labels = [ln for ln, _ in top_labels]
    return top_labels


#map_tokens2category(df_study1["tokens"].iloc[0])

In [19]:
%%time
num_found = num_inferred = 0  # reset counter

df_study1["cats"] = df_study1["tokens"].map(map_tokens2category)
df_study1["cats_plain"] = df_study1["cats"].map(lambda x: [n for n, _ in x])
print(f"study1: {num_found=}, {num_inferred=}")
num_found = num_inferred = 0

df_study2["cats"] = df_study2["tokens"].map(map_tokens2category)
df_study2["cats_plain"] = df_study2["cats"].map(lambda x: [n for n, _ in x])
print(f"study2: {num_found=}, {num_inferred=}")
num_found = num_inferred = 0

df_study1["cats_plain"]

  distances = dot_products / (norm * all_norms)


study1: num_found=633, num_inferred=3746


  distances = dot_products / (norm * all_norms)


study2: num_found=560, num_inferred=4886
CPU times: user 59.9 s, sys: 8min 37s, total: 9min 37s
Wall time: 1min 6s


0      [Anxiety, Time, Cognitivemechanism, Excl, Opti...
1      [Anxiety, Friends, Metaph, Optimism, Job, Affe...
2      [Sex, Family, Present, School, Job, Leisure, T...
3      [Positiveemotion, Positivefeeling, Cognitiveme...
4      [Death, Positivefeeling, Cognitivemechanism, C...
                             ...                        
195    [Positivefeeling, Friends, Occup, School, Leis...
196    [Humans, Positivefeeling, Cognitivemechanism, ...
197    [Anger, Metaph, Negativeemotion, Certain, Occu...
198    [Preps, Occup, School, Affect, Positivefeeling...
199    [Preps, Affect, Positiveemotion, Anxiety, TV, ...
Name: cats_plain, Length: 200, dtype: object

In [29]:
df_study1[["text", "cats_plain"]].iloc[0]

text          Ich würde mich selbst als fleißig, disziplinie...
cats_plain    [Anxiety, Time, Cognitivemechanism, Excl, Opti...
Name: 0, dtype: object

### Cluster?

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [21]:
docs = df_study1["cats_plain"].map(lambda x: " ".join(x))

In [22]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

In [23]:
X.todense().shape  # number samples x vocabulary
# vectorizer.vocabulary_

(200, 59)

In [24]:
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

df_study1["cats_plain_cluster"] = model.predict(X)
#df_study1["cats_plain_cluster"]

print("distribution")
print(df_study1["cats_plain_cluster"].value_counts())

#df_study1[df_study1["cats_plain_cluster"] == 1]
#df_study1["cats_plain_cluster"].value_counts()

#check word freq for cluster==1
#df[df.cluster==1].words.value_counts()

distribution
1    37
5    27
7    23
3    22
0    20
9    18
4    17
6    14
2    12
8    10
Name: cats_plain_cluster, dtype: int64


## Other stuff