In [1]:
import collections
import itertools
import logging
import os.path
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Configure the root logger once for the whole notebook: INFO and above,
# printed as "[LEVEL] logger-name: message".
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(name)s: %(message)s")

import data

  from .autonotebook import tqdm as notebook_tqdm


### Prepare data

In [2]:
# Cache the prepared study data on disk so that re-running the notebook can
# reuse it instead of calling data.prepare_study_data() again.
fn_study_prepared = "studydata.pickle"

if not os.path.exists(fn_study_prepared):
    df_study1, df_study2 = data.prepare_study_data()

    # Write to a temporary file and rename it into place afterwards, so an
    # interrupted run never leaves a truncated cache behind (which would make
    # every later load fail until the file is removed by hand).
    fn_study_tmp = fn_study_prepared + ".tmp"
    with open(fn_study_tmp, "wb") as fp:
        # Two consecutive pickles in one file: study 1 first, then study 2.
        pickle.dump(df_study1, fp, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(df_study2, fp, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(fn_study_tmp, fn_study_prepared)

# NOTE: unpickling can execute arbitrary code; only load a cache file that
# this notebook produced itself.
with open(fn_study_prepared, "rb") as fp:
    # Read back in the same order as written above.
    df_study1 = pickle.load(fp)
    df_study2 = pickle.load(fp)

# Cleaning is always applied to the (cached) prepared frames.
df_study1, df_study2 = data.clean_study_data(df_study1, df_study2)

[INFO] data: Clean study data ...


### Distribution of targets

In [None]:
# One histogram per study-1 target column (plain scores and their "_f" variants).
df_study1[
    ["power", "dominance", "prestige", "power_f", "dominance_f", "prestige_f"]
].hist()

In [None]:
# One histogram per study-2 target column; study 2 additionally has
# "workplace_power" (plain and "_f" variant).
df_study2[
    [
        "power", "dominance", "prestige", "workplace_power",
        "power_f", "dominance_f", "prestige_f", "workplace_power_f",
    ]
].hist()

In [None]:
what = "power"
target_scores = df_study1[what]

# Value range of the selected target score
print(what, end=":\n")
print("  min=", target_scores.min())
print("  max=", target_scores.max())

# Quantile boundaries (low / mid / high) of the three plain targets;
# the resulting frame is the cell's displayed output.
tercile_probs = [0, 1/3, 2/3, 3/3]
df_study1[["power", "dominance", "prestige"]].quantile(tercile_probs)

In [None]:
what = "power"

# Histogram of the selected score with its 1/3 and 2/3 quantiles marked.
# Work on the Axes returned by Series.hist() instead of pyplot's implicit
# "current axes" so the cell does not depend on global plotting state.
ax = df_study1[what].hist()
_, max_ylim = ax.get_ylim()

q33, q66 = (df_study1[what].quantile(prob) for prob in (1/3, 2/3))
for q in (q33, q66):
    ax.axvline(q, color='k', linestyle='dashed', linewidth=1)
    # Value label just right of the marker line, near the top of the plot.
    ax.text(q*1.01, max_ylim*0.9, '{:.2f}'.format(q))

ax.set_title("Distribution of '{}' with 1/3 and 2/3 quantiles".format(what))
ax.set_xlabel(what)
# Render the figure explicitly so no Text(...) repr is echoed as cell output.
plt.show()

In [None]:
# logging.getLogger().setLevel(logging.DEBUG)
# _ = data.get_lmh_quantiles_mask(df_study1, "power")

In [None]:
# 1/3 and 2/3 quantiles of every study-1 target column
df_study1_quants = df_study1[
    ["power", "dominance", "prestige", "power_f", "dominance_f", "prestige_f"]
].quantile([1/3, 2/3])
df_study1_quants

In [None]:
# 1/3 and 2/3 quantiles of every study-2 target column
df_study2_quants = df_study2[
    [
        "power", "dominance", "prestige", "workplace_power",
        "power_f", "dominance_f", "prestige_f", "workplace_power_f",
    ]
].quantile([1/3, 2/3])
df_study2_quants

### Score comparisons

In [None]:
# Part-of-speech tags to include (Universal Dependencies tag set,
# https://universaldependencies.org/u/pos/).
# Alternatives tried: ("ADJ",), ("ADV",), ("VERB",)
pos = ("NOUN", "PROPN")
lemma = True
relative = False
# Require at least 10 occurrences (summed); for ADJ a lower value such as 5
# was used because fewer words are available.
total_occ_min = 10

pos_label = ", ".join(pos)
for what in ("power", "dominance", "prestige"):
    # One horizontal bar chart per target score.
    df_lmh = data.make_word_freq_score_lmh_comparison_df(
        df_study1,
        what,
        pos=pos,
        lemma=lemma,
        relative=relative,
        total_occ_min=total_occ_min,
    )
    df_lmh.plot(kind="barh")
    plt.title(f"Words for '{what.title()}' for {pos_label}")

In [None]:
# Quantile band to inspect. Exactly one setting is active; the alternatives
# are (0., 1/3) for the lowest band and (2/3, 1.) for the highest band.
range_ = (1/3, 2/3)

# Score columns to compare against each other. Alternatives:
#   ("power", "dominance", "prestige")
#   ("power", "dominance", "prestige", "power_f", "dominance_f", "prestige_f")
whats = ("power_f", "dominance_f", "prestige_f")
relative = True

# NOTE: `pos`, `lemma` and `total_occ_min` are not set here; they come from
# the settings of the preceding score-comparison cell.
df_h_pdp = data.make_word_freq_score_pdp_comparison_df(
    df_study1,
    whats=whats,
    range_=range_,
    pos=pos,
    lemma=lemma,
    relative=relative,
    total_occ_min=total_occ_min,
)
df_h_pdp.plot(kind="barh")
plt.title("Words for quantile {:.2f}-{:.2f} for {}".format(*range_, ", ".join(pos)))

In [None]:
# Compare each plain score with its "_f" counterpart within the top band.
range_ = (2/3, 1.)
relative = True
whatss = [("power", "power_f"), ("dominance", "dominance_f"), ("prestige", "prestige_f")]

# Part-of-speech filter. Exactly one setting is active; alternatives:
# ("NOUN", "PROPN"), ("ADJ",), ("ADV",)
pos = ("VERB",)
# Require at least 10 occurrences (summed); for ADJ a lower value such as 5
# was used because fewer words are available.
total_occ_min = 10

# NOTE: `lemma` is not set here; it comes from the earlier score-comparison cell.
for whats in whatss:
    df_h_pdp = data.make_word_freq_score_pdp_comparison_df(
        df_study1,
        whats=whats,
        range_=range_,
        pos=pos,
        lemma=lemma,
        relative=relative,
        total_occ_min=total_occ_min,
    )
    df_h_pdp.plot(kind="barh")
    # Name the compared columns in the title; otherwise the three figures
    # produced by this loop are indistinguishable.
    plt.title(
        "Words for quantile {:.2f}-{:.2f} for {} ({})".format(
            *range_, ", ".join(pos), " vs. ".join(whats)
        )
    )

### Generate outputs

In [None]:
# Export the frequency tables of both studies, one workbook per study.
data.write_freqs_to_excel(
    df_study1,
    "study1-output.xlsx",
)
data.write_freqs_to_excel(
    df_study2,
    "study2-output.xlsx",
)

In [None]:
# Generate the frequency figures of both studies, one output location per study.
data.generate_freqs_figures(
    df_study1,
    "figures_study1",
)
data.generate_freqs_figures(
    df_study2,
    "figures_study2",
)

In [122]:
# Development helper: re-import the local `data` module so that edits made to
# it are picked up without restarting the kernel.
import importlib
importlib.reload(data)

# Set the root logger's level to INFO.
logging.getLogger().setLevel(logging.INFO)

### Feature Importance

In [None]:
# Build the document-term matrix once and reuse it for every target.
doc_term_mat = data.train_prepare(df_study1)

for what in ("power", "dominance", "prestige"):
    clf = data.train_model(df_study1, what, doc_term_mat=doc_term_mat)

    # Collapse the per-class coefficient rows into one signed score per
    # feature: rows are weighted linearly from -1 (first class) to +1 (last
    # class) and then summed.
    # NOTE(review): assumes clf.coef_ has one row per entry of clf.classes_ - confirm.
    class_weights = np.linspace(-1, 1, len(clf.classes_))[:, np.newaxis]
    coefs = (clf.coef_.copy() * class_weights).sum(axis=0)
    coefs = data.normalize_coefs(coefs)

    values, labels = data.coef_filter(coefs, clf.feature_names_in_)
    desc = data.coef_to_human(values, labels)

    # Target name, its description, then a blank separator line.
    print(what, desc, "", sep="\n")

In [124]:
# (study frame, targets to model, output workbook) - processed in this order.
coef_exports = (
    (df_study1, ("power", "dominance", "prestige"), "study1-coefs.xlsx"),
    (df_study2, ("power", "dominance", "prestige", "workplace_power"), "study2-coefs.xlsx"),
)

for study_df, targets, fn_coefs in coef_exports:
    # The document-term matrix is built once per study and shared by all of
    # that study's models.
    doc_term_mat = data.train_prepare(study_df)
    for what in targets:
        clf = data.train_model(study_df, what, doc_term_mat=doc_term_mat)
        data.write_coefs_to_excel(clf, what, fn_output=fn_coefs, require_both=True)

In [73]:
import sklearn.linear_model
import sklearn.preprocessing

# Feature matrix over the filtered spaCy documents, built with L2
# normalisation and IDF weighting. (Alternative: data.build_count_matrix(df_sub).)
df_sub = df_study1["text_spacy_doc_filtered"]
doc_term_mat, features = data.build_feature_matrix(df_sub, norm="l2", use_idf=True)

# Alternative tried: sklearn.linear_model.Lasso(alpha=0.1)
clf = sklearn.linear_model.LogisticRegression()

X = doc_term_mat
y = df_study1["power"].to_numpy()

# Turn the continuous score into integer class labels by rounding to the
# nearest integer. np.rint is vectorised and rounds halves to even exactly
# like the builtin round(), so it replaces the per-element np.vectorize(round)
# loop. Non-finite scores cannot be converted to a label, so fail loudly.
if not np.isfinite(y).all():
    raise ValueError("'power' contains NaN or infinite scores; cannot derive class labels")
y = np.rint(y).astype(int)

clf.fit(X, y)
# Accuracy on the training data itself (no held-out set), i.e. an optimistic
# estimate; shown as the cell output.
clf.score(X, y)

0.89

In [None]:
# coefficients
C = clf.coef_
print("\ncoefficients:")
print(C)
# C / C.sum(axis=0)

# norm: [0, 1]
#C = (C - np.min(C)) / np.ptp(C)
# norm: [-1, 1]
C = 2. * (C - np.min(C)) / np.ptp(C) - 1
#C = C / 2.

# print words
print("\nfeature importance:")
for class_, Cc in zip(clf.classes_, C):
    srt = np.argsort(np.abs(Cc))[::-1]
    mask = np.abs(Cc) > 0.25
    mask_srt = mask[srt]
    mask_srt[10:] = False
    labels_srt = np.array(features)[srt][mask_srt]
    values_srt = np.array(Cc)[srt][mask_srt]
    srt = np.argsort(values_srt)[::-1]
    labels_srt = labels_srt[srt]
    values_srt = values_srt[srt]
    desc = " + ".join("{:.2f}*'{}'".format(val, lbl) for lbl, val in zip(labels_srt, values_srt))
    #desc = " + ".join("{:.2f}*'{}'".format(val, labels[i]) for i, val in enumerate(Cc))
    print("Class", class_, "=", desc, end="\n\n")

In [113]:
C = data.normalize_coefs(clf.coef_)


def _class_coef_frame(class_, class_coefs):
    """Return a two-column frame (words, coefs) for one class.

    Both columns sit under a shared top-level column label "Class <n>".
    """
    values, labels = data.coef_filter(class_coefs, features, require_both=False)
    col_lbl = f"Class {int(class_):d}"
    cols = pd.MultiIndex.from_tuples([(col_lbl, "words"), (col_lbl, "coefs")])
    return pd.DataFrame.from_records(data=zip(labels, values), columns=cols)


# One frame per class, placed side by side in a single wide table.
dfs_coefs = [_class_coef_frame(class_, Cc) for class_, Cc in zip(clf.classes_, C)]
df_coefs = pd.concat(dfs_coefs, axis=1)
df_coefs

Unnamed: 0_level_0,Class 3,Class 3,Class 4,Class 4,Class 5,Class 5,Class 6,Class 6,Class 7,Class 7
Unnamed: 0_level_1,words,coefs,words,coefs,words,coefs,words,coefs,words,coefs
0,empathisch,0.803768,kg,1.0,eher,0.630842,Menschen,0.943616,selbstbewusst,0.727885
1,nachdenklich,0.675783,zurecht,0.657515,Kinder,0.619512,Freunde,0.92256,kommunikativ,0.495415
2,schwere,0.551223,wiege,0.651797,Manchmal,0.617667,Mensch,0.84091,lesen,0.493894
3,freundlich,0.439724,kennenzulernen,0.649295,Empathie,-0.611244,aufgeschlossen,0.709363,Neues,0.459237
4,talentiert,0.404398,Partnerschaft,0.555572,aufgeschlossen,-0.625688,hilfsbereit,0.672065,Drücker,0.415498
5,Hilfsbereit,0.404398,introvertiert,0.51716,komme,-0.632858,umgehen,0.557359,bearbeiten,0.415498
6,verbindlich,0.404398,Teilzeit,0.501018,aufbrausend,-0.640129,manchmal,0.534792,Handwerklich,0.415498
7,ausbaufähig,0.404398,unterwegs,0.493105,kommunikativ,-0.648217,zielstrebig,0.515501,gebe,0.409377
8,Anpassungsfähig,0.404398,arbeite,0.468205,kg,-0.648412,Mann,-0.531523,letzten,0.363334
9,Romantiker,0.393644,inzwischen,0.466566,beruflich,-0.658755,arbeite,-0.549577,begabt,0.352777


In [None]:
# coefficients
C = clf.coef_
#print("\ncoefficients:")
#print(C)
# C / C.sum(axis=0)

C = data.normalize_coefs(C)

# print words
print("\nfeature importance:")
for class_, Cc in zip(clf.classes_, C):
    values, labels = data.coef_filter(Cc, features, require_both=False)
    desc = data.coef_to_human(values, labels)
    print("Class {}: {}\n".format(class_, desc))

print("\nfeature importance:")
for class_, Cc in zip(clf.classes_, C):
    values, labels = data.coef_filter(Cc, features, require_both=True)
    desc = data.coef_to_human(values, labels)
    print("Class {}: {}\n".format(class_, desc))