### This is the code used to match the crawled forum data to Behavior Change Annotations
Matching was last successfully tested in December 2021.

In [120]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the crawled posts of both forums and the table of annotation ids.
abnehmen = pd.read_json("./Crawler/abnehmenOhneOp.json")
psycho = pd.read_json("./Crawler/psychoTherapie.json")

# The annotation table is semicolon-separated; order it by post id and then by
# the "Satz-ID" column.
anno_ids = pd.read_csv("annotations_all_ids.csv", sep=";")
anno_ids = anno_ids.sort_values(by=["Annotation_post_id", "Annotation_Satz-ID"])

In [23]:
# Tag every post with the forum it was crawled from, then stack both forums
# into one frame ordered by post id.
for forum_frame, forum_label in ((abnehmen, "Abnehmen ohne OP"), (psycho, "Psychologsiche Therapie")):
    forum_frame["forum_name"] = forum_label

fora = pd.concat([psycho, abnehmen])
fora = fora.sort_values(by="post_id")

We check that the crawled forums and the annotations contain the same number of posts:

In [24]:
# Distinct annotated post ids vs. number of crawled rows.
n_annotated_posts = anno_ids["Annotation_post_id"].nunique()
n_crawled_posts = len(fora)

print("Annotated number of posts: ", n_annotated_posts)
print("Crawled number of posts: ", n_crawled_posts)

Annotated number of posts:  1202
Crawled number of posts:  1202


It seems we are missing a post in the crawled data. It may have been removed from the forum since the original data collection. We therefore remove this post from the annotations.

In [25]:
# Keep only annotation rows whose post id is present in the crawled data.
crawled_post_ids = fora["post_id"]
anno_ids = anno_ids.loc[anno_ids["Annotation_post_id"].isin(crawled_post_ids)]

In [26]:
# NOTE(review): this read replaces the `anno_ids` frame filtered in the previous
# cell, and "annotions_crawled.csv" is not written anywhere in this notebook --
# presumably it is a pre-built file shipped with the project; confirm that it
# exists and that the file name (sic) is intended.
anno_ids = pd.read_csv(filepath_or_buffer="annotions_crawled.csv")

# Persist the combined forum posts so the following cells can start from disk.
fora.to_csv(path_or_buf="fora.csv")

In [28]:
# Re-read the combined posts, order them by post id and discard the index
# column that `to_csv` stored as "Unnamed: 0".
data = (
    pd.read_csv("fora.csv")
    .sort_values("post_id")
    .drop(columns=["Unnamed: 0"])
)
data.head()

Unnamed: 0,title,thread_id,post_id,date,username,content,forum_name
0,wie lange zum Psychologen bis es das Attest gi...,16899,181007,"26. März 2006, 21:09",Eisy,\n\nWie lange muß man sich Psychologisch behan...,Psychologsiche Therapie
1,auf was muß das Attest beim Psychlogen Ausgest...,17085,185772,"4. April 2006, 15:52",Eisy,\n\nSo Termin steht habe mich entschieden wenn...,Psychologsiche Therapie
2,*klick*,17305,186269,"5. April 2006, 12:32",rebecca76,"\n\nhallöchen zusammen:499: :499:\n\nso, man l...",Psychologsiche Therapie
3,*klick*,17305,186322,"5. April 2006, 13:36",rebecca76,\n\n@michael....mist...hier ises nun auch fast...,Psychologsiche Therapie
4,*klick*,17305,186330,"5. April 2006, 13:44",Gabriella,"\n\nHallo Rebecca ,,\n\nich mache seit ungefäh...",Psychologsiche Therapie


In [30]:
# Marker inserted after every sentence-ending punctuation run; the text is
# later split on this marker.
delimiter = "REPLACE"


def break_line(match):
    """Return the matched text followed by the sentence delimiter.

    Intended as the replacement callback of ``re.sub``: the matched
    punctuation is kept and ``delimiter`` is appended directly after it.
    """
    matched_text = match.group()
    return matched_text + delimiter

# Regex pattern -> replacement text. The loop below applies the entries to
# data["content"] one after another, in insertion order, with regex=True.
# Most entries strip the period(s) from an abbreviation; the digit patterns
# rewrite dotted numbers/dates to placeholders. Presumably this keeps the
# sentence splitter (which breaks on runs of ".") from splitting inside them.
# NOTE(review): order matters -- an earlier pattern can consume text a later one
# would have matched. r" u\." is listed twice (same value, harmless).
# NOTE(review): r"pos\." maps to "evtl", which looks like a copy-paste slip;
# confirm before changing, since the published sentence texts were presumably
# produced with the current mapping.
dict_of_regex_replacements = {r"z\. *b\.": "zb",  r"z\. *B\.": "zb",r"EW\.": "EW",r"Z\. *B\.": "zb", r"zB\.": "zb",
                       r"z\.B\.": "zb", r"zb\.": "zb",r"z\.B": "zb", r"z\.b": "zb", r"u\. *a\.": "ua", r"etc\.": "etc", r"etc\.\)": "etc",
                       r" eig\.": "eig", r"l\.g\.": "lg", r"o\.k\.": "ok", r"ca\.": "ca", r"Ca\.": "ca", r"d\.m\.": "dm", r"d\.M\.": "dm",
                       r"p\. *s\.": "ps", r"P\. *s\.": "ps", r"P\. *S\.": "ps", r"z\.t\.": "zt", r"wg\.": "wg",r"\.-": "-",
                       r"-\.": "-", r"Tg\.": "Tg", r"o\. *ä\.": "oä", r"d\. *h\.": "dh", r"D\. *H\.": "dh", r"d\. *H\.": "dh",
                       r"dH\.": "dh",r"dh\.": "dh", r"z\. *Zt\.": "zzt", r"Dr\.": "Dr", r"v\. *a\.": "va", r"\d\d\.*\d\d\.": "xx/xx/",
                       r"s\. *u\.": "su", r"u\.s\.w\.": "usw", r"usw\.":"usw",r"soz\.": "soz", r"vllt\.": "vlt", r"\(\!\)": " ",
                       r"ank\.tzt": "ankotzt", r"\!\" z\.B\.": "\" zb",r"Gr\.": "Gr", r"Dez\.": "Dez", r"(\d+\.)(\d+\.)\d*": "xx/xx/xxxx",
                       r"(\d+\.)(\d+) ": "12,3 ",r"(\d+\.)(\d+)": "xx/xx", r"\d+\.\d+\.": "xx/xx", r"bz\.w": "bzw", r"Bzw\.": "bzw", 
                       r"bzw\.": "bzw", r"inkl\.": "inkl", r"psych\.": "psych",r"mind\.": "mind", r"Min\.": "Min", r" min\.": "min", r"Verh\.": "Verh",
                       r"Ern\.": "Ern", r"bezgl\.": "bezgl", r"ltd\.": "ltd", r"bspw\.": "bspw", r"ltr\.": "ltr", r"anschl\.": "anschl",
                       r"s\.oliver": "soliver", r" event\.": " event", r"std\.": "std", r"Std\.": "std", r"max\.": "max", r"L\.G\.": "lg",
                       r"LG\.": "lg", r"tägl\.": "tägl", r" u\.": " u", r" u\.":" u", r" od\.": " od",r"Co\.": "Co", r"co\.": " co",r"bzgl\.": "bzgl", r"evtl\.": "evtl",
                       r"pos\.": "evtl", r"M\.O\.B\.I\.L\.I\.S": "mobilis",r"m\.o\.b\.i\.l\.i\.s\.": "mobilis", "fddb\.info":"fddb_info", r"i\.d\.R\.*": "idR",
                       r"z\. *T\.":"zT", r"Vit\.":"Vit",r"ggf\.":"ggf", r"m\.E\.": "mE", r"k\.o": "ko", r"bezw\.":"bzw",r"Dh\.":"dh", r"Nr\.":"Nr", r"Evtl\.":"evtl",
                       r"O\.k\.":"ok", r"D\.h\.":"dh", r"zBsp\.":"zb", r"U\.a\.":"Ua", r"Bzgl\.":"bzgl", "gr\.":"gr", "d\.h":"dh", "i\.d\.R\.":"idR"}

# Literal text -> replacement text. The loop below passes these to
# Series.replace WITHOUT regex=True.
# NOTE(review): in that mode pandas replaces values that equal the key as a
# whole, not substrings, so these entries presumably only affect posts whose
# entire content is the key -- verify the intent before "fixing" it, because a
# substring replacement would change the sentence count the annotations rely on.
# "Bzgl." and "d.h" are listed twice (same values, harmless).
dict_of_non_regex_replacements = {"etc.": "etc", " u.": " u", "z.b.": "zb", " ca.": " ca", "Co.": "Co", "ggf.":"ggf","ua.":"ua", "U.a.":"ua",
                               "Bzgl.": "bzgl", "Mrs.":"Mrs", "Nr.":"Nr", "z.T.":"zT", "gr.": "gr", "(!)": "()", "d.h": "dh", "D.h.":"dh",
                               "Vit.":"Vit", "m.E.": "mE", "k.o": "ko", "bezw.":"bzw", "O.k.":"ok", "Dh.":"dh", "Evtl.":"evtl", "Bsp.":"Bsp", "Bzgl.":"bzgl", "d.h":"dh"}


# --- Split every post into sentence fragments -------------------------------
#
# Step 1: rewrite abbreviations/dates so their periods are not treated as
# sentence ends. The result is assigned back to the column rather than calling
# `data["content"].replace(..., inplace=True)`: that chained in-place call works
# on a temporary Series under pandas Copy-on-Write (warned about since pandas
# 2.2) and would leave `data` unchanged.
for pattern, replacement in dict_of_regex_replacements.items():
    data["content"] = data["content"].replace(pattern, replacement, regex=True)

# Step 2: literal replacements. Series.replace without regex=True compares whole
# cell values, exactly as the original call did; this is kept on purpose so
# that the number of fragments stays aligned with the annotation file.
for literal, replacement in dict_of_non_regex_replacements.items():
    data["content"] = data["content"].replace(literal, replacement)

# Step 3: append the delimiter after every run of sentence-ending punctuation.
# Because alternatives are tried left to right, the first three ("!+", "\?+",
# "\.+") always win, so a mixed run such as "?!" yields two fragments. The
# pattern is kept unchanged because the annotations were matched against
# exactly this splitting.
sentence_end = re.compile(r"!+|\?+|\.+|\!\?+|\?\!+|\.+\?+")
data["split"] = data["content"].map(lambda text: sentence_end.sub(break_line, text))

# Step 4: turn newlines into spaces in all columns, drop the unsplit text and
# emit one row per fragment.
data = data.replace(r'\n',' ', regex=True)
data = data.drop(columns="content")
data = data.assign(split=data['split'].str.split(delimiter)).explode('split', ignore_index=True)
len(data)

16221

In [31]:
# Preview the first rows: one row per sentence fragment.
data.head(n=5)

Unnamed: 0,title,thread_id,post_id,date,username,forum_name,split
0,wie lange zum Psychologen bis es das Attest gi...,16899,181007,"26. März 2006, 21:09",Eisy,Psychologsiche Therapie,Wie lange muß man sich Psychologisch behande...
1,wie lange zum Psychologen bis es das Attest gi...,16899,181007,"26. März 2006, 21:09",Eisy,Psychologsiche Therapie,da ich noch nicht lange in Kiel bin wäre schö...
2,auf was muß das Attest beim Psychlogen Ausgest...,17085,185772,"4. April 2006, 15:52",Eisy,Psychologsiche Therapie,So Termin steht habe mich entschieden wenn e...
3,*klick*,17305,186269,"5. April 2006, 12:32",rebecca76,Psychologsiche Therapie,"hallöchen zusammen:499: :499: so, man liest..."
4,*klick*,17305,186269,"5. April 2006, 12:32",rebecca76,Psychologsiche Therapie,aber was brauchts dazu dass es *klick* macht???


In [32]:
nan_value = float("NaN")

# Fragments that consist only of whitespace and/or punctuation. The list is an
# exact-match list (including the exact number of spaces); it is kept verbatim
# because the fragment count must line up with the annotation file.
list_to_replace = ["", " ", "  ", "    ", "   ", "           ", "      ", "!", "?", ".", "!!", "??", "..",
                  "!!!", "???", "...", " !", " ?", " .", " !!", " ??", " ..", " !!!", " ???", "????", "  ...", "  ....",
                  " ....", "....", ".......", " ......", "    ", ".......", " ......", "  ....", "      ", " ...", "                   ",
                  " .....", ".....", "    ...", "     ...", "  .", "   .", "  ????", "     ", "        "]

# Mark those fragments as missing. Assigning the result back (instead of a
# chained `data["split"].replace(..., inplace=True)`) keeps this working under
# pandas Copy-on-Write, where the chained call would not modify `data`.
data["split"] = data["split"].replace(list_to_replace, nan_value)

# Drop the empty fragments; reset_index keeps the previous index as an "index"
# column, as before.
data = data.dropna(subset=["split"])
data = data.reset_index()

# Making sure that both datasets have the same length. A mismatch is reported
# explicitly, because the annotations are later attached row by row.
if len(data) == len(anno_ids):
    print("Both datasets have a length of", len(data))
else:
    print("WARNING: length mismatch -", len(data), "sentence fragments vs.",
          len(anno_ids), "annotation rows; the row-wise matching below will be misaligned")

Both datasets have a length of 15533


Aside: If they do not have the same length, posts may have been modified since this code was last tested. In that case, it is best to simply drop the posts whose number of sentences differs between the two datasets (i.e. compare the value counts per post ID).

### We can now look at the posts that have a single label and polarization

These were used for testing inter-rater reliability and for conducting machine learning experiments.

In [70]:
# Attach the annotation columns to the sentence fragments row by row.
# pd.concat(axis=1) aligns on index labels, not on position, so the annotation
# index is reset first: if `anno_ids` carries a non-default index (e.g. after
# sorting/filtering) rows would otherwise be paired with the wrong sentence.
# `data` already has a fresh 0..n-1 index from the cleaning cell.
annotation_columns = ["Annotation_thread_id", "Annotation_post_id", "Annotation_Satz-ID",
                      "Label", "Sublabel", "Polarization"]
annotations = anno_ids[annotation_columns].reset_index(drop=True)
forum_splitted = pd.concat([data, annotations], axis=1)
forum_splitted.to_csv("forum_splitted.csv")

In [135]:
# Restart point: reload the matched sentences and annotations from disk.
forum_splitted = pd.read_csv(filepath_or_buffer="forum_splitted.csv")

In [None]:
# Keep only rows that carry exactly one of the listed labels and one of the
# listed polarizations.
categories = ["C", "TS", "R"]
polarization = ["+", "-"]

has_known_label = forum_splitted["Label"].isin(categories)
has_known_polarization = forum_splitted["Polarization"].isin(polarization)
forum_splitted = forum_splitted[has_known_label & has_known_polarization]

In [74]:
# Persist the filtered, annotated sentences.
forum_splitted.to_csv(path_or_buf="Annotation_crawled_data.csv")

In [43]:
from sklearn.model_selection import train_test_split

In [103]:
# Restart point: reload the filtered, annotated sentences from disk.
forum_splitted = pd.read_csv(filepath_or_buffer="Annotation_crawled_data.csv")

In [104]:
# Encode polarization numerically: "+" -> 1, "-" -> 0.
# The result is assigned back to the column; the chained form
# `forum_splitted["Polarization"].replace(..., inplace=True)` modifies a
# temporary Series under pandas Copy-on-Write and would leave the frame as is.
forum_splitted["Polarization"] = forum_splitted["Polarization"].replace({"+": 1, "-": 0})

In [105]:
# Encode labels numerically: R -> 0, TS -> 1, C -> 2.
# Results are assigned back to the columns; chained `...[col].replace(...,
# inplace=True)` / `.fillna(..., inplace=True)` calls act on a temporary Series
# under pandas Copy-on-Write and would not update the frame.
forum_splitted["Label"] = forum_splitted["Label"].replace({"R": 0, "TS": 1, "C": 2})

# Encode sublabels: missing -> 0, a -> 1, d -> 2, n -> 3.
forum_splitted["Sublabel"] = (
    forum_splitted["Sublabel"]
    .fillna(0)
    .replace({"a": 1, "d": 2, "n": 3})
)

### Stratified Train-test split
The training set will be used for cross validation

In [108]:
def _task_frame(frame, target_column):
    """Return a two-column frame ("Sentence", "labels") for one classification task."""
    return frame[["split", target_column]].rename(
        columns={"split": "Sentence", target_column: "labels"}
    )


binary = _task_frame(forum_splitted, "Polarization")
labels = _task_frame(forum_splitted, "Label")

# Sublabels are only applicable to sentences with label R (encoded as 0), so
# that task is restricted to those rows.
sublabels = _task_frame(forum_splitted[forum_splitted["Label"] == 0], "Sublabel")

# 80/20 split per task, stratified on the task's own target column.
binary_train, binary_test = train_test_split(
    binary, test_size=0.2, random_state=42, stratify=binary["labels"]
)
label_train, label_test = train_test_split(
    labels, test_size=0.2, random_state=42, stratify=labels["labels"]
)
sublabel_train, sublabel_test = train_test_split(
    sublabels, test_size=0.2, random_state=42, stratify=sublabels["labels"]
)

In [119]:
# Undersample the majority class of each training set, shuffle, and export.
# Fixes relative to the previous version of this cell:
#   * `sublabels_Ra` / `sublabel_R_downsampled` were undefined names
#     (NameError on a fresh kernel); the names defined in this cell are used.
#   * the "*_train_balanced.csv" files for label and sublabel received the
#     unbalanced training frames; the balanced frames are written now.
#   * sampling is seeded so that the exported files are reproducible.
SAMPLING_SEED = 42  # same value as the random_state of the train/test split

# --- Valence (binary) task: reduce class 1 to the size of class 0 ---
binary_sustain = binary_train[binary_train["labels"] == 0]
binary_change = binary_train[binary_train["labels"] == 1]
binary_change_undersampled = binary_change.sample(binary_sustain.shape[0], random_state=SAMPLING_SEED)
binary_train_balanced = pd.concat([binary_sustain, binary_change_undersampled])
binary_train_balanced = binary_train_balanced.sample(frac=1, random_state=SAMPLING_SEED)

binary_train_balanced.to_csv("valence_train_balanced.csv")
binary_test.to_csv("valence_test.csv")

# --- Label task: reduce R (0) to the size of TS (1); C (2) is kept as is ---
label_R = label_train[label_train["labels"] == 0]
label_TS = label_train[label_train["labels"] == 1]
label_C = label_train[label_train["labels"] == 2]
label_R_undersampled = label_R.sample(label_TS.shape[0], random_state=SAMPLING_SEED)
label_train_balanced = pd.concat([label_R_undersampled, label_TS, label_C])
label_train_balanced = label_train_balanced.sample(frac=1, random_state=SAMPLING_SEED)

label_train_balanced.to_csv("label_train_balanced.csv")
label_test.to_csv("label_test.csv")

# --- Sublabel task: reduce "no sublabel" (0) to the size of sublabel a (1) ---
sublabel_R = sublabel_train[sublabel_train["labels"] == 0]
sublabel_Ra = sublabel_train[sublabel_train["labels"] == 1]
sublabel_Rd = sublabel_train[sublabel_train["labels"] == 2]
sublabel_Rn = sublabel_train[sublabel_train["labels"] == 3]
sublabel_R_undersampled = sublabel_R.sample(sublabel_Ra.shape[0], random_state=SAMPLING_SEED)
sublabel_train_balanced = pd.concat([sublabel_R_undersampled, sublabel_Ra, sublabel_Rn, sublabel_Rd])
sublabel_train_balanced = sublabel_train_balanced.sample(frac=1, random_state=SAMPLING_SEED)

sublabel_train_balanced.to_csv("sublabel_train_balanced.csv")
sublabel_test.to_csv("sublabel_test.csv")

'sublabel_train.to_csv("sublabel_train.csv")\nsublabel_test.to_csv("sublabel_test.csv")'

### Same process for a split by user activity level

The 65 most active users produced 80% of the data. We want to test whether machine learning results are biased by user-specific language. So we produce training and test sets that allow us to train on the most active users and predict for the rest.

In [None]:
# Split by user activity: train on the most active users, test on the rest.
# `counts` was not defined anywhere in this notebook (NameError on a fresh
# kernel), so it is built here.
# NOTE(review): activity is measured as the number of annotated sentences per
# user in `forum_splitted` -- presumably what the original `counts` held;
# confirm against the "65 users / 80% of the data" figure.
N_MOST_ACTIVE = 65

counts = (
    forum_splitted["username"]
    .value_counts()              # sorted from most to least active
    .rename_axis("username")
    .reset_index(name="n_sentences")
)

most_active = list(counts["username"].head(N_MOST_ACTIVE))
is_most_active = forum_splitted["username"].isin(most_active)
train = forum_splitted[is_most_active]
test = forum_splitted[~is_most_active]

In [None]:
# Valence (binary) task for the activity-based split.
# The target column is "Polarization": the frame has no "Valence" column (the
# previous version raised a KeyError); "valence" is only the name used for the
# exported files of this task.
binary_train = train[["split", "Polarization"]].copy()
binary_train.rename(columns={"split":"Sentence", "Polarization":"labels"}, inplace=True)

# Undersample class 1 to the size of class 0, then shuffle. Sampling is seeded
# so that the exported training set is reproducible.
binary_sustain = binary_train[binary_train["labels"] == 0]
binary_change = binary_train[binary_train["labels"] == 1]
binary_change_downsampled = binary_change.sample(binary_sustain.shape[0], random_state=42)
binary_train_balanced = pd.concat([binary_sustain, binary_change_downsampled])
binary_train_balanced = binary_train_balanced.sample(frac=1, random_state=42)

# The test set keeps its natural class distribution.
binary_test = test[["split", "Polarization"]].copy()
binary_test.rename(columns={"split":"Sentence", "Polarization":"labels"}, inplace=True)

In [None]:
# Label task for the activity-based split.
label_train = train[["split", "Label"]].copy()
label_train.rename(columns={"split":"Sentence", "Label":"labels"}, inplace=True)

# Undersample R (0) to the size of TS (1); C (2) is kept as is. Sampling and
# shuffling are seeded so that the exported training set is reproducible
# (previously every run produced a different sample).
label_R = label_train[label_train["labels"] == 0]
label_TS = label_train[label_train["labels"] == 1]
label_C = label_train[label_train["labels"] == 2]
label_R_downsampled = label_R.sample(label_TS.shape[0], random_state=42)
label_train_balanced = pd.concat([label_R_downsampled, label_TS, label_C])
label_train_balanced = label_train_balanced.sample(frac=1, random_state=42)

# The test set keeps its natural class distribution.
label_test = test[["split", "Label"]].copy()
label_test.rename(columns={"split":"Sentence", "Label":"labels"}, inplace=True)

In [None]:
# Again, we only use sentences with label R for our sublabel training and test set
sublabels_R = train[train["Label"] == 0]
sublabels_train = sublabels_R[["split", "Sublabel"]].copy()
sublabels_train.rename(columns={"split":"Sentence", "Sublabel":"labels"}, inplace=True)

sublabels_R = sublabels_train[sublabels_train["labels"] == 0]
sublabels_Ra = sublabels_train[sublabels_train["labels"] == 1]
sublabels_Rd = sublabels_train[sublabels_train["labels"] == 2]
sublabels_Rn = sublabels_train[sublabels_train["labels"] == 3]
sublabels_R_undersampled = sublabels_R.sample(sublabels_Ra.shape[0])
sublabels_train_balanced = pd.concat([sublabels_R_undersampled, sublabels_Ra, sublabels_Rn, sublabels_Rd])
sublabels_train_balanced = sublabels_train_balanced.sample(frac=1)

sublabels_test_R = test[test["Label"] == 0]
sublabels_test = sublabels_test_R[["split", "Sublabel"]].copy()
sublabels_test.rename(columns={"split":"Sentence", "Sublabel":"labels"}, inplace=True)

In [None]:
# Export the activity-based training (balanced) and test sets, one file each.
activity_split_exports = [
    ("valence_most_active_train_balanced.csv", binary_train_balanced),
    ("valence_least_active_test.csv", binary_test),
    ("label_most_active_train_balanced.csv", label_train_balanced),
    ("label_least_active_test.csv", label_test),
    ("sublabel_most_active_train_balanced.csv", sublabels_train_balanced),
    ("sublabel_least_active_test.csv", sublabels_test),
]

for csv_name, frame in activity_split_exports:
    frame.to_csv(csv_name)