# The Statistical Approach

In [None]:
from __future__ import annotations

import json

import pandas as pd

from project import load_dataset

In [None]:
# Load the raw annotation table through the project's own loader; the bare
# name on the last line makes the notebook display it.
DATASET = load_dataset("dataset.csv")
DATASET


Unnamed: 0,ID,Issue,Stance,Argument,Annotator,Argumentative,CO,LA,LR,LS,...,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV
0,arg219250,ban-plastic-water-bottles,no-bad-for-the-economy,"it is true that bottled water is a waste, but ...",1,y,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
1,arg219250,ban-plastic-water-bottles,no-bad-for-the-economy,"it is true that bottled water is a waste, but ...",2,y,1,3,2,1,...,2,2,3,2,2,2,3,1,1,1
2,arg219250,ban-plastic-water-bottles,no-bad-for-the-economy,"it is true that bottled water is a waste, but ...",3,y,2,2,3,2,...,2,1,2,2,2,2,2,2,2,2
3,arg219293,ban-plastic-water-bottles,no-bad-for-the-economy,Most Americans on average recycle 86-88% of th...,1,y,2,3,3,2,...,3,2,2,2,2,2,2,3,2,2
4,arg219293,ban-plastic-water-bottles,no-bad-for-the-economy,Most Americans on average recycle 86-88% of th...,2,y,1,2,2,1,...,2,2,2,1,2,1,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,arg168822,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,Raffles neglected Singapore when he went aroun...,2,y,1,2,2,1,...,2,2,2,3,2,2,3,2,1,2
930,arg168822,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,Raffles neglected Singapore when he went aroun...,3,y,2,2,3,2,...,2,2,2,2,2,2,2,3,2,2
931,arg168834,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,"Raffles doesn't care about the citizens, doesn...",1,y,2,2,3,2,...,2,2,1,2,2,2,2,3,2,2
932,arg168834,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course,"Raffles doesn't care about the citizens, doesn...",2,y,1,2,2,1,...,2,3,3,3,2,2,2,2,1,2


## Dataset Preprocessing

### Data Aggregation

As we are treating the problem as a classification problem, we will adopt the majority vote as the aggregation method. As suggested by the original paper, when all three annotators disagree with each other (i.e., no majority exists), we will use 2 as the aggregated result. In some cases, annotators disagree on whether the argument is argumentative. We discard instances where such disagreement occurs.

In [None]:
def majority_vote(instance: pd.Series) -> int | None:
    """Collapse the scores given to one argument into a single label.

    Returns ``None`` unless exactly three scores are supplied, ``2`` when all
    three scores differ (no majority exists), and otherwise the most frequent
    score.
    """
    if len(instance) == 3:
        tally = instance.value_counts()
        # value_counts orders by frequency, so position 0 holds the top score.
        top_score, top_count = tally.index[0], tally.iloc[0]

        return 2 if top_count == 1 else top_score

    return None

In [None]:
# Collapse the per-annotator rows into one row per argument: drop the
# annotator bookkeeping columns, group on the fields that identify an
# argument, and reduce every remaining column with majority_vote.
AGGREGATED = (DATASET
              .drop(columns=["Annotator", "Argumentative"])
              .groupby(["ID", "Issue", "Stance", "Argument"])
              .aggregate(majority_vote)
              # majority_vote returns None for groups that do not have exactly
              # three rows; those groups are removed before the cast to int.
              .dropna()
              .astype(int)
              .reset_index())

AGGREGATED

Unnamed: 0,ID,Issue,Stance,Argument,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV
0,1191878965,is-porn-wrong,yes-porn-is-wrong,hells yeah porn is wrong. i mean wtf. film nak...,2,1,3,2,2,1,2,1,1,2,1,1,2,1,1
1,12365,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,(I am writing this through Firefox) Emotions a...,1,2,3,1,1,2,2,3,2,2,2,2,1,1,2
2,12367,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,Firefox (and the extensions) leaks memory like...,1,2,3,1,1,2,2,2,2,2,2,2,2,1,1
3,12371,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,"I was a IE user from the beginning, but recent...",1,2,3,1,1,1,1,2,2,2,1,2,2,1,1
4,12380,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,I'm not an IE user but FFX has a lot of issues...,2,2,3,2,1,2,2,1,2,1,2,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,arg636360,christianity-or-atheism,christianity,One of the key component to Christianity is FA...,1,2,3,1,1,1,1,2,3,2,1,1,3,1,1
299,arg644073,christianity-or-atheism,christianity,Christianity does offer hope in the world. Chr...,1,2,2,1,1,1,3,2,2,1,1,2,2,1,1
300,arg649666,india-has-the-potential-to-lead-the-world,no-against,India has Strong Military Neighbors (China & P...,1,2,2,2,1,2,2,2,2,1,1,2,2,1,1
301,arg660921,is-the-school-uniform-a-good-or-bad-idea,good,"yas,of course . School uniform is important <b...",1,1,2,1,1,1,2,1,1,1,1,1,2,1,1


In [None]:
# Persist the aggregated labels to disk.
# NOTE(review): a later cell reads "aggregated.csv", not this file — confirm
# which file name is intended.
AGGREGATED.to_csv("data_preprocess.csv")

### Data Augmentation

#### Remove HTML tags mixed into the argument

Through inspection, it seems that some arguments contain HTML tags such as `<br/>`. To prevent causing trouble for the NLP components, we will preemptively remove those HTML tags.

In [None]:
# Determine the kinds of HTML tags present in the arguments and how often
# each occurs. The pattern is a raw string: it has no placeholders to
# interpolate, and the raw prefix keeps any regex escapes literal.
AGGREGATED["Argument"].str.findall(r"<[^>]+>").explode().value_counts()

<br/>    162
Name: Argument, dtype: int64

In [None]:
# Replace every literal "<br/>" with a space, then collapse each run of
# whitespace into a single space.
AGGREGATED["Argument"] = (AGGREGATED["Argument"]
                          .str.replace("<br/>", " ", regex=False)
                          .str.replace(r"\s+", " ", regex=True))

AGGREGATED

Unnamed: 0,ID,Issue,Stance,Argument,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV
0,1191878965,is-porn-wrong,yes-porn-is-wrong,hells yeah porn is wrong. i mean wtf. film nak...,2,1,3,2,2,1,2,1,1,2,1,1,2,1,1
1,12365,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,(I am writing this through Firefox) Emotions a...,1,2,3,1,1,2,2,3,2,2,2,2,1,1,2
2,12367,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,Firefox (and the extensions) leaks memory like...,1,2,3,1,1,2,2,2,2,2,2,2,2,1,1
3,12371,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,"I was a IE user from the beginning, but recent...",1,2,3,1,1,1,1,2,2,2,1,2,2,1,1
4,12380,firefox-vs-internet-explorer,there-s-more-browsers-than-the-ie-firefox-is-a...,I'm not an IE user but FFX has a lot of issues...,2,2,3,2,1,2,2,1,2,1,2,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,arg636360,christianity-or-atheism,christianity,One of the key component to Christianity is FA...,1,2,3,1,1,1,1,2,3,2,1,1,3,1,1
299,arg644073,christianity-or-atheism,christianity,Christianity does offer hope in the world. Chr...,1,2,2,1,1,1,3,2,2,1,1,2,2,1,1
300,arg649666,india-has-the-potential-to-lead-the-world,no-against,India has Strong Military Neighbors (China & P...,1,2,2,2,1,2,2,2,2,1,1,2,2,1,1
301,arg660921,is-the-school-uniform-a-good-or-bad-idea,good,"yas,of course . School uniform is important 1....",1,1,2,1,1,1,2,1,1,1,1,1,2,1,1


#### Remove `-` from issue and stance

The use of `-` may complicate future NLP processing of the issue and stance fields. We will replace `-` with a space.

In [None]:
# Turn the hyphen-separated slugs in both columns into space-separated text
# (literal replacement, not a regex).
AGGREGATED["Issue"] = AGGREGATED["Issue"].str.replace("-", " ", regex=False)
AGGREGATED["Stance"] = AGGREGATED["Stance"].str.replace("-", " ", regex=False)

AGGREGATED

Unnamed: 0,ID,Issue,Stance,Argument,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV
0,1191878965,is porn wrong,yes porn is wrong,hells yeah porn is wrong. i mean wtf. film nak...,2,1,3,2,2,1,2,1,1,2,1,1,2,1,1
1,12365,firefox vs internet explorer,there s more browsers than the ie firefox is a...,(I am writing this through Firefox) Emotions a...,1,2,3,1,1,2,2,3,2,2,2,2,1,1,2
2,12367,firefox vs internet explorer,there s more browsers than the ie firefox is a...,Firefox (and the extensions) leaks memory like...,1,2,3,1,1,2,2,2,2,2,2,2,2,1,1
3,12371,firefox vs internet explorer,there s more browsers than the ie firefox is a...,"I was a IE user from the beginning, but recent...",1,2,3,1,1,1,1,2,2,2,1,2,2,1,1
4,12380,firefox vs internet explorer,there s more browsers than the ie firefox is a...,I'm not an IE user but FFX has a lot of issues...,2,2,3,2,1,2,2,1,2,1,2,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,arg636360,christianity or atheism,christianity,One of the key component to Christianity is FA...,1,2,3,1,1,1,1,2,3,2,1,1,3,1,1
299,arg644073,christianity or atheism,christianity,Christianity does offer hope in the world. Chr...,1,2,2,1,1,1,3,2,2,1,1,2,2,1,1
300,arg649666,india has the potential to lead the world,no against,India has Strong Military Neighbors (China & P...,1,2,2,2,1,2,2,2,2,1,1,2,2,1,1
301,arg660921,is the school uniform a good or bad idea,good,"yas,of course . School uniform is important 1....",1,1,2,1,1,1,2,1,1,1,1,1,2,1,1


## Natural Language Processing

In [None]:
import stanza

In [None]:
# Build the English stanza pipeline.
# NOTE(review): device="cuda" is hard-coded — presumably this cell cannot run
# on a machine without a CUDA device; confirm before sharing the notebook.
nlp = stanza.Pipeline("en", verbose=False, device="cuda")

# NOTE(review): the cast to category is presumably there so that apply() runs
# the pipeline once per distinct argument text — verify. The "Document" column
# is cast back to object in a later cell.
AGGREGATED["Argument"] = AGGREGATED["Argument"].astype("category")
AGGREGATED["Document"] = AGGREGATED["Argument"].apply(nlp)

AGGREGATED

KeyboardInterrupt: 

### Plain Tokenization

Following the typical NLP processing pipeline, we will begin by tokenizing each argument.

In [None]:
def plain_tokenization(document: stanza.Document) -> list:
    """Flatten a parsed document into one list of its words, sentence by sentence."""
    words = []

    for sentence in document.sentences:
        words.extend(sentence.words)

    return words

In [None]:
# Store the parsed documents with the generic object dtype before mapping
# over them, then derive the flat word list for every argument.
AGGREGATED["Document"] = AGGREGATED["Document"].astype(object)
AGGREGATED["Plain Words"] = AGGREGATED["Document"].apply(plain_tokenization)

AGGREGATED

Unnamed: 0,ID,Issue,Stance,Argument,CO,LA,LR,LS,EF,CR,...,CL,AP,AR,RE,GA,GR,GS,OV,Document,Plain Words
0,1191878965,is porn wrong,yes porn is wrong,hells yeah porn is wrong. i mean wtf. film nak...,2,1,3,2,2,1,...,1,1,2,1,1,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""hells"",\n ""lemma"":..."
1,12365,firefox vs internet explorer,there s more browsers than the ie firefox is a...,(I am writing this through Firefox) Emotions a...,1,2,3,1,1,2,...,3,2,2,2,2,1,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""("",\n ""lemma"": ""(""..."
2,12367,firefox vs internet explorer,there s more browsers than the ie firefox is a...,Firefox (and the extensions) leaks memory like...,1,2,3,1,1,2,...,2,2,2,2,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Firefox"",\n ""lemma..."
3,12371,firefox vs internet explorer,there s more browsers than the ie firefox is a...,"I was a IE user from the beginning, but recent...",1,2,3,1,1,1,...,2,2,2,1,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
4,12380,firefox vs internet explorer,there s more browsers than the ie firefox is a...,I'm not an IE user but FFX has a lot of issues...,2,2,3,2,1,2,...,1,2,1,2,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,arg636360,christianity or atheism,christianity,One of the key component to Christianity is FA...,1,2,3,1,1,1,...,2,3,2,1,1,3,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""One"",\n ""lemma"": ""..."
299,arg644073,christianity or atheism,christianity,Christianity does offer hope in the world. Chr...,1,2,2,1,1,1,...,2,2,1,1,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Christianity"",\n ""..."
300,arg649666,india has the potential to lead the world,no against,India has Strong Military Neighbors (China & P...,1,2,2,2,1,2,...,2,2,1,1,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""India"",\n ""lemma"":..."
301,arg660921,is the school uniform a good or bad idea,good,"yas,of course . School uniform is important 1....",1,1,2,1,1,1,...,1,1,1,1,1,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""yas"",\n ""lemma"": ""..."


In [None]:
AGGREGATED["Document"].astype(object)

0      [\n  [\n    {\n      "id": 1,\n      "text": "...
1      [\n  [\n    {\n      "id": 1,\n      "text": "...
2      [\n  [\n    {\n      "id": 1,\n      "text": "...
3      [\n  [\n    {\n      "id": 1,\n      "text": "...
4      [\n  [\n    {\n      "id": 1,\n      "text": "...
                             ...                        
298    [\n  [\n    {\n      "id": 1,\n      "text": "...
299    [\n  [\n    {\n      "id": 1,\n      "text": "...
300    [\n  [\n    {\n      "id": 1,\n      "text": "...
301    [\n  [\n    {\n      "id": 1,\n      "text": "...
302    [\n  [\n    {\n      "id": 1,\n      "text": "...
Name: Document, Length: 303, dtype: object

### Fancy Tokenization

To (potentially) help with the generalizability of our analysis, we will group semantically similar concepts appearing inside the argument. The grouping will be done on Named Entities (NEs) using the Named Entity Recognition (NER) feature of `stanza`. We will replace each NE's tokens with its entity type (e.g., `PER` for person, `LOC` for location, and `QUANTITY` for quantity).

```text
Steve Jobs has seven apples.

[PER] has [QUANTITY] apples.
```

In [None]:
def fancy_tokenization(document: stanza.Document) -> list:
    """Tokenize a document, replacing each named entity with its entity type.

    Args:
        document: A parsed document whose sentences expose ``entities`` (each
            with ``start_char``, ``end_char`` and ``type``) and ``words``
            (each with ``start_char`` and ``end_char``).

    Returns:
        The document's words in order, except that the words lying inside a
        named entity's character span are replaced by a single string, the
        entity's type. When the document has no entities the plain word list
        is returned.
    """
    # Entities as ((start_char, end_char), type), in sentence order.
    entities = [((entity.start_char, entity.end_char), entity.type)
                for sentence in document.sentences
                for entity in sentence.entities]

    # extract all the tokens
    tokens = [word for sentence in document.sentences for word in sentence.words]

    if not entities:
        return tokens

    fancy = []
    cursor = 0  # index of the entity currently being matched

    for token in tokens:
        inside_entity = False

        # Re-check the token against each entity it reaches, so that a token
        # which closes one entity and opens the next one is not kept by mistake.
        while cursor < len(entities):
            (start, end), label = entities[cursor]

            if token.start_char < start:
                # The token precedes the current entity: keep it.
                break

            if token.end_char <= end:
                # The token is part of the current entity: drop it.
                inside_entity = True
                break

            # The token lies beyond the current entity: emit the entity's type
            # and move on to the next entity.
            fancy.append(label)
            cursor += 1

        if not inside_entity:
            fancy.append(token)

    # An entity that extends to the last token is never passed by a later
    # token, so its type has to be emitted here.
    fancy.extend(label for _, label in entities[cursor:])

    return fancy

In [None]:
# Derive the entity-abstracted token list for every parsed argument.
AGGREGATED["Fancy Words"] = AGGREGATED["Document"].apply(fancy_tokenization)

AGGREGATED

Unnamed: 0,ID,Issue,Stance,Argument,CO,LA,LR,LS,EF,CR,...,AP,AR,RE,GA,GR,GS,OV,Document,Plain Words,Fancy Words
0,1191878965,is porn wrong,yes porn is wrong,hells yeah porn is wrong. i mean wtf. film nak...,2,1,3,2,2,1,...,1,2,1,1,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""hells"",\n ""lemma"":...","[{\n ""id"": 1,\n ""text"": ""hells"",\n ""lemma"":..."
1,12365,firefox vs internet explorer,there s more browsers than the ie firefox is a...,(I am writing this through Firefox) Emotions a...,1,2,3,1,1,2,...,2,2,2,2,1,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""("",\n ""lemma"": ""(""...","[{\n ""id"": 1,\n ""text"": ""("",\n ""lemma"": ""(""..."
2,12367,firefox vs internet explorer,there s more browsers than the ie firefox is a...,Firefox (and the extensions) leaks memory like...,1,2,3,1,1,2,...,2,2,2,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Firefox"",\n ""lemma...","[PRODUCT, {\n ""id"": 2,\n ""text"": ""("",\n ""le..."
3,12371,firefox vs internet explorer,there s more browsers than the ie firefox is a...,"I was a IE user from the beginning, but recent...",1,2,3,1,1,1,...,2,2,1,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
4,12380,firefox vs internet explorer,there s more browsers than the ie firefox is a...,I'm not an IE user but FFX has a lot of issues...,2,2,3,2,1,2,...,2,1,2,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,arg636360,christianity or atheism,christianity,One of the key component to Christianity is FA...,1,2,3,1,1,1,...,3,2,1,1,3,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""One"",\n ""lemma"": ""...","[CARDINAL, {\n ""id"": 2,\n ""text"": ""of"",\n ""..."
299,arg644073,christianity or atheism,christianity,Christianity does offer hope in the world. Chr...,1,2,2,1,1,1,...,2,1,1,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Christianity"",\n ""...","[NORP, {\n ""id"": 2,\n ""text"": ""does"",\n ""le..."
300,arg649666,india has the potential to lead the world,no against,India has Strong Military Neighbors (China & P...,1,2,2,2,1,2,...,2,1,1,2,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""India"",\n ""lemma"":...","[GPE, {\n ""id"": 2,\n ""text"": ""has"",\n ""lemm..."
301,arg660921,is the school uniform a good or bad idea,good,"yas,of course . School uniform is important 1....",1,1,2,1,1,1,...,1,1,1,1,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""yas"",\n ""lemma"": ""...","[{\n ""id"": 1,\n ""text"": ""yas"",\n ""lemma"": ""..."


### Random majority baseline

In [None]:

from statistics import mean

# Reload the aggregated table from disk, decoding the "Document" column from
# its JSON text.
# NOTE(review): json.loads yields plain lists/dicts, not stanza documents, so
# cells that access `.sentences` presumably need the in-memory documents —
# confirm the intended execution order.
AGGREGATED = pd.read_csv("aggregated.csv", index_col=0, converters={"Document": json.loads})

# Majority baseline: for each of the 15 columns after the first four, the
# share of the most frequent label is the accuracy of always predicting it.
avg_list = []
for dimension in AGGREGATED.columns[4:4 + 15]:
    # Compute the share once and reuse it for both the report and the average.
    majority_share = AGGREGATED[dimension].value_counts(normalize=True).max()
    print(dimension, majority_share)
    avg_list.append(majority_share)
mean(avg_list)

CO 0.49174917491749176
LA 0.5577557755775577
LR 0.5115511551155115
LS 0.5643564356435643
EF 0.6039603960396039
CR 0.6567656765676567
EM 0.7755775577557755
CL 0.6270627062706271
AP 0.6435643564356436
AR 0.6204620462046204
RE 0.5247524752475248
GA 0.5313531353135313
GR 0.5478547854785478
GS 0.759075907590759
OV 0.49834983498349833


0.5942794279427943

In [None]:
#AGGREGATED.to_csv("aggregated.csv")

## Statistical Learning and Analysis

In [None]:
from scipy.sparse import hstack

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.svm import SVC

import matplotlib.pyplot as plt

In [None]:
# Hold out 20% of the arguments for testing; the fixed random_state keeps the
# split the same across runs.
TRAIN_DATASET, TEST_DATASET = train_test_split(AGGREGATED, train_size=0.8, random_state=621)

In [None]:
TRAIN_DATASET

Unnamed: 0,ID,Issue,Stance,Argument,CO,LA,LR,LS,EF,CR,...,AP,AR,RE,GA,GR,GS,OV,Document,Plain Words,Fancy Words
231,arg33342,if your spouse committed murder and he or she ...,yes,"I would have to say yes. She committed murder,...",2,2,3,2,1,2,...,2,2,2,2,2,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
177,arg238468,is the school uniform a good or bad idea,bad,I have always maintained that it is a good ide...,2,2,3,2,2,2,...,2,2,2,3,2,2,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
281,arg439197,christianity or atheism,atheism,I'm sad to see the way this turned out. With n...,2,2,3,1,2,1,...,2,2,2,2,2,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
190,arg312577,should physical education be mandatory in schools,no,P.E. should be optional in 8th grade. If stude...,1,2,2,1,1,2,...,2,1,1,1,1,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""P.E."",\n ""lemma"": ...","[{\n ""id"": 1,\n ""text"": ""P.E."",\n ""lemma"": ..."
135,arg168836,william farquhar ought to be honoured as the r...,no it is raffles,Farquhar has a boss!(raffles) he has to follow...,1,1,2,1,1,1,...,1,1,1,1,1,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Farquhar"",\n ""lemm...","[PERSON, {\n ""id"": 2,\n ""text"": ""has"",\n ""l..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,arg223675,tv is better than books,books,I think books are better as TV can cause obesi...,3,2,3,3,2,2,...,3,2,3,3,3,2,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""...","[{\n ""id"": 1,\n ""text"": ""I"",\n ""lemma"": ""I""..."
133,arg168834,william farquhar ought to be honoured as the r...,yes of course,"Raffles doesn't care about the citizens, doesn...",2,2,3,2,2,2,...,2,2,2,2,3,2,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Raffles"",\n ""lemma...","[PERSON, {\n ""id"": 2,\n ""text"": ""does"",\n ""..."
66,65125,firefox vs internet explorer,there s more browsers than the ie firefox is a...,Chorme beats all its the fastest and the safes...,1,1,1,1,1,1,...,1,1,1,1,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Chorme"",\n ""lemma""...","[{\n ""id"": 1,\n ""text"": ""Chorme"",\n ""lemma""..."
199,arg33099,personal pursuit or advancing the common good,advancing the common good,advancing the common good is better to me beca...,1,2,2,1,1,2,...,2,1,1,2,1,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""advancing"",\n ""lem...","[{\n ""id"": 1,\n ""text"": ""advancing"",\n ""lem..."


In [None]:
TEST_DATASET

Unnamed: 0,ID,Issue,Stance,Argument,CO,LA,LR,LS,EF,CR,...,AP,AR,RE,GA,GR,GS,OV,Document,Plain Words,Fancy Words
160,arg219245,ban plastic water bottles,no bad for the economy,U.S. alone grew by over 13%. According to rese...,2,3,3,1,1,2,...,2,2,2,3,2,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""U.S."",\n ""lemma"": ...","[GPE, {\n ""id"": 2,\n ""text"": ""alone"",\n ""le..."
254,arg35584,is it better to have a lousy father or to be f...,lousy father,It's better to have a father than to not have ...,2,3,3,2,2,2,...,2,2,2,2,2,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""It"",\n ""lemma"": ""i...","[{\n ""id"": 1,\n ""text"": ""It"",\n ""lemma"": ""i..."
150,arg213555,tv is better than books,books,Books enlighten the soul. Books don't destroy ...,1,1,2,1,1,2,...,2,2,1,1,1,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Books"",\n ""lemma"":...","[{\n ""id"": 1,\n ""text"": ""Books"",\n ""lemma"":..."
250,arg336222,human growth and development should parents us...,yes,Humans have been raised for thousands of years...,1,1,2,1,1,1,...,2,1,1,1,2,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Humans"",\n ""lemma""...","[{\n ""id"": 1,\n ""text"": ""Humans"",\n ""lemma""..."
200,arg33105,personal pursuit or advancing the common good,personal pursuit,it is better to help yourself before you can h...,2,2,2,2,1,1,...,2,2,2,2,2,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""it"",\n ""lemma"": ""i...","[{\n ""id"": 1,\n ""text"": ""it"",\n ""lemma"": ""i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,arg216634,should physical education be mandatory in schools,no,P.E doesn't help fat kids the fat kids just ea...,1,1,1,1,1,1,...,2,1,1,1,1,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""P.E"",\n ""lemma"": ""...","[{\n ""id"": 1,\n ""text"": ""P.E"",\n ""lemma"": ""..."
235,arg334943,human growth and development should parents us...,no,No because spanking might effects the relation...,1,2,2,1,2,1,...,2,2,2,2,1,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""No"",\n ""lemma"": ""n...","[{\n ""id"": 1,\n ""text"": ""No"",\n ""lemma"": ""n..."
164,arg219259,ban plastic water bottles,no bad for the economy,Bottled water is somewhat less likely to be fo...,2,2,3,2,2,2,...,3,2,2,2,3,1,2,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Bottled"",\n ""lemma...","[{\n ""id"": 1,\n ""text"": ""Bottled"",\n ""lemma..."
166,arg219268,ban plastic water bottles,no bad for the economy,Plastic is good light weight and a good for di...,1,1,2,1,1,1,...,1,1,1,1,1,1,1,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[{\n ""id"": 1,\n ""text"": ""Plastic"",\n ""lemma...","[{\n ""id"": 1,\n ""text"": ""Plastic"",\n ""lemma..."


### Frequency Based Features

In this part, we will investigate the effectiveness of frequency based features in classifying an argument's quality along each dimension. We will investigate the effect of three different approaches (and their combinations):

1. _Plain_ vs. _Fancy Tokenization_: Compared to plain tokenization, fancy tokenization abstracts away from the surface text by treating semantically similar items as the same.
2. _Word_ vs. _Lemma_: A word's lemma helps abstract away from its grammatical features (e.g., the lemma for "dogs" is "dog").
3. _With_ vs. _Without POS_: Part-of-Speech is the ultimate abstraction from the word itself.

In [None]:
def words_to_string(tokens: list) -> str:
    """Render tokens as one space-separated string of surface forms.

    Items that are already strings are used as-is; every other item
    contributes its ``text`` attribute.
    """
    pieces = []

    for token in tokens:
        if isinstance(token, str):
            pieces.append(token)
        else:
            pieces.append(token.text)

    return " ".join(pieces)

def lemmas_to_string(tokens: list) -> str:
    """Render tokens as one space-separated string of lemmas.

    Items that are already strings are used as-is; every other item
    contributes its ``lemma``, falling back to ``text`` when the lemma is
    missing or empty.
    """
    pieces = []

    for token in tokens:
        if isinstance(token, str):
            pieces.append(token)
            continue

        pieces.append(token.lemma or token.text)

    return " ".join(pieces)

def poses_to_string(tokens: list) -> str:
    """Render the ``upos`` tag of every non-string token, space-separated.

    Items that are plain strings carry no tag and are skipped.
    """
    tags = []

    for token in tokens:
        if isinstance(token, str):
            continue

        tags.append(token.upos)

    return " ".join(tags)

In [None]:
# Result table: one row per (method, section, token, lemma, pos) combination;
# one column per quality dimension is added as the scores are written below.
performances = pd.DataFrame(index=pd.MultiIndex.from_product(
    [["logistic", "svm", "random forest"], ["train", "test"], ["P", "F"], [False, True], [False, True]],
    names=["method", "section", "token", "lemma", "pos"])
)

# Grid over classifier x tokenization ("P"lain / "F"ancy) x lemma x POS.
# NOTE(review): the features built below do not depend on `method`, so they
# are rebuilt once per classifier.
for method in ["logistic", "svm", "random forest"]:
    for token in ["P", "F"]:
        for lemma in [False, True]:
            for pos in [False, True]:
                # Pick the classifier class, the token column and the
                # token-to-string function for this configuration.
                Classifier = {
                    "logistic": LogisticRegression,
                    "svm": SVC,
                    "random forest": RandomForestClassifier
                }[method]

                train_column = TRAIN_DATASET["Plain Words" if token == "P" else "Fancy Words"]
                test_column = TEST_DATASET["Plain Words" if token == "P" else "Fancy Words"]

                tok_to_string = lemmas_to_string if lemma else words_to_string

                # TF-IDF over the whitespace-split token string; the
                # vectorizer is fitted on the training split only and then
                # reused to transform the test split.
                tok_vectorizer = TfidfVectorizer(tokenizer=str.split, stop_words="english")

                train_x = tok_vectorizer.fit_transform(train_column.apply(tok_to_string))
                test_x = tok_vectorizer.transform(test_column.apply(tok_to_string))

                # Optionally append quantile-transformed POS-tag counts as
                # extra feature columns (again fitted on the training split).
                if pos:
                    pos_vectorizer = CountVectorizer(tokenizer=str.split)
                    transformer = QuantileTransformer()

                    train_x = hstack((train_x, transformer.fit_transform(pos_vectorizer.fit_transform(train_column.apply(poses_to_string)))))
                    test_x = hstack((test_x, transformer.transform(pos_vectorizer.transform(test_column.apply(poses_to_string)))))

                # Fit one model per dimension (the 15 columns after the first
                # four) and record its score on both splits.
                # NOTE(review): the classifiers are built with default
                # arguments, so the random forest presumably is not seeded and
                # its scores may vary between runs — confirm.
                for dimension in AGGREGATED.columns[4:4 + 15]:
                    model = Classifier()
                    model.fit(train_x, TRAIN_DATASET[dimension])

                    train_score = model.score(train_x, TRAIN_DATASET[dimension])
                    test_score = model.score(test_x, TEST_DATASET[dimension])

                    performances.loc[(method, "train", token, lemma, pos), dimension] = train_score
                    performances.loc[(method, "test", token, lemma, pos), dimension] = test_score

# Row-wise mean over all dimension columns.
performances["AVG"] = performances.mean(axis=1)

  performances.loc[(method, "train", token, lemma, pos), dimension] = train_score
  performances.loc[(method, "test", token, lemma, pos), dimension] = test_score
  performances.loc[(method, "train", token, lemma, pos), dimension] = train_score
  performances.loc[(method, "test", token, lemma, pos), dimension] = test_score
  performances.loc[(method, "train", token, lemma, pos), dimension] = train_score
  performances.loc[(method, "test", token, lemma, pos), dimension] = test_score
  performances.loc[(method, "train", token, lemma, pos), dimension] = train_score
  performances.loc[(method, "test", token, lemma, pos), dimension] = test_score
  performances.loc[(method, "train", token, lemma, pos), dimension] = train_score
  performances.loc[(method, "test", token, lemma, pos), dimension] = test_score
  performances.loc[(method, "train", token, lemma, pos), dimension] = train_score
  performances.loc[(method, "test", token, lemma, pos), dimension] = test_score
  performances.loc[(method, 

In [None]:
performances.xs("train", level=1).style.format("{:.2%}").background_gradient("Blues", axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV,AVG
method,token,lemma,pos,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
logistic,P,False,False,90.91%,76.03%,88.43%,92.98%,91.32%,89.67%,78.93%,73.97%,75.62%,84.71%,90.08%,83.88%,75.62%,82.64%,90.08%,84.33%
logistic,P,False,True,88.43%,84.71%,89.26%,88.84%,88.02%,90.50%,88.02%,81.40%,83.47%,86.36%,90.50%,86.36%,85.54%,90.08%,90.08%,87.44%
logistic,P,True,False,90.08%,74.38%,88.02%,90.91%,88.84%,90.50%,78.51%,71.49%,75.21%,83.06%,89.67%,80.17%,71.07%,82.23%,90.08%,82.95%
logistic,P,True,True,87.19%,82.64%,88.84%,88.84%,86.78%,89.67%,87.60%,81.40%,83.47%,84.30%,91.74%,85.54%,85.95%,90.50%,89.26%,86.91%
logistic,F,False,False,90.91%,76.86%,88.02%,92.98%,92.15%,89.67%,78.93%,73.97%,74.79%,85.12%,90.08%,81.40%,73.97%,82.64%,90.50%,84.13%
logistic,F,False,True,87.19%,82.23%,89.26%,89.26%,88.02%,89.67%,88.02%,81.40%,82.64%,85.54%,90.50%,86.36%,86.36%,90.50%,90.08%,87.13%
logistic,F,True,False,90.08%,75.62%,86.78%,90.50%,88.43%,89.67%,78.51%,71.90%,73.14%,82.64%,89.26%,80.17%,70.25%,82.64%,89.67%,82.62%
logistic,F,True,True,86.78%,82.64%,88.02%,88.43%,87.19%,89.26%,86.78%,81.82%,82.23%,85.12%,91.74%,85.95%,85.95%,90.08%,89.67%,86.78%
svm,P,False,False,94.21%,95.87%,95.87%,95.04%,95.45%,97.52%,92.56%,95.87%,95.87%,92.15%,94.21%,98.76%,98.35%,97.93%,92.98%,95.51%
svm,P,False,True,74.79%,63.22%,77.69%,82.64%,76.03%,80.58%,78.10%,59.92%,66.53%,71.49%,71.07%,64.46%,60.33%,76.45%,75.21%,71.90%


In [None]:
# Style the test-split scores (percent format, green gradient per column) and
# save the styled table as an HTML file.
test_plot = performances.xs("test", level=1).style.format("{:.2%}").background_gradient("Greens", axis=0)
with open('statistical_approach_test.html', 'w') as f:
    # Styler.to_html() replaces the deprecated Styler.render(), which newer
    # pandas releases no longer provide.
    f.write(test_plot.to_html())

  f.write(test_plot.render())


In [None]:
performances.xs("test", level=1).sort_values("AVG").style.format("{:.2%}").background_gradient("Greens", axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV,AVG
method,token,lemma,pos,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
random forest,F,False,False,55.74%,50.82%,62.30%,62.30%,72.13%,52.46%,75.41%,73.77%,70.49%,68.85%,59.02%,50.82%,50.82%,81.97%,49.18%,62.40%
logistic,P,False,True,55.74%,45.90%,60.66%,72.13%,70.49%,55.74%,72.13%,65.57%,62.30%,70.49%,65.57%,59.02%,49.18%,80.33%,60.66%,63.06%
svm,P,False,True,55.74%,47.54%,55.74%,67.21%,68.85%,52.46%,75.41%,73.77%,73.77%,70.49%,60.66%,52.46%,50.82%,80.33%,62.30%,63.17%
svm,P,True,True,55.74%,47.54%,57.38%,67.21%,68.85%,52.46%,75.41%,73.77%,73.77%,70.49%,62.30%,52.46%,50.82%,80.33%,59.02%,63.17%
random forest,P,False,False,60.66%,45.90%,59.02%,65.57%,68.85%,54.10%,75.41%,73.77%,72.13%,65.57%,65.57%,57.38%,50.82%,80.33%,52.46%,63.17%
random forest,P,True,False,62.30%,49.18%,60.66%,70.49%,70.49%,52.46%,75.41%,73.77%,72.13%,63.93%,59.02%,52.46%,49.18%,80.33%,57.38%,63.28%
svm,F,True,True,55.74%,47.54%,52.46%,68.85%,72.13%,54.10%,75.41%,73.77%,73.77%,68.85%,63.93%,50.82%,50.82%,80.33%,62.30%,63.39%
logistic,P,True,True,57.38%,49.18%,63.93%,72.13%,68.85%,55.74%,73.77%,65.57%,62.30%,70.49%,63.93%,60.66%,49.18%,78.69%,60.66%,63.50%
logistic,F,False,False,60.66%,47.54%,59.02%,67.21%,68.85%,52.46%,75.41%,70.49%,73.77%,68.85%,60.66%,52.46%,49.18%,85.25%,60.66%,63.50%
svm,F,False,True,55.74%,47.54%,52.46%,68.85%,73.77%,54.10%,75.41%,73.77%,73.77%,68.85%,62.30%,50.82%,50.82%,80.33%,63.93%,63.50%


### Considering only the adjectives of each argument

In [None]:
def adj_to_string(x):
    """Return the lower-cased adjectives of a parsed document, space-separated.

    ``x`` must expose ``sentences``; each sentence must expose ``words``, and
    each word must expose ``text`` and ``upos``. Only words whose ``upos`` is
    ``"ADJ"`` are kept, in document order. The result is an empty string when
    the document contains no adjective.
    """
    adjectives = []
    for sentence in x.sentences:
        for word in sentence.words:
            if word.upos == "ADJ":
                adjectives.append(word.text.lower())
    return " ".join(adjectives)
# Reduce every parsed document to a space-separated string of its adjectives.
sentences = AGGREGATED["Document"].apply(adj_to_string)
# Display the resulting Series.
sentences

0      wrong naked much stupid wrong younger uncomfor...
1                                     unusable least new
2                                                       
3      obvious avaible other significant different be...
4                   old more less such young better best
                             ...                        
298                                             key same
299                                                     
300    strong military bad territorial tense due terr...
301                          important positive negative
302                 mandatory active involved good other
Name: Document, Length: 303, dtype: object

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Target: the aggregated EM label of each argument.
Y = AGGREGATED["EM"]

# Bag-of-adjectives counts. The adjective strings are split on whitespace,
# and adjectives occurring in fewer than 5 documents are dropped.
vectorizer = CountVectorizer(min_df=5, tokenizer=str.split)
X = vectorizer.fit_transform(sentences)




In [None]:
# Fit a default logistic regression on the adjective counts, then report its
# accuracy on the very same data it was fitted on (not a held-out score).
model = LogisticRegression().fit(X, Y)

model.score(X, Y)

0.7953795379537953

In [None]:
# Shape of the fitted coefficient matrix.
model.coef_.shape

(3, 51)

In [None]:
# Pull out the first three coefficient rows, in the order they appear in coef_.
level_1, level_2, level_3 = (model.coef_[row] for row in range(3))

In [None]:
# Relative frequency of each EM label.
Y.value_counts(normalize=True)

2    0.775578
1    0.155116
3    0.069307
Name: EM, dtype: float64

In [None]:
# Vocabulary terms ordered by their column index in the count matrix, so that
# position i lines up with coefficient i.
adj = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Pair every adjective with its level_3 coefficient and list the pairs from
# the most negative weight to the most positive one.
adj_weight = {term: weight for term, weight in zip(adj, level_3)}
sorted(adj_weight.items(), key=lambda pair: pair[1])

[('right', -0.5724581107304583),
 ('much', -0.4893313943945828),
 ('best', -0.4399044782956019),
 ('other', -0.43936750426715143),
 ('married', -0.40599306427492954),
 ('bad', -0.3766454188809203),
 ('different', -0.35504787381739766),
 ('scientific', -0.29622515253204773),
 ('common', -0.2951293714246473),
 ('unborn', -0.2744693274951817),
 ('human', -0.24112641215279956),
 ('little', -0.24098577795741882),
 ('long', -0.23024469631108688),
 ('equal', -0.21479448522851072),
 ('such', -0.20833531006904674),
 ('least', -0.2002910428069758),
 ('new', -0.19498927355869258),
 ('physical', -0.18101024371275906),
 ('certain', -0.17624995908995905),
 ('wrong', -0.17609355039560615),
 ('real', -0.1751677784175516),
 ('selfish', -0.16789554467514922),
 ('high', -0.12772461847577568),
 ('likely', -0.12741425599392156),
 ('easy', -0.125612495674967),
 ('able', -0.1248812268934567),
 ('easier', -0.1130872952504821),
 ('good', -0.103591517178394),
 ('mandatory', -0.08705690883111065),
 ('big', -0.06

### Evaluate argument quality based on different POS tags in arguments

In [None]:
def POS_tags_to_string(x, tag: str):
    """Return the lower-cased words of ``x`` whose ``upos`` equals ``tag``.

    ``x`` must expose ``sentences``; each sentence must expose ``words``, and
    each word must expose ``text`` and ``upos``. Matching words are joined
    with single spaces in document order. The result is an empty string when
    nothing matches.
    """
    pieces: list[str] = []
    for sent in x.sentences:
        pieces.extend(tok.text.lower() for tok in sent.words if tok.upos == tag)
    return " ".join(pieces)


In [None]:
# Accuracy table with one row per (method, section, POS tag). The quality
# dimension columns are added while the loop below runs.
pos_tags = ["ADJ", "ADV", "INTJ", "NOUN", "PROPN", "VERB"]
pos_performances = pd.DataFrame(index=pd.MultiIndex.from_product(
    [["logistic"], ["train", "test"], pos_tags],
    names=["method", "section", "pos"])
)

for method in ["logistic"]:
    # look up the classifier class for this method
    Classifier = {
        "logistic": LogisticRegression
    }[method]

    train_column = TRAIN_DATASET["Document"]
    test_column = TEST_DATASET["Document"]

    train_text = TRAIN_DATASET["Plain Words"]
    test_text = TEST_DATASET["Plain Words"]

    # Baseline features shared by every POS experiment: TF-IDF over the
    # strings produced by words_to_string (defined elsewhere in the notebook).
    tok_vectorizer = TfidfVectorizer(tokenizer=str.split, stop_words="english")

    base_train_x = tok_vectorizer.fit_transform(train_text.apply(words_to_string))
    base_test_x = tok_vectorizer.transform(test_text.apply(words_to_string))

    for pos in pos_tags:
        # Counts of the words carrying this POS tag, quantile-transformed.
        # The vectorizer and transformer are fitted on the training split only.
        pos_vectorizer = CountVectorizer(tokenizer=str.split)
        transformer = QuantileTransformer()

        train_pos_x = transformer.fit_transform(
            pos_vectorizer.fit_transform(train_column.apply(POS_tags_to_string, tag=pos))
        )
        test_pos_x = transformer.transform(
            pos_vectorizer.transform(test_column.apply(POS_tags_to_string, tag=pos))
        )

        # Stack onto the untouched baseline. Overwriting the baseline here
        # would carry the features of all previous tags into this iteration,
        # so the row labelled e.g. "NOUN" would really measure
        # ADJ + ADV + INTJ + NOUN.
        train_x = hstack((base_train_x, train_pos_x))
        test_x = hstack((base_test_x, test_pos_x))

        # training and evaluating, one fresh classifier per quality dimension
        for dimension in AGGREGATED.columns[4:4 + 15]:
            model = Classifier()
            model.fit(train_x, TRAIN_DATASET[dimension])

            train_score = model.score(train_x, TRAIN_DATASET[dimension])
            test_score = model.score(test_x, TEST_DATASET[dimension])

            pos_performances.loc[(method, "train", pos), dimension] = train_score
            pos_performances.loc[(method, "test", pos), dimension] = test_score

# Row-wise mean over all dimension columns.
pos_performances["AVG"] = pos_performances.mean(axis=1)



In [None]:
# Preview the adjective-only string of every document.
AGGREGATED["Document"].apply(POS_tags_to_string, args=("ADJ",))

0      wrong naked much stupid wrong younger uncomfor...
1                                     unusable least new
2                                                       
3      obvious avaible other significant different be...
4                   old more less such young better best
                             ...                        
298                                             key same
299                                                     
300    strong military bad territorial tense due terr...
301                          important positive negative
302                 mandatory active involved good other
Name: Document, Length: 303, dtype: object

In [None]:
# Training-split accuracies per POS tag, shown as percentages with a
# per-column blue gradient.
(
    pos_performances.xs("train", level=1)
    .style.format("{:.2%}")
    .background_gradient("Blues", axis=0)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV,AVG
method,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
logistic,ADJ,99.17%,96.69%,96.28%,97.11%,98.35%,98.35%,95.87%,93.80%,95.04%,98.76%,99.17%,98.76%,96.28%,98.35%,98.76%,97.38%
logistic,ADV,98.76%,99.59%,99.17%,98.35%,98.76%,100.00%,97.93%,97.11%,98.76%,99.17%,99.59%,99.59%,98.76%,99.59%,99.17%,98.95%
logistic,INTJ,98.76%,100.00%,99.59%,98.76%,98.76%,100.00%,97.93%,97.52%,99.17%,98.76%,99.59%,99.59%,98.76%,99.59%,98.76%,99.04%
logistic,NOUN,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,99.59%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,99.97%
logistic,PROPN,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,99.59%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,99.97%
logistic,VERB,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%


In [None]:
# Test-split accuracies per POS tag, shown as percentages with a
# per-column green gradient.
(
    pos_performances.xs("test", level=1)
    .style.format("{:.2%}")
    .background_gradient("Greens", axis=0)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV,AVG
method,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
logistic,ADJ,65.57%,49.18%,59.02%,62.30%,63.93%,47.54%,75.41%,72.13%,77.05%,55.74%,50.82%,50.82%,47.54%,81.97%,55.74%,60.98%
logistic,ADV,68.85%,54.10%,59.02%,78.69%,73.77%,57.38%,73.77%,70.49%,65.57%,67.21%,55.74%,57.38%,42.62%,81.97%,68.85%,65.03%
logistic,INTJ,70.49%,54.10%,59.02%,80.33%,75.41%,57.38%,73.77%,73.77%,65.57%,63.93%,55.74%,57.38%,45.90%,81.97%,68.85%,65.57%
logistic,NOUN,67.21%,52.46%,60.66%,70.49%,70.49%,59.02%,77.05%,67.21%,62.30%,60.66%,60.66%,52.46%,47.54%,85.25%,59.02%,63.50%
logistic,PROPN,67.21%,49.18%,57.38%,72.13%,72.13%,59.02%,75.41%,65.57%,63.93%,63.93%,63.93%,50.82%,47.54%,81.97%,57.38%,63.17%
logistic,VERB,60.66%,59.02%,57.38%,77.05%,73.77%,57.38%,73.77%,65.57%,67.21%,63.93%,72.13%,54.10%,54.10%,81.97%,59.02%,65.14%


### Evaluate the weight of each POS tag on argument quality

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Estimate how strongly each POS tag count pulls towards the third class of
# every argument quality dimension, using logistic regression coefficients.

train_pos = TRAIN_DATASET["Plain Words"]
test_pos = TEST_DATASET["Plain Words"]

# Keep only the POS tags that occur in at least 60% of the training documents.
pos_vectorizer = CountVectorizer(tokenizer=str.split, min_df=0.6)

train_x = pos_vectorizer.fit_transform(train_pos.apply(poses_to_string))
test_x = pos_vectorizer.transform(test_pos.apply(poses_to_string))

# Vocabulary terms ordered by column index so they line up with coef_ columns.
# This does not depend on the dimension, so it is computed once.
pos = sorted(pos_vectorizer.vocabulary_, key=pos_vectorizer.vocabulary_.get)

pos_weight_dict = {}

for dimensions in AGGREGATED.columns[4:4 + 15]:
    # The default iteration budget is not enough on these unscaled counts:
    # the solver stops with "TOTAL NO. of ITERATIONS REACHED LIMIT", leaving
    # unconverged coefficients. Give it room to converge.
    model = LogisticRegression(max_iter=1000)
    model.fit(train_x, TRAIN_DATASET[dimensions])

    # Third coefficient row. NOTE(review): this assumes all three labels occur
    # in the training split for every dimension - confirm.
    level_3 = model.coef_[2]

    pos_weight = dict(zip(pos, level_3))
    # (tag, weight) pairs ordered from the most negative to the most positive.
    pos_weight_dict[dimensions] = sorted(pos_weight.items(), key=lambda x: x[1])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Gather the (tag, weight) pairs of every dimension into one table with a row
# per quality dimension and a column per POS tag. No rescaling is applied.
df_pos_weight_dict = pd.DataFrame.from_dict(
    {dimension: dict(pairs) for dimension, pairs in pos_weight_dict.items()},
    orient="index",
)

In [None]:
# Transposed view (POS tags as rows), formatted as percentages with a
# per-column green gradient.
(
    df_pos_weight_dict.T
    .style.format("{:.2%}")
    .background_gradient("Greens", axis=0)
)

Unnamed: 0,CO,LA,LR,LS,EF,CR,EM,CL,AP,AR,RE,GA,GR,GS,OV
adv,-9.54%,-6.82%,2.81%,1.64%,-1.45%,5.42%,-1.86%,-12.03%,2.98%,-22.76%,-19.68%,1.94%,-8.56%,43.94%,-16.86%
det,-4.71%,-3.17%,7.20%,-5.99%,12.79%,5.69%,-3.16%,19.72%,2.35%,22.74%,-5.11%,-2.19%,11.11%,11.03%,7.34%
part,-2.08%,0.21%,-0.66%,4.50%,6.10%,28.81%,-22.08%,-10.88%,-5.57%,-16.13%,4.87%,0.41%,-25.03%,22.70%,4.30%
pron,-0.23%,-6.26%,-4.67%,-1.29%,-3.31%,-11.88%,3.39%,2.84%,-3.21%,6.86%,-7.64%,-10.25%,-5.10%,-27.85%,-1.14%
verb,0.48%,7.06%,8.52%,-7.28%,-5.92%,-7.85%,8.46%,-11.78%,-4.93%,-10.73%,-1.81%,5.71%,9.35%,-28.38%,-6.08%
punct,2.87%,-1.22%,-8.39%,7.00%,1.86%,22.15%,5.21%,4.32%,2.69%,5.28%,1.54%,0.56%,0.08%,5.25%,6.60%
cconj,3.99%,8.34%,11.26%,20.17%,17.47%,22.38%,-4.32%,2.79%,9.11%,15.19%,29.05%,18.20%,17.74%,20.75%,13.46%
noun,4.43%,3.97%,3.87%,-0.29%,2.20%,-8.40%,-1.41%,1.55%,7.81%,0.80%,3.97%,4.68%,1.91%,9.44%,6.13%
adj,6.36%,2.17%,-3.04%,7.47%,-6.14%,1.34%,6.92%,0.54%,-7.82%,9.92%,-3.97%,-4.97%,3.64%,-15.91%,4.77%
sconj,6.71%,6.62%,23.20%,16.20%,1.75%,3.98%,-4.80%,21.29%,30.75%,2.85%,26.41%,18.73%,22.18%,-131.16%,16.63%
