In [330]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

In [343]:
path = "arguments-training.tsv"
df = pd.read_table(path, sep = "\t")
df.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise
0,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...
1,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...
2,A01006,We should end the use of economic sanctions,against,sometimes economic sanctions are the only thin...
3,A01007,We should abolish capital punishment,against,capital punishment is sometimes the only optio...
4,A01008,We should ban factory farming,against,factory farming allows for the production of c...


In [344]:
# renaming the Argument ID column so that there is no space
df.rename(columns={"Argument ID": "Argument_ID"}, inplace=True)

In [345]:
# we drop the row with no labels
df.drop(index=3358, inplace=True)

In [346]:
# reset the row indices
df=df.reset_index(drop=True)

In [335]:
nlp = spacy.load("en_core_web_sm")

In [336]:

def preprocess_text(text):
    doc = nlp(text)
    # set lowercase
    processed_text = [token.text.lower()for token in doc]
    return processed_text

In [337]:
# we pre-process the arguments with spacy
df["premise_tokenized"] = df["Premise"].apply(preprocess_text)
df.head()

Unnamed: 0,Argument_ID,Conclusion,Stance,Premise,premise_tokenized
0,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,"[we, should, ban, human, cloning, as, it, will..."
1,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,"[fast, food, should, be, banned, because, it, ..."
2,A01006,We should end the use of economic sanctions,against,sometimes economic sanctions are the only thin...,"[sometimes, economic, sanctions, are, the, onl..."
3,A01007,We should abolish capital punishment,against,capital punishment is sometimes the only optio...,"[capital, punishment, is, sometimes, the, only..."
4,A01008,We should ban factory farming,against,factory farming allows for the production of c...,"[factory, farming, allows, for, the, productio..."


In [338]:
documents = [[doc for doc in row.premise_tokenized] for idx, row in df.iterrows()]

In [339]:
from gensim.models.phrases import Phrases, Phraser

# Train a bigram model
bigram = Phrases(documents, min_count=20, threshold=100) 
# we tuned the parameters so that the bigrams are the most informative and formed mostly by full words
bigram_mod = Phraser(bigram)

# Apply the trained bigram model to each document
documents_with_bigrams = [bigram_mod[doc] for doc in documents]
documents_with_bigrams

2023-12-06 19:00:07,233 : INFO : collecting all words and their counts
2023-12-06 19:00:07,234 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2023-12-06 19:00:07,340 : INFO : collected 60123 token types (unigram + bigrams) from a corpus of 127374 words and 5392 sentences
2023-12-06 19:00:07,340 : INFO : merged Phrases<60123 vocab, min_count=20, threshold=100, max_vocab_size=40000000>
2023-12-06 19:00:07,341 : INFO : Phrases lifecycle event {'msg': 'built Phrases<60123 vocab, min_count=20, threshold=100, max_vocab_size=40000000> in 0.11s', 'datetime': '2023-12-06T19:00:07.341767', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-12-06 19:00:07,349 : INFO : exporting phrases from Phrases<60123 vocab, min_count=20, threshold=100, max_vocab_size=40000000>
2023-12-06 19:00:07,412 : INFO : FrozenPhrases lifecycle event {'msg': 'exported Froz

[['we',
  'should',
  'ban',
  'human_cloning',
  'as',
  'it',
  'will',
  'only',
  'cause',
  'huge',
  'issues',
  'when',
  'you',
  'have',
  'a',
  'bunch',
  'of',
  'the',
  'same',
  'humans',
  'running',
  'around',
  'all',
  'acting',
  'the',
  'same',
  '.'],
 ['fast_food',
  'should',
  'be',
  'banned',
  'because',
  'it',
  'is',
  'really',
  'bad',
  'for',
  'your',
  'health',
  'and',
  'is',
  'costly',
  '.'],
 ['sometimes',
  'economic_sanctions',
  'are',
  'the',
  'only',
  'thing',
  'that',
  'will',
  'get',
  'the',
  'corrupt',
  'governments',
  'to',
  'take',
  'action'],
 ['capital_punishment',
  'is',
  'sometimes',
  'the',
  'only',
  'option',
  'to',
  'keep',
  'criminals',
  'from',
  'committing',
  'more',
  'crimes',
  '.'],
 ['factory_farming',
  'allows',
  'for',
  'the',
  'production',
  'of',
  'cheap',
  'food',
  ',',
  'which',
  'is',
  'a',
  'necessity',
  'for',
  'families',
  'surviving',
  'on',
  'a',
  'low',
  'income

**To evaluate the doc2vec vectors we will compute the cosine similarity between the arguments with the same value, but first we must generalize the labels**

In [347]:
path = "labels-training.tsv"
labels_df = pd.read_table(path, sep = "\t")
labels_df.head()

Unnamed: 0,Argument ID,Self-direction: thought,Self-direction: action,Stimulation,Hedonism,Achievement,Power: dominance,Power: resources,Face,Security: personal,...,Tradition,Conformity: rules,Conformity: interpersonal,Humility,Benevolence: caring,Benevolence: dependability,Universalism: concern,Universalism: nature,Universalism: tolerance,Universalism: objectivity
0,A01002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A01005,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,A01006,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A01007,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,A01008,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0


In [348]:
# we drop the row with no labels
labels_df.drop(index=3358, inplace=True)

In [349]:
# reset the row indices
labels_df=labels_df.reset_index(drop=True)

In [350]:
# we generalize labels according to the following dictionary
general_dictionary={
    "Self-direction: thought":"Openness to change",
    "Self-direction: action":"Openness to change",
    "Stimulation":"Openness to change",
    "Hedonism":"Openness to change",
    "Achievement":"Self-Enhancement",
    "Power: dominance":"Self-Enhancement",
    "Power: resources":"Self-Enhancement",
    "Face":"Self-Enhancement",
    "Security: personal":"Conservation",
    "Security: societal":"Conservation",
    "Tradition":"Conservation",
    "Conformity: rules":"Conservation",
    "Conformity: interpersonal":"Conservation",
    "Humility":"Conservation",
    "Benevolence: caring":"Self-Transcendence",
    "Benevolence: dependability":"Self-Transcendence",
    "Universalism: concern":"Self-Transcendence",
    "Universalism: nature":"Self-Transcendence",
    "Universalism: tolerance":"Self-Transcendence",
    "Universalism: objectivity":"Self-Transcendence"
    
}

In [351]:
labels_df["Openness to change"]=labels_df["Self-direction: thought"]+labels_df["Self-direction: action"]+labels_df["Stimulation"]+labels_df["Hedonism"]

In [352]:
labels_df["Self-Enhancement"]=labels_df["Achievement"]+labels_df["Power: dominance"]+labels_df["Power: resources"]+labels_df["Face"]

In [353]:
labels_df["Conservation"]=labels_df["Security: personal"]+labels_df["Security: societal"]+labels_df["Tradition"]+labels_df["Conformity: rules"]+labels_df["Conformity: interpersonal"]+labels_df["Humility"]

In [354]:
labels_df["Self-Transcendence"]=labels_df["Benevolence: caring"]+labels_df["Benevolence: dependability"]+labels_df["Universalism: concern"]+labels_df["Universalism: nature"]+labels_df["Universalism: tolerance"]+labels_df["Universalism: objectivity"]

Now that the new features are created, the original ones can be dropped

In [355]:
for key in general_dictionary:
    if key!=general_dictionary[key]:
        labels_df=labels_df.drop(key, axis=1)
    else:
        pass

In [356]:
labels_df

Unnamed: 0,Argument ID,Openness to change,Self-Enhancement,Conservation,Self-Transcendence
0,A01002,0,0,1,0
1,A01005,0,0,1,0
2,A01006,0,1,1,0
3,A01007,0,0,2,1
4,A01008,0,0,1,2
...,...,...,...,...,...
5387,E08016,0,2,1,1
5388,E08017,0,0,2,3
5389,E08018,0,0,0,2
5390,E08019,0,0,3,3


In [357]:
# extract the generalized label columns and store them in a separate DataFrame
label_columns=labels_df[[
"Openness to change",
"Self-Enhancement",
"Conservation",
"Self-Transcendence"]]

# convert the label columns to a multi-class format (one-hot encoding)
labels_df['general_label'] = label_columns.apply(lambda row: row.to_list(), axis=1)
labels_df

Unnamed: 0,Argument ID,Openness to change,Self-Enhancement,Conservation,Self-Transcendence,general_label
0,A01002,0,0,1,0,"[0, 0, 1, 0]"
1,A01005,0,0,1,0,"[0, 0, 1, 0]"
2,A01006,0,1,1,0,"[0, 1, 1, 0]"
3,A01007,0,0,2,1,"[0, 0, 2, 1]"
4,A01008,0,0,1,2,"[0, 0, 1, 2]"
...,...,...,...,...,...,...
5387,E08016,0,2,1,1,"[0, 2, 1, 1]"
5388,E08017,0,0,2,3,"[0, 0, 2, 3]"
5389,E08018,0,0,0,2,"[0, 0, 0, 2]"
5390,E08019,0,0,3,3,"[0, 0, 3, 3]"


In [359]:
final=[]

for sublist in labels_df["general_label"]:
    sum_values=sum(sublist)
    probability_distribution=[value/sum_values for value in sublist]
    final.append(probability_distribution)

In [360]:
labels_df["general_label"]=final

In [361]:
labels_df

Unnamed: 0,Argument ID,Openness to change,Self-Enhancement,Conservation,Self-Transcendence,general_label
0,A01002,0,0,1,0,"[0.0, 0.0, 1.0, 0.0]"
1,A01005,0,0,1,0,"[0.0, 0.0, 1.0, 0.0]"
2,A01006,0,1,1,0,"[0.0, 0.5, 0.5, 0.0]"
3,A01007,0,0,2,1,"[0.0, 0.0, 0.6666666666666666, 0.3333333333333..."
4,A01008,0,0,1,2,"[0.0, 0.0, 0.3333333333333333, 0.6666666666666..."
...,...,...,...,...,...,...
5387,E08016,0,2,1,1,"[0.0, 0.5, 0.25, 0.25]"
5388,E08017,0,0,2,3,"[0.0, 0.0, 0.4, 0.6]"
5389,E08018,0,0,0,2,"[0.0, 0.0, 0.0, 1.0]"
5390,E08019,0,0,3,3,"[0.0, 0.0, 0.5, 0.5]"


Now we can implement a gridsearch to find the best parameters for doc2vec

In [431]:
from gensim.models import Doc2Vec

In [432]:
from gensim.models.doc2vec import TaggedDocument

In [362]:
from sklearn.model_selection import train_test_split

In [212]:
# we create a list of labels that represent our values per argument
values = [[l_values for l_values in row.general_label] for id, row in labels_df.iterrows()]

In [222]:
# we split into training and test to find the best params for doc2vec
train_docs, test_docs, train_values, test_values = train_test_split(documents_with_bigrams, values, test_size=0.2)

In [223]:

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to evaluate similarity
def evaluate_similarity(model, documents, values):
    """function that evaluates the mean similarity score, given a doc2vec model,
    of the vectors that have the same labels"""
    
    doc_vectors = [model.infer_vector(doc) for doc in documents]
    similarity_scores = []

    for i in range(len(documents)):
        for j in range(i + 1, len(documents)):
            if values[i] == values[j]:
                sim_score = cosine_similarity([doc_vectors[i]], [doc_vectors[j]])[0][0]
                similarity_scores.append(sim_score)

    return np.mean(similarity_scores)

In [224]:
# we implement a gridsearch for the parameters 
param_grid = {
    'vector_size': [50, 100, 200],
    'dm': [1, 0],
    'window': [5, 7, 10],
    'min_count': [5, 7, 9],
    'epochs': [10, 30, 50],
    "negative": [5, 10, 15]
}

In [226]:
def train_evaluate_doc2vec(params, train_docs, test_docs, test_values):
    """function that returns the evaluation of a doc2vec model, given the
    parameters, the documents for the training, the documents for testing
    and the test labels"""
    
    tagged_train_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_docs)]
    model = Doc2Vec(documents=tagged_train_docs,
                    vector_size=params['vector_size'],
                    dm = params["dm"], 
                    window=params['window'], 
                    min_count=params['min_count'], 
                    epochs=params['epochs'],
                    negative = params["negative"])

    return evaluate_similarity(model, test_docs, test_values)

In [227]:
from tqdm import tqdm

In [228]:
from itertools import product

best_score = 0
best_params = None

# Create all combinations of parameters
param_combinations = [dict(zip(param_grid.keys(), v)) for v in product(*param_grid.values())]

for params in tqdm(param_combinations):
    score = train_evaluate_doc2vec(params, train_docs, test_docs, test_values)
    if score > best_score:
        best_score = score
        best_params = params

print("Best Score:", best_score)
print("Best Parameters:", best_params)

  0%|          | 0/486 [00:00<?, ?it/s]2023-12-06 14:13:41,362 : INFO : collecting all words and their counts
2023-12-06 14:13:41,363 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-12-06 14:13:41,384 : INFO : collected 7125 word types and 4313 unique tags from a corpus of 4313 examples and 99524 words
2023-12-06 14:13:41,385 : INFO : Creating a fresh vocabulary
2023-12-06 14:13:41,390 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1966 unique words (27.59% of original 7125, drops 5159)', 'datetime': '2023-12-06T14:13:41.390055', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-12-06 14:13:41,391 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 90923 word corpus (91.36% of original 99524, drops 8601)', 'datetime': '2023-12-06T14:13:41.391054', 'gensim': '4.3.1', '

Best Score: 0.99911195
Best Parameters: {'vector_size': 200, 'dm': 0, 'window': 7, 'min_count': 5, 'epochs': 10, 'negative': 10}





Now we train the model with the best parameters

In [386]:
tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents_with_bigrams)]

In [388]:
# implementing the model with the best configuration of parametes
doc2vec_final = Doc2Vec(vector_size = 200,
                    dm = 0, 
                    window = 7, 
                    min_count= 5, 
                    epochs= 10,
                    negative = 10)

2023-12-06 19:07:13,158 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dbow,d200,n10,mc5,s0.001,t3>', 'datetime': '2023-12-06T19:07:13.158389', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [389]:
# building the vocabulary
doc2vec_final.build_vocab(tagged_docs)

2023-12-06 19:07:13,553 : INFO : collecting all words and their counts
2023-12-06 19:07:13,553 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-12-06 19:07:13,573 : INFO : collected 7871 word types and 5392 unique tags from a corpus of 5392 examples and 124966 words
2023-12-06 19:07:13,574 : INFO : Creating a fresh vocabulary
2023-12-06 19:07:13,580 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2275 unique words (28.90% of original 7871, drops 5596)', 'datetime': '2023-12-06T19:07:13.580047', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-12-06 19:07:13,581 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 115612 word corpus (92.51% of original 124966, drops 9354)', 'datetime': '2023-12-06T19:07:13.581048', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a45

In [390]:
# training the model
doc2vec_final.train(tagged_docs, total_examples=doc2vec_final.corpus_count, epochs=doc2vec_final.epochs)

2023-12-06 19:07:13,901 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 2275 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=10 window=7 shrink_windows=True', 'datetime': '2023-12-06T19:07:13.901084', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-12-06 19:07:14,059 : INFO : EPOCH 0: training on 124966 raw words (83737 effective words) took 0.2s, 544776 effective words/s
2023-12-06 19:07:14,212 : INFO : EPOCH 1: training on 124966 raw words (83826 effective words) took 0.1s, 562677 effective words/s
2023-12-06 19:07:14,364 : INFO : EPOCH 2: training on 124966 raw words (83706 effective words) took 0.1s, 567348 effective words/s
2023-12-06 19:07:14,521 : INFO : EPOCH 3: training on 124966 raw words (83753 effective words) took 0.2s, 543556 effective words/s
2023-12-06 19:07:14,676 : INFO : EPOCH 4: training on 124

In [419]:
%store -r doc_topics

In [421]:
# we make sure that the list of topic vectors and the list of document vectors have the same size
print(len(doc_topics))
print(len(tagged_docs))

5392
5392


In [426]:
# we transform the doc2vec object in a numpy array of vectors
doc2vec_vec = []
for i in range(len(tagged_docs)):
    doc2vec_vec.append(doc2vec_final[i])

doc2vec_vec = np.array(doc_topic_vec)

In [428]:
# we can now concatenate the topic probability vectors and the doc2vec vectors for each document

doc_topic_vec = np.array([np.concatenate([vec1, vec2]) for vec1, vec2 in zip(doc_topics, doc2vec_vec)])
len(doc_topic_vec)

5392

In [429]:
%store doc_topic_vec

Stored 'doc_topic_vec' (ndarray)


In [430]:
%store labels_df

Stored 'labels_df' (DataFrame)
