In [35]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

In [39]:
path = "arguments-training.tsv"
df = pd.read_table(path, sep = "\t")
df.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise
0,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...
1,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...
2,A01006,We should end the use of economic sanctions,against,sometimes economic sanctions are the only thin...
3,A01007,We should abolish capital punishment,against,capital punishment is sometimes the only optio...
4,A01008,We should ban factory farming,against,factory farming allows for the production of c...


In [44]:
# renaming the Argument ID column so that there is no space
df.rename(columns={"Argument ID": "Argument_ID"}, inplace=True)

In [40]:
nlp = spacy.load("en_core_web_sm")

In [41]:
def preprocess_text(text):
    doc = nlp(text)
    # set lowercase
    processed_text = [token.text.lower()for token in doc]
    return processed_text

In [42]:
df["premise_tokenized"] = df["Premise"].apply(preprocess_text)
df.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,premise_tokenized
0,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,"[we, should, ban, human, cloning, as, it, will..."
1,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,"[fast, food, should, be, banned, because, it, ..."
2,A01006,We should end the use of economic sanctions,against,sometimes economic sanctions are the only thin...,"[sometimes, economic, sanctions, are, the, onl..."
3,A01007,We should abolish capital punishment,against,capital punishment is sometimes the only optio...,"[capital, punishment, is, sometimes, the, only..."
4,A01008,We should ban factory farming,against,factory farming allows for the production of c...,"[factory, farming, allows, for, the, productio..."


In [49]:
documents = [[doc for doc in row.premise_tokenized] for idx, row in df.iterrows()]

In [60]:
from gensim.models.phrases import Phrases, Phraser

# Train a bigram model
bigram = Phrases(documents, min_count=20, threshold=100) 
# we tuned the parameters so that the bigrams are the most informative and formed mostly by full words
bigram_mod = Phraser(bigram)

# Apply the trained bigram model to each document
documents_with_bigrams = [bigram_mod[doc] for doc in documents]
documents_with_bigrams

2023-12-05 16:17:22,385 : INFO : collecting all words and their counts
2023-12-05 16:17:22,386 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2023-12-05 16:17:22,520 : INFO : collected 60132 token types (unigram + bigrams) from a corpus of 127401 words and 5393 sentences
2023-12-05 16:17:22,521 : INFO : merged Phrases<60132 vocab, min_count=20, threshold=100, max_vocab_size=40000000>
2023-12-05 16:17:22,522 : INFO : Phrases lifecycle event {'msg': 'built Phrases<60132 vocab, min_count=20, threshold=100, max_vocab_size=40000000> in 0.14s', 'datetime': '2023-12-05T16:17:22.522450', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-12-05 16:17:22,523 : INFO : exporting phrases from Phrases<60132 vocab, min_count=20, threshold=100, max_vocab_size=40000000>
2023-12-05 16:17:22,587 : INFO : FrozenPhrases lifecycle event {'msg': 'exported Froz

[['we',
  'should',
  'ban',
  'human_cloning',
  'as',
  'it',
  'will',
  'only',
  'cause',
  'huge',
  'issues',
  'when',
  'you',
  'have',
  'a',
  'bunch',
  'of',
  'the',
  'same',
  'humans',
  'running',
  'around',
  'all',
  'acting',
  'the',
  'same',
  '.'],
 ['fast_food',
  'should',
  'be',
  'banned',
  'because',
  'it',
  'is',
  'really',
  'bad',
  'for',
  'your',
  'health',
  'and',
  'is',
  'costly',
  '.'],
 ['sometimes',
  'economic_sanctions',
  'are',
  'the',
  'only',
  'thing',
  'that',
  'will',
  'get',
  'the',
  'corrupt',
  'governments',
  'to',
  'take',
  'action'],
 ['capital_punishment',
  'is',
  'sometimes',
  'the',
  'only',
  'option',
  'to',
  'keep',
  'criminals',
  'from',
  'committing',
  'more',
  'crimes',
  '.'],
 ['factory_farming',
  'allows',
  'for',
  'the',
  'production',
  'of',
  'cheap',
  'food',
  ',',
  'which',
  'is',
  'a',
  'necessity',
  'for',
  'families',
  'surviving',
  'on',
  'a',
  'low',
  'income

In [62]:
from gensim.models.doc2vec import TaggedDocument

In [61]:
# creating the ids for the documents from the Argument ID column
ids = []
for idx, row in df.iterrows():
    ids.append(row.Argument_ID)

# creating the data in the format specified by the gensim documentation
data = []
for doc, id in zip(documents_with_bigrams, ids):
    data.append(TaggedDocument(doc, id))
data

[TaggedDocument(words=['we', 'should', 'ban', 'human_cloning', 'as', 'it', 'will', 'only', 'cause', 'huge', 'issues', 'when', 'you', 'have', 'a', 'bunch', 'of', 'the', 'same', 'humans', 'running', 'around', 'all', 'acting', 'the', 'same', '.'], tags='A01002'),
 TaggedDocument(words=['fast_food', 'should', 'be', 'banned', 'because', 'it', 'is', 'really', 'bad', 'for', 'your', 'health', 'and', 'is', 'costly', '.'], tags='A01005'),
 TaggedDocument(words=['sometimes', 'economic_sanctions', 'are', 'the', 'only', 'thing', 'that', 'will', 'get', 'the', 'corrupt', 'governments', 'to', 'take', 'action'], tags='A01006'),
 TaggedDocument(words=['capital_punishment', 'is', 'sometimes', 'the', 'only', 'option', 'to', 'keep', 'criminals', 'from', 'committing', 'more', 'crimes', '.'], tags='A01007'),
 TaggedDocument(words=['factory_farming', 'allows', 'for', 'the', 'production', 'of', 'cheap', 'food', ',', 'which', 'is', 'a', 'necessity', 'for', 'families', 'surviving', 'on', 'a', 'low', 'income', '.

In [47]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [63]:
from gensim.models import Doc2Vec

In [72]:
doc2vec_dm = Doc2Vec(data, vector_size=100, window=10, min_count=5, dm=1)

2023-12-05 16:32:03,387 : INFO : collecting all words and their counts
2023-12-05 16:32:03,388 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-12-05 16:32:03,415 : INFO : collected 7871 word types and 13 unique tags from a corpus of 5393 examples and 124992 words
2023-12-05 16:32:03,416 : INFO : Creating a fresh vocabulary
2023-12-05 16:32:03,422 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2275 unique words (28.90% of original 7871, drops 5596)', 'datetime': '2023-12-05T16:32:03.422170', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-12-05 16:32:03,423 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 115638 word corpus (92.52% of original 124992, drops 9354)', 'datetime': '2023-12-05T16:32:03.423458', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f

In [73]:
doc2vec_dbow = Doc2Vec(data, vector_size=100, window=10, min_count=5, dm=0)

2023-12-05 16:32:11,273 : INFO : collecting all words and their counts
2023-12-05 16:32:11,274 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-12-05 16:32:11,314 : INFO : collected 7871 word types and 13 unique tags from a corpus of 5393 examples and 124992 words
2023-12-05 16:32:11,315 : INFO : Creating a fresh vocabulary
2023-12-05 16:32:11,325 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2275 unique words (28.90% of original 7871, drops 5596)', 'datetime': '2023-12-05T16:32:11.325029', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-12-05 16:32:11,326 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 115638 word corpus (92.52% of original 124992, drops 9354)', 'datetime': '2023-12-05T16:32:11.326538', 'gensim': '4.3.1', 'python': '3.11.1 (tags/v3.11.1:a7a450f

In [74]:
doc2vec_dm.wv.most_similar(['human_cloning'])

[('subsidized', 0.9850350022315979),
 ('it', 0.969832181930542),
 ('church', 0.9686174988746643),
 ('targeted_killing', 0.9653472304344177),
 ('prayer', 0.9646620154380798),
 ('inhumane', 0.963864803314209),
 ('private_military', 0.958096981048584),
 ('limit', 0.9560912847518921),
 ('therefore', 0.9543260335922241),
 ('strikes_laws', 0.9541164636611938)]

In [75]:
doc2vec_dbow.wv.most_similar(['human_cloning'])

[('motivated', 0.3234930634498596),
 ('un', 0.31286346912384033),
 ('despite', 0.3089261054992676),
 ('creates', 0.3055853545665741),
 ('medicine', 0.3049114942550659),
 ('peer', 0.2901793122291565),
 ('.', 0.2731468379497528),
 ('priority', 0.27057597041130066),
 ('available', 0.26853522658348083),
 ('abolish', 0.2580111026763916)]

In [None]:
doc2vec_dm.tra

In [65]:
data[0].words

['we',
 'should',
 'ban',
 'human_cloning',
 'as',
 'it',
 'will',
 'only',
 'cause',
 'huge',
 'issues',
 'when',
 'you',
 'have',
 'a',
 'bunch',
 'of',
 'the',
 'same',
 'humans',
 'running',
 'around',
 'all',
 'acting',
 'the',
 'same',
 '.']

In [None]:

# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# # Example: Train a Doc2Vec model
# model = Doc2Vec(data, vector_size=100, window=5, min_count=2, epochs=20)
# model.build_vocab(data)

# # Function to evaluate similarity
# def evaluate_similarity(model, documents, values):
#     doc_vectors = [model.infer_vector(doc) for doc in documents]
#     similarity_scores = []

#     for i in range(len(documents)):
#         for j in range(i + 1, len(documents)):
#             if values[i] == values[j]:
#                 sim_score = cosine_similarity([doc_vectors[i]], [doc_vectors[j]])[0][0]
#                 similarity_scores.append(sim_score)

#     return np.mean(similarity_scores)

# # Evaluate the model
# average_similarity = evaluate_similarity(model, documents, values)
# print("Average Similarity:", average_similarity)