In [326]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from transformers import BertTokenizer, BertModel
from sklearn.svm import LinearSVC

from bert import optimization
from bert import tokenization
from datetime import datetime
import en_core_web_trf
import xgboost as xgb
from sklearn.svm import SVC
from sklearn import *
from keras.layers import *

from keras.models import Sequential
from keras.layers import Dense
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import Embedding, LSTM, Dense

import tensorflow as tf
import tensorflow_hub as hub
import tf_slim as slim

from xgboost import XGBClassifier
import xgboost

import collections
import transformers
import numpy as np
import pandas as pd
import altair as alt
from empath import Empath
from typing import List

import re
import os
import bs4
import bert
import spacy
import string
import keras
import json

In [41]:
# Load the datasets as pd.DataFrames
arguments_training = pd.read_csv('arguments-training.tsv', sep='\t')
labels_training = pd.read_csv('labels-training.tsv', sep='\t')

# Load value-categories.json
with open('value-categories.json') as f:
    value_categories = json.load(f)

# this is a list of all value categories and subcategories
all_categories = []
for category, subcategories in value_categories.items():
    all_categories.append(category)

# arguments including labels
arguments_data = {}
with open('arguments-training.tsv') as f:
    for line in f:
        argument_id, conclusion, stance, premise = line.strip().split('\t')
        arguments_data[argument_id] = {
            'premise': premise,
            'conclusion': conclusion,
            'stance': stance,
            'categories': []
        }

# labels to arguments
with open('labels-training.tsv') as f:
    for line in f:
        argument_id, *labels = line.strip().split('\t')
        arguments_data[argument_id]['categories'] = [
            category
            for category, label in zip(all_categories, labels)
            if label == '1'
        ]

In [None]:
# Create a pd.DataFrame that contains the premise, conclusion, stance, and categories for each argument
arguments_data = pd.DataFrame({
    'premise': arguments_training['Premise'],
    'conclusion': arguments_training['Conclusion'],
    'stance': arguments_training['Stance'],
    'categories': labels_training.drop(columns=['Argument ID']).apply(lambda x: x.astype(bool).index[x].tolist(), axis=1)
})

In [170]:
def roomba(item):
    '''
    input: string
    output: list of cleaned words

    '''
    soup = bs4.BeautifulSoup(item, 'html.parser')
    text = soup.get_text()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')

    text = re.sub(r'\[.*\]\(.*\)', '', text)

    # remove '[removed]' and '[deleted]'
    text = re.sub(r'\[.*\]', '', text)

    # remove non utf-8 characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # lowercase 
    text = text.lower()

    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # remove leading and trailing spaces
    text = text.strip()

    # remove urls
    text = re.sub(r'http\S+', '', text)

    # remove stopwords
    text = [word for word in text.split() if word not in stopwords]
    text = [lemmatizer.lemmatize(word) for word in text]

    if text == '':
        return False
    else:
        return text

In [171]:
import nltk

# create a tokenizer with NLTK
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
tokenizer = word_tokenize

# create a lemmatizer with NLTK
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [172]:
# run the roomba function on the arguments_training dataframe append a 'cleaned' column
arguments_training['Cleaned_Premise'] = arguments_training['Premise'].apply(roomba)
arguments_training['Cleaned_Conclusion'] = arguments_training['Conclusion'].apply(roomba)

arguments_training['Cleaned_Premise'] = arguments_training['Premise'].apply(roomba)
arguments_training['Cleaned_Conclusion'] = arguments_training['Conclusion'].apply(roomba)

# combine 'conclusion', 'premise', and 'stance' into one column, then run roomba function
arguments_training['Cleaned_BOW'] = arguments_training['Premise'] + ' ' + arguments_training['Conclusion'] + ' ' + arguments_training['Stance']
training_data = pd.merge(labels_training, arguments_training, on='Argument ID')



In [45]:
# import embedding models
path_to_gloVe_file = "glove.6B.200d.txt"

# dictionary of words to embeddings
embeddings_index = {}

with open(path_to_gloVe_file, encoding='utf8') as f:
  for line in f:
    word, coefs = line.split(maxsplit = 1)
    coefs = np.fromstring(coefs, "f", sep = " ")
    embeddings_index[word] = coefs

In [46]:
# find argument with longest length of words
max_tokens = arguments_training['Cleaned_BOW'].str.split().str.len().max()
data = arguments_training['Cleaned_BOW']

# This text vectorizer indexs our vocabulary based on the train sampl
vectorizer = TextVectorization(max_tokens = max_tokens, output_sequence_length = 100, split = 'whitespace')
data_tensor = tf.data.Dataset.from_tensor_slices(data).batch(32)
vectorizer.adapt(data_tensor)

# map unique words to integers
vocabulary = vectorizer.get_vocabulary()
index = dict(zip(vocabulary, range(len(vocabulary))))
len_vocabulary = len(vocabulary)

In [47]:
# create embedding matrix
embedding_dim = 200

# going to count hits and misses
hits = 0
misses = 0


embedding_matrix = np.zeros((len_vocabulary, embedding_dim))
for word, i in index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
    hits+=1
  else:
    misses+=1

embedding_layer = Embedding(len_vocabulary, embedding_dim, embeddings_initializer=Constant(embedding_matrix), trainable = False)


print("Embedding coverage: ", round((hits/(hits+misses))*100,2),"%")
print("Captured words: ", hits)
print("Missed words: ", misses)

Embedding coverage:  97.92 %
Captured words:  141
Missed words:  3


### Create token list for each value category

In [48]:
stop_words = set(stopwords.words('english'))

tokens = {}

for category, values in value_categories.items():
  tokens[category] = []
  for examples in values.values():
    tokens[category].extend([token for example in examples for token in example.split()])

print(tokens['Self-direction: thought'][:10])

for category in tokens.keys():
  tokens[category] = [token.lower() for token in tokens[category]]
  tokens[category] = [token for token in tokens[category] if token not in stop_words]
  tokens[category] = [lemmatizer.lemmatize(token) for token in tokens[category]]


print(tokens['Self-direction: thought'][:10])

['allowing', 'for', 'more', 'creativity', 'or', 'imagination', 'being', 'more', 'creative', 'fostering']
['allowing', 'creativity', 'imagination', 'creative', 'fostering', 'creativity', 'promoting', 'imagination', 'interesting', 'option']


In [67]:
def fits_like_a_glove(words):
    word_vectors = np.zeros((len(words), embedding_dim))
    for i, word in enumerate(words):
        if word in vocabulary:
            word_vectors[i] = embeddings_index[word]

    input_vector = np.sum(word_vectors, axis=0)
    return input_vector

## Calculate cosine similarity

In [111]:
def fits_like_a_glove(words):
    word_vectors = np.zeros((len(words), embedding_dim))
    for i, word in enumerate(words):
        if word in embeddings_index:
            word_vectors[i] = embeddings_index[word]
        else:
            word_vectors[i] = np.random.uniform(-1, 1, embedding_dim)

    input_vector = np.sum(word_vectors, axis=0)
    return input_vector

In [None]:
def calc_cosine(input_vector, category_tokens):
    #categoryString=" "
    #categoryString = categoryString.join(category_tokens)
    #print(categoryString)
    category_vector = fits_like_a_glove(category_tokens)
    #print(input_vector)
    #print(category_vector)
    similarity = cosine_similarity([input_vector], [category_vector])

    return similarity[0][0]

def predict_category(input_text):
    similarity_scores = {}

    input_vector = fits_like_a_glove(input_text)

    for category in value_category:
        similarity_scores[category] = calc_cosine(input_vector, category)
    return max(similarity_scores, key=similarity_scores.get)

cosine_similarities = []

for input_text in training_data['Cleaned_BOW']:
    input_vector = fits_like_a_glove(input_text.split())
    for category in all_categories:
        value = [calc_cosine(input_vector, tokens[category])]
        #print(value)
        cosine_similarities.append(value)
        
value_category_list = list(value_categories.keys())


cos_sims = [item for sublist in cosine_similarities for item in sublist]

# output the cosine similarity score for each respective value category in a new column in arguments_training
for category in value_category_list:
    i = value_category_list.index(category)
    arguments_training[f'Cosine_Similarity_{category}'] = np.reshape(cos_sims,(int(len(cos_sims)/len(value_category_list)), len(value_category_list)))[:,i]

### Find an overlap of tokens between input text and category values

In [135]:
binary_features = {}

def contains_tokens(input_tokens, tokens):
  input_tokens = set(input_tokens)
  tokens = set(tokens)
  return len(input_tokens.intersection(tokens)) / len(input_tokens)

binary_features = {}

for category in tokens.keys():
  binary_features[category] = arguments_training['Cleaned_BOW'].apply(lambda x: contains_tokens(x.split(), tokens[category]))

for category, feature in binary_features.items():
  arguments_training[f'contains_{category}'] = feature

### Add additional features (word count)

In [137]:
# add the length of the premise and word count as additional columns in the arguments_training dataframe
arguments_training['premise_length'] = arguments_training['Premise'].apply(len)

# normalize the length of the premise and word count columns
arguments_training['premise_length'] = arguments_training['premise_length'] / arguments_training['premise_length'].max()

In [140]:
arguments_training.to_csv('arguments_training.csv', index=False)

### Split test, train, and validation sets

In [259]:
# drop index columns in arguments_training
feature_values = arguments_training.drop(['Argument ID', 'Conclusion', 'Stance', 'Premise', 'Cleaned_Premise',
       'Cleaned_Conclusion', 'Cleaned_BOW', 'word_count'], axis=1)

labels = labels_training.drop(['Argument ID'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(feature_values, labels, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

len(X_train), len(X_val), len(X_test), len(y_train), len(y_val), len(y_test)

(4176, 522, 522, 4176, 522, 522)

In [None]:
feature_columns = {
'Self-direction: thought': ['contains_Self-direction: thought', 'Cosine_Similarity_Self-direction: thought', 'premise_length'],
'Self-direction: action': ['contains_Self-direction: action', 'Cosine_Similarity_Self-direction: action', 'premise_length'],
'Stimulation': ['contains_Stimulation', 'Cosine_Similarity_Stimulation', 'premise_length'],
'Hedonism': ['contains_Hedonism', 'Cosine_Similarity_Hedonism', 'premise_length'],
'Achievement': ['contains_Achievement', 'Cosine_Similarity_Achievement', 'premise_length'],
'Power: dominance': ['contains_Power: dominance', 'Cosine_Similarity_Power: dominance', 'premise_length'],
'Power: resources': ['contains_Power: resources', 'Cosine_Similarity_Power: resources', 'premise_length'],
'Face': ['contains_Face', 'Cosine_Similarity_Face', 'premise_length'],
'Security: personal': ['contains_Security: personal', 'Cosine_Similarity_Security: personal', 'premise_length'],
'Security: societal': ['contains_Security: societal', 'Cosine_Similarity_Security: societal', 'premise_length'],
'Tradition': ['contains_Tradition', 'Cosine_Similarity_Tradition', 'premise_length'],
'Conformity: rules': ['contains_Conformity: rules', 'Cosine_Similarity_Conformity: rules', 'premise_length'],
'Conformity: interpersonal': ['contains_Conformity: interpersonal', 'Cosine_Similarity_Conformity: interpersonal', 'premise_length'],
'Humility': ['contains_Humility', 'Cosine_Similarity_Humility', 'premise_length'],
'Benevolence: caring': ['contains_Benevolence: caring', 'Cosine_Similarity_Benevolence: caring', 'premise_length'],
'Benevolence: dependability': ['contains_Benevolence: dependability', 'Cosine_Similarity_Benevolence: dependability', 'premise_length'],
'Universalism: concern': ['contains_Universalism: concern', 'Cosine_Similarity_Universalism: concern', 'premise_length'],
'Universalism: nature': ['contains_Universalism: nature', 'Cosine_Similarity_Universalism: nature', 'premise_length'],
'Universalism: tolerance': ['contains_Universalism: tolerance', 'Cosine_Similarity_Universalism: tolerance', 'premise_length'],
'Universalism: objectivity': ['contains_Universalism: objectivity', 'Cosine_Similarity_Universalism: objectivity', 'premise_length']
}

In [None]:
KERAS_MODELS = {}

for category, features in feature_columns.items():
    print(f'Training model for {category}')
    model = Sequential()
    model.add(Dense(64, input_dim=len(features), activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train[features], y_train[category], epochs=10, batch_size=10, verbose=0)
    KERAS_MODELS[category] = model
    
predictions = {}

for category, features in feature_columns.items():
    print(f'Predicting labels for {category}')
    predictions[category] = KERAS_MODELS[category].predict(X_val[features])

In [None]:
KERAS = {}

for category, features in feature_columns.items():
    KERAS[category] = [f1_score(y_val[category], predictions[category].round()), accuracy_score(y_val[category], predictions[category].round()), recall_score(y_val[category], predictions[category].round()), precision_score(y_val[category], predictions[category].round())]


KERAS_scores = pd.DataFrame.from_dict(KERAS, orient='index', columns=['F1', 'Accuracy', 'Recall', 'Precision'])

In [None]:
Linear_SVC_scores = pd.DataFrame(columns=['F1-Micro', 'F1-Macro', 'Accuracy', 'Recall', 'Precision'])
for label, columns in feature_columns.items():
    # Select the relevant columns from X_train and X_val
    X_train_subset = X_train[columns]
    X_val_subset = X_val[columns]

    classifier = LinearSVC(random_state=42)
    classifier.fit(X_train_subset, y_train[label])
    y_pred = classifier.predict(X_val_subset)


    f1_micro = f1_score(y_val[label], y_pred, average='micro')
    f1_macro = f1_score(y_val[label], y_pred, average='macro')
    accuracy = accuracy_score(y_val[label], y_pred)
    recall = recall_score(y_val[label], y_pred, average='macro')
    precision = precision_score(y_val[label], y_pred, average='macro')

    # print('Label: {} - F1 score: {}'.format(label, f1))
    # print('Label: {} - Accuracy: {}'.format(label, accuracy))
    # print('Label: {} - Recall: {}'.format(label, recall))
    # print('Label: {} - Precision: {}'.format(label, precision))

    # add a row to the dataframe for each label
    Linear_SVC_scores.loc[label] = [f1_micro, f1_macro, accuracy, recall, precision]

In [334]:
XGB_scores = pd.DataFrame(columns=['F1-Micro', 'F1-Macro', 'Accuracy', 'Recall', 'Precision'])
for label, columns in feature_columns.items():
    # Select the relevant columns from X_train and X_val
    X_train_subset = X_train[columns]
    X_val_subset = X_val[columns]

    classifier = XGBClassifier(random_state=42)
    classifier.fit(X_train_subset, y_train[label])
    y_pred = classifier.predict(X_val_subset)


    f1_micro = f1_score(y_val[label], y_pred, average='micro')
    f1_macro = f1_score(y_val[label], y_pred, average='macro')
    accuracy = accuracy_score(y_val[label], y_pred)
    recall = recall_score(y_val[label], y_pred, average='macro')
    precision = precision_score(y_val[label], y_pred, average='macro')

    # print('Label: {} - F1 score: {}'.format(label, f1))
    # print('Label: {} - Accuracy: {}'.format(label, accuracy))
    # print('Label: {} - Recall: {}'.format(label, recall))
    # print('Label: {} - Precision: {}'.format(label, precision))

    # add a row to the dataframe for each label
    XGB_scores.loc[label] = [f1_micro, f1_macro, accuracy, recall, precision]

In [336]:
scores = pd.concat([Linear_SVC_scores, XGB_scores, KERAS_scores], axis=1, keys=['LinearSVC', 'XGB', 'Keras'])

In [337]:
scores['LinearSVC']['Winner'] = scores['LinearSVC'].idxmax(axis=1)
scores['XGB']['Winner'] = scores['XGB'].idxmax(axis=1)
scores['Keras']['Winner'] = scores['Keras'].idxmax(axis=1)

scores['Winner'] = scores.idxmax(axis=1)

scores


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scores['LinearSVC']['Winner'] = scores['LinearSVC'].idxmax(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scores['XGB']['Winner'] = scores['XGB'].idxmax(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scores['Keras']['Winner'] = scores['Keras'].idxmax(axis=1)


Unnamed: 0_level_0,LinearSVC,LinearSVC,LinearSVC,LinearSVC,LinearSVC,XGB,XGB,XGB,XGB,XGB,Keras,Keras,Keras,Keras,Winner
Unnamed: 0_level_1,F1-Micro,F1-Macro,Accuracy,Recall,Precision,F1-Micro,F1-Macro,Accuracy,Recall,Precision,F1,Accuracy,Recall,Precision,Unnamed: 15_level_1
Self-direction: thought,0.823755,0.451681,0.823755,0.5,0.411877,0.808429,0.474997,0.808429,0.503514,0.519544,0.0,0.823755,0.0,0.0,"(LinearSVC, F1-Micro)"
Self-direction: action,0.747126,0.448891,0.747126,0.511111,0.872832,0.718391,0.521188,0.718391,0.53273,0.569967,0.043478,0.747126,0.022222,1.0,"(Keras, Precision)"
Stimulation,0.95977,0.489736,0.95977,0.5,0.479885,0.955939,0.488737,0.955939,0.498004,0.479808,0.0,0.95977,0.0,0.0,"(LinearSVC, F1-Micro)"
Hedonism,0.969349,0.492218,0.969349,0.5,0.484674,0.95977,0.489736,0.95977,0.495059,0.484526,0.0,0.969349,0.0,0.0,"(LinearSVC, F1-Micro)"
Achievement,0.695402,0.410169,0.695402,0.5,0.347701,0.672414,0.502605,0.672414,0.524118,0.551724,0.0,0.695402,0.0,0.0,"(LinearSVC, F1-Micro)"
Power: dominance,0.908046,0.475904,0.908046,0.5,0.454023,0.900383,0.47379,0.900383,0.495781,0.453668,0.0,0.908046,0.0,0.0,"(LinearSVC, F1-Micro)"
Power: resources,0.875479,0.466803,0.875479,0.5,0.437739,0.877395,0.522634,0.877395,0.527487,0.726491,0.0,0.875479,0.0,0.0,"(XGB, F1-Micro)"
Face,0.925287,0.480597,0.925287,0.5,0.462644,0.921456,0.479561,0.921456,0.49793,0.4625,0.0,0.925287,0.0,0.0,"(LinearSVC, F1-Micro)"
Security: personal,0.641762,0.463314,0.641762,0.540305,0.787698,0.590038,0.536096,0.590038,0.540976,0.549394,0.107143,0.616858,0.059113,0.571429,"(LinearSVC, Precision)"
Security: societal,0.689655,0.408163,0.689655,0.5,0.344828,0.641762,0.492638,0.641762,0.509414,0.516022,0.0,0.689655,0.0,0.0,"(LinearSVC, F1-Micro)"


In [None]:
X_train_final = pd.concat([X_train, X_val])
y_train_final = pd.concat([y_train, y_val])

final_run_scores = pd.DataFrame(columns=['F1-Micro', 'F1-Macro', 'Accuracy', 'Recall', 'Precision'])
for label, columns in feature_columns.items():
    X_train_subset = X_train_final[columns]
    X_val_subset = X_test[columns]

    classifier = LinearSVC(random_state=42)
    classifier.fit(X_train_subset, y_train_final[label])
    y_pred = classifier.predict(X_val_subset)

    f1_micro = f1_score(y_test[label], y_pred, average='micro')
    f1_macro = f1_score(y_test[label], y_pred, average='macro')
    accuracy = accuracy_score(y_test[label], y_pred)
    recall = recall_score(y_test[label], y_pred, average='macro')
    precision = precision_score(y_test[label], y_pred, average='macro')

    final_run_scores.loc[label] = [f1_micro, f1_macro, accuracy, recall, precision]

In [329]:
final_run_scores

Unnamed: 0,F1-Micro,F1-Macro,Accuracy,Recall,Precision
Self-direction: thought,0.795019,0.442903,0.795019,0.5,0.39751
Self-direction: action,0.752874,0.436987,0.752874,0.503846,0.8762
Stimulation,0.963602,0.490732,0.963602,0.5,0.481801
Hedonism,0.978927,0.494676,0.978927,0.5,0.489464
Achievement,0.731801,0.422566,0.731801,0.5,0.3659
Power: dominance,0.909962,0.476429,0.909962,0.5,0.454981
Power: resources,0.911877,0.476954,0.911877,0.5,0.455939
Face,0.940613,0.484699,0.940613,0.5,0.470307
Security: personal,0.64751,0.447644,0.64751,0.521236,0.639845
Security: societal,0.691571,0.408834,0.691571,0.5,0.345785


In [330]:
# calculate average f1 score for the final model
print('Final results F1 micro:', final_run_scores['F1-Micro'].mean())
print('Final results F1 macro:', final_run_scores['F1-Macro'].mean())


Final results F1 micro: 0.8348659003831417
Final results F1 macro: 0.45859234932832554
