Import all the super important and useful libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.neural_network import MLPRegressor

Read in the four datasets

In [2]:
raw_data_dp = pd.read_csv('data/dp-slider-means.csv')
raw_data_evo = pd.read_csv('data/evo-slider-means.csv')
raw_data_gc = pd.read_csv('data/gc-slider-means.csv')
raw_data_gm = pd.read_csv('data/gm-slider-means.csv')

Show the amount of rows and columns for each dataset

In [3]:
n_rows = len(raw_data_dp) + len(raw_data_evo) + len(raw_data_gc) + len(raw_data_gm)

print("The data contains {0} columns".format(len(raw_data_dp.columns)))
print("Amount of rows:\n total: {0} \n dp: {1} \n evo: {2}\n gc: {3}\n gm: {4}".format(n_rows, len(raw_data_dp),
                                                                                       len(raw_data_evo), 
                                                                         len(raw_data_gc), len(raw_data_gm)))

The data contains 8 columns
Amount of rows:
 total: 5375 
 dp: 987 
 evo: 1252
 gc: 1590
 gm: 1546


Combine all four datasets into one data frame

In [4]:
frames = [raw_data_dp, raw_data_evo, raw_data_gc, raw_data_gm]

In [5]:
raw_data = pd.concat(frames, axis = 0)

In [6]:
print("The combined dataset now contains {0} rows and {1} columns".format(len(raw_data), len(raw_data.columns)))

The combined dataset now contains 5375 rows and 8 columns


Inspect the first 5 rows of the new dataset

In [7]:
raw_data.head()

Unnamed: 0.1,Unnamed: 0,ItemId,GoodSliderMean,GoodSliderDev,Connective.x,PairType.x,ResponseInitial.x,Phrase.x
0,658,ab5810d83f23243ddce713ac23d775cd,1.0,,so,P1_P2,False,"Sorry for the length of the post, but I hope i..."
1,871,e0a35a65ce12b2457e8ff1f9b8cec749,1.0,0.0,no_connective,P1_P2,True,I am all for the death penalty.
2,931,f16863ac9454707946061848c7e9a3e5,1.0,0.0,no_connective,P1_P2,True,I am pro death penalty.
3,936,f1f99c6b1f3f14025a3c01cb8a13b10b,1.0,,no_connective,P1_P2,False,"I can't believe that you just said ""So what if..."
4,11,029bc4e01ac943f87837556b32d5627a,0.999,0.001414,so,QR,False,So what does he have to do with a debate like ...


We are only interested in the argument score and the argument itself, so only keep that

In [8]:
raw_data = raw_data[["GoodSliderMean", "Phrase.x"]]

Now, we only have the arguments and its annotated score

In [9]:
raw_data.head()

Unnamed: 0,GoodSliderMean,Phrase.x
0,1.0,"Sorry for the length of the post, but I hope i..."
1,1.0,I am all for the death penalty.
2,1.0,I am pro death penalty.
3,1.0,"I can't believe that you just said ""So what if..."
4,0.999,So what does he have to do with a debate like ...


The data is now ordered by topic and by argument score, so shuffle it before using it for classification

In [10]:
raw_data = raw_data.sample(frac = 1)

The data is now properly shuffled

In [11]:
raw_data.head()

Unnamed: 0,GoodSliderMean,Phrase.x
986,0.421333,But it is also memorable how Hamilton defined ...
1276,0.273,So in reviewing the replies I see a few others...
1114,0.321667,If you mean transitional between consecutive s...
1311,0.248,"First, I wish to thank you for getting this th..."
959,0.132,"If there have been ""many"" cases why can't you ..."


In [12]:
sentences = raw_data["Phrase.x"].values

In [13]:
labels = raw_data["GoodSliderMean"].values

Define a function that encodes the sentences into usable vectors for classification

In [14]:
def encode_sentences(sentences):
    dataset = []
    for i in range(len(sentences)):
        # tokenize the sentence
        tokens = nltk.word_tokenize(sentences[i])
        # tag the all the tokens
        pos_tokens = nltk.pos_tag(tokens)
        # for each pos tag, count how many times it occurs in the sentence
        pos_dict = generate_count_dict()
        for tag in pos_tokens:
            if tag[1] in pos_dict:
                pos_dict[tag[1]] += 1
        # get the pos tag counts as features 
        feature_vector = list(pos_dict.values())
        dataset.append(feature_vector)
    return dataset

In [15]:
def generate_count_dict():
    tags = ['IN', 'PRP', 'VBP', 'TO', 'VB', 'JJ', 'NN', 'VBZ', 'VBN', 'DT', ',', 'NNP', 'VBD', '.']
    dic = dict.fromkeys(tags, 0)
    return dic
    

In [21]:
dataset = encode_sentences(sentences)

Fit a MLP on the dataset

In [22]:
mlp = MLPRegressor(max_iter = 1000)

In [23]:
mlp.fit(dataset, labels)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

See how w

In [25]:
mlp.score(dataset, labels)

0.20559518499098794