In [1]:
import pandas as pd
import numpy as np
import nltk
import csv

Concatenate and inspect the combined data

In [2]:
raw_data_abo = pd.read_csv("data/abortion.tsv", sep = "\t")
raw_data_clo = pd.read_csv("data/cloning.tsv", sep = "\t")
raw_data_dp = pd.read_csv("data/death_penalty.tsv", sep = "\t") 
raw_data_gun = pd.read_csv("data/gun_control.tsv", sep = "\t")
raw_data_mari = pd.read_csv("data/marijuana_legalization.tsv", sep = "\t")
raw_data_wage = pd.read_csv("data/minimum_wage.tsv", sep = "\t")
raw_data_nuc = pd.read_csv("data/nuclear_energy.tsv", sep = "\t")
raw_data_school = pd.read_csv("data/school_uniforms.tsv", sep = "\t", quoting=csv.QUOTE_NONE)

In [3]:
frames = [raw_data_abo, raw_data_clo, raw_data_dp, raw_data_gun, raw_data_mari, raw_data_wage, raw_data_nuc, raw_data_school]

In [4]:
raw_data = pd.concat(frames, axis = 0)

In [5]:
print("The combined dataset contains {0} rows and {1} columns".format(len(raw_data), len(raw_data.columns)))

The combined dataset contains 24507 rows and 7 columns


In [6]:
raw_data = raw_data[["sentence", "annotation"]]

In [7]:
raw_data.head()

Unnamed: 0,sentence,annotation
0,This means it has to steer monetary policy to ...,NoArgument
1,Where did you get that ?,NoArgument
2,Nathanson later became pro-life .,NoArgument
3,In this case we may never do evil ( directly a...,Argument_against
4,With that I would like to give everyone someth...,NoArgument


Encode the features in an appropriate form

In [8]:
def generate_POS_dict(sentences):
    # extract all the occurring parts-of-speech tags that occur in the corpus
    pos_dict = {}
    
    for i in range(len(sentences)):
        # tokenize and pos tag 
        tokens = nltk.word_tokenize(sentences[i])
        pos_tokens = nltk.pos_tag(tokens)
        for token in pos_tokens:
            if(token[1] not in pos_dict):
                pos_dict[token[1]] = 0
    return pos_dict

In [9]:
from nltk.corpus import stopwords

def generate_BOW_dict(sentences):
    # extract all the occurring words in the corpus
    bow_dict = {}
    for i in range(len(sentences)):
        #tokenize the sentence
        tokens = nltk.word_tokenize(sentences[i])
        for token in tokens:
            # count how many times a word occurs in the texts
            if(token.lower() in bow_dict):
                bow_dict[token.lower()] += 1 
            # add a new word if it's not a stopword or a punctuation mark
            elif(token.lower() not in stopwords.words('English') and token.lower()[0] not in [".,:;!?<>{}()1234567890"]):
                bow_dict[token.lower()] = 1
                
    # filter out the words that barely occur
    frequency_threshold = 5
    dict_size = len(bow_dict)
    words = []
    for key in bow_dict:
        if(bow_dict[key] < frequency_threshold):
            words.append(key)
    
    for word in words:
        bow_dict.pop(word, None)
        
    
                
    return bow_dict
            

In [10]:
from sklearn.preprocessing import MinMaxScaler
def normalize(data):
    # this function applies min-max scaling to a given dataframe
    scaler = MinMaxScaler()
    return scaler.fit_transform(data)

In [11]:
def encode_sentences(sentences, pos_dict, bow_dict):
    dataset = []
    for i in range(len(sentences)):
        # reset POS and word counts for each sentence
        pos_dict = pos_dict.fromkeys(pos_dict, 0)
        bow_dict = pos_dict.fromkeys(bow_dict, 0)
        # tokenize the sentence
        tokens = nltk.word_tokenize(sentences[i])
        # tag the all the tokens
        pos_tokens = nltk.pos_tag(tokens)
        # for each pos tag, count how many times it occurs in the sentence
        for tag in pos_tokens:
            if(tag[1] in pos_dict):
                pos_dict[tag[1]] += 1
        # get the pos tag counts as features 
        pos_vector = list(pos_dict.values())
        
        # count how many times the words from the bow dict occur in the sentence
        for token in tokens:
            if(token.lower() in bow_dict):
                bow_dict[token.lower()] += 1
        bow_vector = list(pos_dict.values())
        
        feature_vector = bow_vector + pos_vector
        #feature_vector = pos_vector
        # add sentence length as a feature
        feature_vector.append(len(tokens))     
        # compute the average word length of the sentence
        n_chars = 0
        avg_len = 0
        for token in tokens:
            if(token[0] not in ".,?!:;"):
                n_chars += len(token)
        avg_len = n_chars / len(tokens)
        feature_vector.append(avg_len)     
        dataset.append(feature_vector)
        
    return dataset
    

In [12]:
raw_data = raw_data.sample(frac=1)

In [13]:
sentences = raw_data.sentence

In [14]:
pos_dict = generate_POS_dict(sentences.values)

In [15]:
bow_dict = generate_BOW_dict(sentences.values)

In [16]:
encoded_sentences = encode_sentences(sentences.values, pos_dict, bow_dict)
encoded_sentences = pd.DataFrame(encoded_sentences)

In [17]:
from sklearn.model_selection import KFold
from sklearn.base import clone

def kfold(data, labels, folds, classifier):
    kf = KFold(n_splits=folds)

    avg_accuracy = 0
    for train_index, test_index in kf.split(data):
        # reset the classifier
        classifier = clone(classifier)
        
        
        Xtrain = data[train_index]
        Xtest = data[test_index]
        
        Ytrain = labels[train_index]
        Ytest = labels[test_index]
        
        mlp.fit(Xtrain, Ytrain)
        avg_accuracy += mlp.score(Xtest, Ytest)

    avg_accuracy /= folds
    print("{0}-fold cross validation accuracy: {1}".format(folds, avg_accuracy))
    

In [18]:
sentence_data = normalize(encoded_sentences)

In [19]:
labels = raw_data.annotation

In [20]:
labels[labels == "NoArgument"] = 0
labels[labels != 0] = 1
labels = labels.values.astype('int')

In [21]:
arguments = labels[labels == 1]
non_arguments = labels[labels == 0]
n_arg = len(arguments)
n_non = len(non_arguments)
print("Arguments: ", n_arg / (n_arg + n_non))
print("Amount of non-arguments: ", n_non / (n_arg + n_non))

Arguments:  0.4374668462072061
Amount of non-arguments:  0.5625331537927939


Inspect the normalized data that's going to be fed to the classifiers

In [22]:
print("Sentence dimensions:", len(sentence_data[0]))
sentence_data[0]

Sentence dimensions: 92


array([0.02222222, 0.01973684, 0.01769912, 0.00680272, 0.02439024,
       0.00406504, 0.01265823, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.02222222, 0.01973684, 0.01769912, 0.00680272, 0.02439024,
       0.00406504, 0.01265823, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [23]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#mlp = MLPRegressor()
#lr = LinearRegression()
#rf = RandomForestRegressor()
mlp = MLPClassifier(max_iter = 300)
lr = LogisticRegression(max_iter = 300)
rf = RandomForestClassifier()

Test different sklearn classifiers with k-folding

In [24]:
"""lrs = [0.1, 0.01, 0.001, 0.0001]
hl = [50, 100, 250, 500]
for learning_rate in lrs:
    for n_hidden in hl:
        mlp = MLPClassifier(max_iter = 300, hidden_layer_sizes = (n_hidden,), alpha = learning_rate)
        print("-----------------Setting: lr: {0} hl: {1}----------------------".format(learning_rate, n_hidden))
        kfold(sentence_data, labels.values, 5, mlp)"""
kfold(sentence_data, labels, 5, mlp)

5-fold cross validation accuracy: 0.6605456001077558


In [25]:
kfold(sentence_data, labels, 5, lr)

5-fold cross validation accuracy: 0.6625453918221338


In [26]:
kfold(sentence_data, labels, 5, rf)

5-fold cross validation accuracy: 0.660953846586734


Fit classifier on the train data normally

In [27]:
mlp = MLPClassifier(max_iter = 300, hidden_layer_sizes = (250,), alpha = 0.1)
lr = LogisticRegression(max_iter = 300)
rf = RandomForestClassifier()
#mlp = MLPRegressor(max_iter = 300, hidden_layer_sizes = (250,), alpha = 0.1)
#lr = LinearRegression()
#rf = RandomForestRegressor()



mlp.fit(sentence_data, labels)
lr.fit(sentence_data, labels)
rf.fit(sentence_data, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Test how the models score arguments taken from Reddit posts

In [28]:
def score_post(text, pos_dict, bow_dict, classifier):
    # given a text, the feature vector will be built just like in the training stage
    # the classifier will then be applied to the sentences and score the arguments within
    sentences = text.split('.')[:-1] 
    # get the POS and BOW encodings for each sentence
    encoded_sentences = encode_sentences(sentences, pos_dict, bow_dict)
    encoded_sentences = pd.DataFrame(encoded_sentences)
    # scale features between 0 and 1 using Min-Max scaling
    encoded_sentences = normalize(encoded_sentences)

    filtered_text = []
    removed_sentences = []
    for i in range(len(encoded_sentences)):
        if(len(sentences[i].split()) < 5):
            removed_sentences.append((0, sentences[i]))
        else:
            score = classifier.predict_proba([encoded_sentences[i]])[0][1]
            if(score >= 0.5):     
                filtered_text.append((score, sentences[i]))
            else:
                removed_sentences.append((score, sentences[i]))

    print("{0} filtered from the input text!".format(len(removed_sentences)))
    return filtered_text, removed_sentences

In [29]:
OP = "The Shape of Water is an extremely overrated movie and should have never won the Oscar for Best Picture I recently rewatched The Shape of Water and I am not a movie critique nor expert, but the realization dawned on me that it is an exquisitely bland movie that lacks an absurd amount of substance. The Shape of Water plays on to the basic beauty and the beast trope, but it does not go any further than that. The movie weighs heavily on the cinematography and strays away from any actual plot or substance. It is an intermediate form of movie writing and does not deserve any more than a Redbox rental. The movie barely dives into the actual underlying foundation for why anything happens, there is no room for individual thought and it is pressed into the viewer’s brain that there is only one way to think and that is with the protagonist. According to Vox, It’s a beautifully shot movie with a story that follows the traditional arcs of a fairy tale romance. I believe that it is exactly why it should not have won, it has been done before. Compared to other past winners, such as Moonlight, which was original and intriguing. There is no relevance to the Shape of Water, no bigger picture. A mute woman falls in love with a sea creature who likes eggs. If that’s the precedent for winning an Oscar, then The Leprechaun would have been a phenomenal candidate. The movie is visually outstanding, but so is The Curious Case of Benjamin Button and it is an incredibly lifeless movie starring Brad Pitt! Without the visuals the movie would merely be a pathetic case for an “original” plot. Quite honestly, coming from Guillermo del Toro I would not expect much, all of his movies rely on visuals such as Crimson Peak or The Hobbit. These movies appeal to the eye and the only Oscar that this movie truly deserved was Best Visuals. Overall, the movie is basic with jaw dropping visuals. The movie won four Oscars, so it is obviously well received and I’d like to understand what is so special about its standard format. Change my view!!"
OP

'The Shape of Water is an extremely overrated movie and should have never won the Oscar for Best Picture I recently rewatched The Shape of Water and I am not a movie critique nor expert, but the realization dawned on me that it is an exquisitely bland movie that lacks an absurd amount of substance. The Shape of Water plays on to the basic beauty and the beast trope, but it does not go any further than that. The movie weighs heavily on the cinematography and strays away from any actual plot or substance. It is an intermediate form of movie writing and does not deserve any more than a Redbox rental. The movie barely dives into the actual underlying foundation for why anything happens, there is no room for individual thought and it is pressed into the viewer’s brain that there is only one way to think and that is with the protagonist. According to Vox, It’s a beautifully shot movie with a story that follows the traditional arcs of a fairy tale romance. I believe that it is exactly why it 

In [30]:
good, bad = score_post(OP, pos_dict, bow_dict, mlp)

1 filtered from the input text!


In [31]:
good

[(0.9189520379966982,
  ' The Shape of Water plays on to the basic beauty and the beast trope, but it does not go any further than that'),
 (0.9264363585159258,
  ' The movie weighs heavily on the cinematography and strays away from any actual plot or substance'),
 (0.9626864491377539,
  ' It is an intermediate form of movie writing and does not deserve any more than a Redbox rental'),
 (0.9152930644592397,
  ' The movie barely dives into the actual underlying foundation for why anything happens, there is no room for individual thought and it is pressed into the viewer’s brain that there is only one way to think and that is with the protagonist'),
 (0.9581909466888983,
  ' According to Vox, It’s a beautifully shot movie with a story that follows the traditional arcs of a fairy tale romance'),
 (0.9339069461317748,
  ' I believe that it is exactly why it should not have won, it has been done before'),
 (0.7621645142307019,
  ' Compared to other past winners, such as Moonlight, which was

In [32]:
bad

[(0.39199134959779175,
  'The Shape of Water is an extremely overrated movie and should have never won the Oscar for Best Picture I recently rewatched The Shape of Water and I am not a movie critique nor expert, but the realization dawned on me that it is an exquisitely bland movie that lacks an absurd amount of substance')]

In [33]:
counter = "The root problem is gendered dresscodes. When men are expected to suit up, you have to put the temperature down quite a bit, because suits are hot. But a women's suit typically has an open neckline and a knee length shirt. For women in office wear, you have to put the temperature way up because they lose a lot more warmth. There is no compromise possible because neither gender is allowed to put on more or less clothing under typical dress codes. Either make dress codes similar for all employees or remove dress codes altogether so that everyone can wear what's appropriate for whatever temperature the janitor set."
counter

"The root problem is gendered dresscodes. When men are expected to suit up, you have to put the temperature down quite a bit, because suits are hot. But a women's suit typically has an open neckline and a knee length shirt. For women in office wear, you have to put the temperature way up because they lose a lot more warmth. There is no compromise possible because neither gender is allowed to put on more or less clothing under typical dress codes. Either make dress codes similar for all employees or remove dress codes altogether so that everyone can wear what's appropriate for whatever temperature the janitor set."

In [34]:
good, bad = score_post(counter, pos_dict, bow_dict, mlp)

0 filtered from the input text!


In [35]:
good

[(0.854478913433192, 'The root problem is gendered dresscodes'),
 (0.9663238781116106,
  ' When men are expected to suit up, you have to put the temperature down quite a bit, because suits are hot'),
 (0.518143107282184,
  " But a women's suit typically has an open neckline and a knee length shirt"),
 (0.984656442615816,
  ' For women in office wear, you have to put the temperature way up because they lose a lot more warmth'),
 (0.9971307758850677,
  ' There is no compromise possible because neither gender is allowed to put on more or less clothing under typical dress codes'),
 (0.9994575524702001,
  " Either make dress codes similar for all employees or remove dress codes altogether so that everyone can wear what's appropriate for whatever temperature the janitor set")]

In [36]:
bad

[]