Import all the super important and useful libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler

Read in the four datasets

In [2]:
raw_data_dp = pd.read_csv('data/dp-slider-means.csv')
raw_data_evo = pd.read_csv('data/evo-slider-means.csv')
raw_data_gc = pd.read_csv('data/gc-slider-means.csv')
raw_data_gm = pd.read_csv('data/gm-slider-means.csv')

Show the amount of rows and columns for each dataset

In [3]:
n_rows = len(raw_data_dp) + len(raw_data_evo) + len(raw_data_gc) + len(raw_data_gm)

print("The data contains {0} columns".format(len(raw_data_dp.columns)))
print("Amount of rows:\n total: {0} \n dp: {1} \n evo: {2}\n gc: {3}\n gm: {4}".format(n_rows, len(raw_data_dp),
                                                                                       len(raw_data_evo), 
                                                                         len(raw_data_gc), len(raw_data_gm)))

The data contains 8 columns
Amount of rows:
 total: 5375 
 dp: 987 
 evo: 1252
 gc: 1590
 gm: 1546


Combine all four datasets into one data frame

In [4]:
frames = [raw_data_dp, raw_data_evo, raw_data_gc, raw_data_gm]

In [5]:
raw_data = pd.concat(frames, axis = 0)

In [6]:
print("The combined dataset now contains {0} rows and {1} columns".format(len(raw_data), len(raw_data.columns)))

The combined dataset now contains 5375 rows and 8 columns


Inspect the first 5 rows of the new dataset

In [7]:
raw_data.head()

Unnamed: 0.1,Unnamed: 0,ItemId,GoodSliderMean,GoodSliderDev,Connective.x,PairType.x,ResponseInitial.x,Phrase.x
0,658,ab5810d83f23243ddce713ac23d775cd,1.0,,so,P1_P2,False,"Sorry for the length of the post, but I hope i..."
1,871,e0a35a65ce12b2457e8ff1f9b8cec749,1.0,0.0,no_connective,P1_P2,True,I am all for the death penalty.
2,931,f16863ac9454707946061848c7e9a3e5,1.0,0.0,no_connective,P1_P2,True,I am pro death penalty.
3,936,f1f99c6b1f3f14025a3c01cb8a13b10b,1.0,,no_connective,P1_P2,False,"I can't believe that you just said ""So what if..."
4,11,029bc4e01ac943f87837556b32d5627a,0.999,0.001414,so,QR,False,So what does he have to do with a debate like ...


We are only interested in the argument score and the argument itself, so only keep that

In [8]:
raw_data = raw_data[["GoodSliderMean", "Phrase.x"]]

Now, we only have the arguments and its annotated score

In [9]:
raw_data.head()

Unnamed: 0,GoodSliderMean,Phrase.x
0,1.0,"Sorry for the length of the post, but I hope i..."
1,1.0,I am all for the death penalty.
2,1.0,I am pro death penalty.
3,1.0,"I can't believe that you just said ""So what if..."
4,0.999,So what does he have to do with a debate like ...


The data is now ordered by topic and by argument score, so shuffle it before using it for classification

In [10]:
raw_data = raw_data.sample(frac = 1)

The data is now properly shuffled

In [11]:
raw_data.head()

Unnamed: 0,GoodSliderMean,Phrase.x
1032,0.328,If anti-gunners are looking for a pro-gun Bell...
906,0.43975,"See, that's one of the flaws that I see in red..."
1208,0.0975,"so do we trash the whole thing, or what?"
362,0.667333,"But weeks later, Mexican authorities still hav..."
877,0.33075,"If not they would just put them down, like you..."


In [12]:
sentences = raw_data["Phrase.x"].values

In [13]:
labels = raw_data["GoodSliderMean"].values
negatives = labels[labels < 0.55]
positives = labels[labels >= 0.55]
print("Negatives: {0}, Positives: {1}".format(len(negatives) / len(raw_data), len(positives) / len(raw_data)))

labels[labels < 0.6] = 0
labels[labels >= 0.6] = 1
                                                 


Negatives: 0.493953488372093, Positives: 0.5060465116279069


Define a function that encodes the sentences into usable vectors for classification

In [14]:
def encode_sentences(sentences):
    scaler = MinMaxScaler() 
    dataset = []
    for i in range(len(sentences)):
        # tokenize the sentence
        tokens = nltk.word_tokenize(sentences[i])
        # tag the all the tokens
        pos_tokens = nltk.pos_tag(tokens)
        # for each pos tag, count how many times it occurs in the sentence
        pos_dict = generate_count_dict()
        for tag in pos_tokens:
            if tag[1] in pos_dict:
                pos_dict[tag[1]] = 1
        # get the pos tag counts as features 
        feature_vector = list(pos_dict.values())
        # compute the average word length of the sentence
        n_chars = 0
        avg_len = 0
        for token in tokens:
            if(token not in ".,?!"):
                n_chars += len(token)
        avg_len = n_chars / len(tokens)
        
        feature_vector.append(avg_len)
        dataset.append(feature_vector)
    #dataset = pd.DataFrame(dataset)
    #dataset[len(feature_vector) - 1] = dataset[len(feature_vector) - 1] / max(dataset[len(feature_vector) - 1])
    #return dataset.values
    return dataset
    

In [15]:
def generate_count_dict():
    tags = ['IN', 'PRP', 'VBP', 'TO', 'VB', 'JJ', 'NN', 'VBZ', 'VBN', 'DT', 'NNP', 'VBD', '.',
           'CC', 'CD', 'EX', 'FW', 'JJR', 'JJS', 'LS', 'WP', 'WP$', 'WRB']
    dic = dict.fromkeys(tags, 0)
    return dic
    

In [16]:
dataset = encode_sentences(sentences)

In [17]:
N = len(dataset)
Xtrain = dataset[0:int(0.8*len(dataset))]
Ytrain = labels[0:int(0.8*len(dataset))]

Xtest = dataset[int(0.8*len(dataset)):]
Ytest = labels[int(0.8*len(dataset)):]

Fit a MLP on the dataset

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
lm = LogisticRegression()
svm =  LinearSVC()

In [19]:
mlp.fit(Xtrain, Ytrain)
lm.fit(Xtrain, Ytrain)
svm.fit(Xtrain, Ytrain)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

See how well the classifiers score

In [20]:
print(mlp.score(Xtest, Ytest))
print(lm.score(Xtest, Ytest))
print(svm.score(Xtest, Ytest))

0.6353488372093024
0.6316279069767442
0.6241860465116279


Define a function that splits text into argument sentences and non-argument/bad argument sentences

In [21]:
def filter_post(text):
    sentences = text.split('.')[:-1] 
    encoded_sentences = encode_sentences(sentences)
    filtered_text = []
    removed_sentences = []
    for i in range(len(encoded_sentences)):
        prediction = mlp.predict([encoded_sentences[i]])
        if(prediction == 1):
            filtered_text.append(sentences[i])
        else:
            removed_sentences.append(sentences[i])

    print("{0} filtered from the input text!".format(len(removed_sentences)))
    return filtered_text, removed_sentences
        
    

Clas

In [22]:
OP_post = "The classic example of this would be a person’s Grandparents that grew up in a different time where it was not considered a bad thing to classify and ultimately look down on a person based on their perceived racial category. Many of not most of the time this person has had limited sustained contact with people of other races and is at times arrogantly used to making broad statements about members of certain racial categories. Most people people don’t have any problems referring their grandparents as racists. Their way of thinking didn’t and doesn’t automatically prevent them from being a good parent, spouse, grandparent, citizen, etc. The difficulty today is that people are unwilling to come to terms with the fact that they have racist beliefs, often inherited from older generations that can subtly (and not so subtly) come out in their interactions with people of different races. Racist to them equal someone that is evil and it feels bad to be labeled that way. Ultimately being a racist is a negative quality that many otherwise good people have. Being racist can and does cause harm to others and people should be proactive in changing their racist beliefs as a method of self improvement. There certainly are evil people who maliciously harbor racial hatred against others that are racists too. We often consider them more representative of the term Racist than your Grandparents or instance but this is just a matter of degree.If a person calls you a racist there is always the possibility that they are right. This should be an opportunity for self reflection rather than an automatic denial."

text, filtered = filter_post(OP_post)

7 filtered from the input text!


In [23]:
text

['The classic example of this would be a person’s Grandparents that grew up in a different time where it was not considered a bad thing to classify and ultimately look down on a person based on their perceived racial category',
 ' The difficulty today is that people are unwilling to come to terms with the fact that they have racist beliefs, often inherited from older generations that can subtly (and not so subtly) come out in their interactions with people of different races',
 ' Racist to them equal someone that is evil and it feels bad to be labeled that way',
 ' Ultimately being a racist is a negative quality that many otherwise good people have',
 ' Being racist can and does cause harm to others and people should be proactive in changing their racist beliefs as a method of self improvement']

In [24]:
filtered

[' Many of not most of the time this person has had limited sustained contact with people of other races and is at times arrogantly used to making broad statements about members of certain racial categories',
 ' Most people people don’t have any problems referring their grandparents as racists',
 ' Their way of thinking didn’t and doesn’t automatically prevent them from being a good parent, spouse, grandparent, citizen, etc',
 ' There certainly are evil people who maliciously harbor racial hatred against others that are racists too',
 ' We often consider them more representative of the term Racist than your Grandparents or instance but this is just a matter of degree',
 'If a person calls you a racist there is always the possibility that they are right',
 ' This should be an opportunity for self reflection rather than an automatic denial']

In [25]:
counter = "Ok so I'm not if I should be changing your view on if racists are evil, or if racists are good people. Personally I don't believe racists are good people, I also don't believe they are bad or evil people either, UNLESS they use their ignorance to either do harm or not do anything. For example someone doesn't like black people and refuses to hire them at their work, or refuses to help a black person who is in need.By that same token it's the same with people who don't like kids for instance. You're not good, bad, or evil, UNLESS you use it to harm or not help. Like people who don't like kids and refuse to help children in need, or hurt children. Racial biases, stereotypes, and prejudices exist in the older generations for sure, and no I wouldn't say it is evil, but I wouldn't classify them as good people either. My grandmother is a racist and I don't classify her as good, because it's ignorant as fuck. I also don't classify her as evil for holding beliefs that she hasn't promoted or done harm with. Doesn't mean I have to like her or associate with her for example."
text, filtered = filter_post(counter)

8 filtered from the input text!


In [26]:
text

[" Personally I don't believe racists are good people, I also don't believe they are bad or evil people either, UNLESS they use their ignorance to either do harm or not do anything",
 " For example someone doesn't like black people and refuses to hire them at their work, or refuses to help a black person who is in need"]

In [27]:
filtered

["Ok so I'm not if I should be changing your view on if racists are evil, or if racists are good people",
 "By that same token it's the same with people who don't like kids for instance",
 " You're not good, bad, or evil, UNLESS you use it to harm or not help",
 " Like people who don't like kids and refuse to help children in need, or hurt children",
 " Racial biases, stereotypes, and prejudices exist in the older generations for sure, and no I wouldn't say it is evil, but I wouldn't classify them as good people either",
 " My grandmother is a racist and I don't classify her as good, because it's ignorant as fuck",
 " I also don't classify her as evil for holding beliefs that she hasn't promoted or done harm with",
 " Doesn't mean I have to like her or associate with her for example"]