In [1]:
import gensim
import keras
import numpy as np
from CMVwebscraper import *

Using TensorFlow backend.


Load Google's word2vec model

In [2]:
# Load the pretrained word vectors from a binary word2vec-format file.
# The resulting object is used as the global `word2vec` lookup inside
# encode_post (membership test and vector retrieval by word).
word2vec = gensim.models.KeyedVectors.load_word2vec_format ('models/GoogleNews-vectors-negative300.bin', binary=True)  

In [3]:
#word2vec.similarity("cat", "dog")

Load in our own Keras LSTM models

In [4]:
# Saved Keras model that is later passed to detect_arguments, where its
# per-sentence prediction is thresholded at 0.5 (argument vs. non-argument).
model = keras.models.load_model("models/bilstm_cos.h5")

In [5]:
# Saved Keras model that is later passed to claim_or_evidence, where its
# score is thresholded at 0.5 (EVIDENCE vs. CLAIM).
model2 = keras.models.load_model("models/bilstm_evidence.h5")

In [6]:
# Saved Keras model that is later passed to get_argument_quality, which
# prints its score for every detected argument.
model3 = keras.models.load_model("models/argument_quality.h5")

In [7]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity

def encode_post(sentences, topic):
    """Encode sentences as per-word feature matrices using the word2vec model.

    Every word is mapped to its word2vec vector (or to a small random vector
    when the word is not in the vocabulary) and extended with one extra
    feature: the cosine similarity between that word vector and the average
    vector of the topic's words.

    Parameters
    ----------
    sentences : list of str
        The sentences to encode.
    topic : str
        The topic the sentences relate to; split on whitespace into words.

    Returns
    -------
    numpy.ndarray
        A 1-D object array with one entry per sentence. Each entry is a 2-D
        array of shape (n_words, n_dims + 1); sentences without any usable
        word yield an array of shape (0, n_dims + 1).

    Notes
    -----
    Out-of-vocabulary words get a fresh random vector on every call, so the
    encoding of such words is not reproducible unless NumPy's global random
    state is seeded beforehand.
    """
    # dimensionality of the embedding space; falls back to the 300 that the
    # original code hard-coded if the model object does not expose it
    n_dims = getattr(word2vec, "vector_size", 300)

    def _embed(word):
        # words known to word2vec use their pretrained vector; any other word
        # is mapped to a random vector of the same dimensionality
        if word in word2vec:
            return word2vec[word]
        return np.random.uniform(low=-0.01, high=0.01, size=n_dims)

    # compute the average word vector of the topic, so that each word
    # from the sentences can be compared to the topic
    topic_word_vectors = [_embed(word) for word in topic.split()]
    if topic_word_vectors:
        # average over all topic words, along each of the dimensions
        avg_topic_vector = np.mean(np.asarray(topic_word_vectors), axis=0)
    else:
        # an empty topic would make np.mean return NaN, which
        # cosine_similarity rejects; use a zero vector instead
        avg_topic_vector = np.zeros(n_dims)

    # tokens starting with one of these characters are not treated as words
    # (characters outside this set, e.g. typographic quotes, are kept)
    non_word_starts = ".,:;[](){}!?-_`'~\"^/1234567890"

    sentence_matrices = []
    for sentence in sentences:
        # get the words of the sentence by means of tokenization,
        # discarding punctuation marks and numbers
        words = [
            token
            for token in nltk.word_tokenize(sentence)
            if token[0] not in non_word_starts
        ]

        rows = []
        for word in words:
            word_vector = _embed(word)
            # similarity between word and topic becomes the last feature
            similarity = cosine_similarity([word_vector], [avg_topic_vector])
            rows.append(np.append(word_vector, similarity))

        if rows:
            sentence_matrices.append(np.asarray(rows))
        else:
            # keep a well-formed 2-D shape even for empty sentences
            sentence_matrices.append(np.empty((0, n_dims + 1)))

    # Sentences differ in length, so the result is ragged. Build the object
    # array explicitly: np.asarray on a ragged list raises on recent NumPy
    # versions, and would silently produce a 3-D array when all sentences
    # happen to have the same number of words.
    encoded_sentences = np.empty(len(sentence_matrices), dtype=object)
    for i, matrix in enumerate(sentence_matrices):
        encoded_sentences[i] = matrix

    return encoded_sentences
        

    

In [8]:
def detect_arguments(model, post, topic):
    """Split a post into sentences and classify each as argument or not.

    Parameters
    ----------
    model : object
        Model whose ``predict`` is called with one encoded sentence reshaped
        to (1, n_words, n_features).
    post : str
        The text of the post; it is split into sentences on ".".
    topic : str
        The topic of the post, forwarded to encode_post.

    Returns
    -------
    arguments : list of tuple
        ``(index, prediction, sentence)`` for sentences whose prediction is
        greater than 0.5.
    non_arguments : list of tuple
        ``(index, prediction, sentence)`` for all other sentences. Sentences
        without any encoded word are never fed to the model; their
        ``prediction`` is ``None``.
    encoded_sentences
        The encoded sentences as returned by encode_post, indexable by the
        same sentence index.
    """
    # split text into sentences
    post_sentences = post.split(".")
    # encode the sentences
    encoded_sentences = encode_post(post_sentences, topic)

    # lists to store the classified arguments and non-arguments
    arguments = []
    non_arguments = []

    # feed sentences into the model and get a prediction for each
    for i in range(len(encoded_sentences)):
        encoded_sentence = encoded_sentences[i]
        n_words = encoded_sentence.shape[0]

        # Empty sentences cannot be scored. Record them without a prediction
        # instead of reusing the previous sentence's prediction (or hitting a
        # NameError when the very first sentence is empty).
        if n_words == 0:
            non_arguments.append((i, None, post_sentences[i]))
            continue

        n_features = encoded_sentence.shape[1]
        prediction = model.predict(
            encoded_sentence.reshape(1, n_words, n_features),
            batch_size=1,
            verbose=0,
        )
        if prediction > 0.5:
            arguments.append((i, prediction, post_sentences[i]))
        else:
            non_arguments.append((i, prediction, post_sentences[i]))

    return arguments, non_arguments, encoded_sentences

In [20]:
# `search` is not defined in this notebook; presumably it is provided by the
# wildcard import of CMVwebscraper at the top -- TODO confirm.
# The call is unpacked into six values; only OPtxt and TOPIC are used below.
OPname, OPtxt, CommentText, commentName, URL, TOPIC = search("Society")

In [21]:
# Keep everything after the first ":" of the title. Splitting only once keeps
# titles that contain further colons intact, and taking the last piece falls
# back to the whole title when there is no colon at all (the previous
# `split(":")[1]` raised IndexError in that case).
topic = TOPIC.split(":", 1)[-1]
topic

' Society does not need to change how it treats gender'

In [22]:

#topic = "People who stay behind and try to ride out a hurricane, and later need to be rescued, should be billed for the cost of their rescue unless they can prove that they either had to stay for work or didn't have the means to leave."




# The text of the original post is what gets analysed in the cells below.
# NOTE(review): the captured output below is a post about astrology while the
# captured `topic` above is about gender, and the execution counts jump
# (8 -> 20, 28 -> 40). The outputs likely stem from different runs; do a
# Restart & Run All before relying on them.
OP_post = OPtxt
OP_post

'Astrology and horoscopes makes no sense. It is statistically innacurate to describe people’s traits based on their birth date and time. Theres no valid science behind it. It makes absolutely no sense to read daily horoscopes in newspapers. There is no rational way to explain how 1/12 of the population will inexplicably “meet the woman of your dreams” the same day. It is just smoke and mirrors. The people who write that nonsense basically overshoot and try to layout multiple guesses for gullible people to believe. Most the time it doesn’t hit anything and when it does most people are like “OMG! How did he knew!?” and right away completely ignore that 99% of the time it is wrong.I also hate it when people ask my sign and when I tell them usually (specially women) put this face like “oh I figured!” And they frame me into a stupid stereotype.I think people take it for granted that astrology is a fact, and never stop to think about it.I won’t tell you my sun sign. Want to guess?Edit: I may

In [23]:
# Classify every sentence of the post; `encoded_sentences` is kept so the
# later cells can reuse the encodings by sentence index.
arguments, non_arguments, encoded_sentences = detect_arguments(model, OP_post, topic)

In [24]:
arguments

[(1,
  array([[0.7049375]], dtype=float32),
  ' It is statistically innacurate to describe people’s traits based on their birth date and time'),
 (4,
  array([[0.9655996]], dtype=float32),
  ' There is no rational way to explain how 1/12 of the population will inexplicably “meet the woman of your dreams” the same day')]

In [25]:
non_arguments

[(0,
  array([[0.00327707]], dtype=float32),
  'Astrology and horoscopes makes no sense'),
 (2,
  array([[0.1595855]], dtype=float32),
  ' Theres no valid science behind it'),
 (3,
  array([[0.00989058]], dtype=float32),
  ' It makes absolutely no sense to read daily horoscopes in newspapers'),
 (5, array([[0.48778975]], dtype=float32), ' It is just smoke and mirrors'),
 (6,
  array([[0.18074086]], dtype=float32),
  ' The people who write that nonsense basically overshoot and try to layout multiple guesses for gullible people to believe'),
 (7,
  array([[0.14279671]], dtype=float32),
  ' Most the time it doesn’t hit anything and when it does most people are like “OMG! How did he knew!?” and right away completely ignore that 99% of the time it is wrong'),
 (8,
  array([[0.02493911]], dtype=float32),
  'I also hate it when people ask my sign and when I tell them usually (specially women) put this face like “oh I figured!” And they frame me into a stupid stereotype'),
 (9,
  array([[0.108

In [26]:
def claim_or_evidence(model, arguments, sentences, encoded_sentences):
    """Label every detected argument as "EVIDENCE" or "CLAIM".

    Parameters
    ----------
    model : object
        Model whose ``predict`` is called with one encoded sentence reshaped
        to (1, n_words, n_features).
    arguments : iterable of tuple
        Tuples whose first element is the index of the sentence.
    sentences : str
        The full text; it is split on "." and indexed with the same indices.
    encoded_sentences
        Encoded sentences, indexable by sentence index.

    Returns
    -------
    list of tuple
        ``(score[0], label, sentence)`` for every argument, in input order.
    """
    sentence_list = sentences.split(".")
    labelled = []

    for arg in arguments:
        # the sentence index is the first element of the argument tuple
        position = arg[0]
        matrix = encoded_sentences[position]

        word_count = matrix.shape[0]
        feature_count = matrix.shape[1]
        batch = matrix.reshape(1, word_count, feature_count)
        score = model.predict(batch, batch_size=1, verbose=0)
        print(score[0])

        # The model discriminates between evidence and non-evidence sentences.
        # ASSUMPTION: an argument that is not evidence is a claim.
        label = "EVIDENCE" if score >= 0.5 else "CLAIM"
        labelled.append((score[0], label, sentence_list[position]))

    return labelled

In [27]:
# Label the detected arguments, reusing the encodings from detect_arguments.
classified_arguments = claim_or_evidence(model2, arguments, OP_post, encoded_sentences)

[0.00424341]
[0.00402689]


In [28]:
classified_arguments

[(array([0.00424341], dtype=float32),
  'CLAIM',
  ' It is statistically innacurate to describe people’s traits based on their birth date and time'),
 (array([0.00402689], dtype=float32),
  'CLAIM',
  ' There is no rational way to explain how 1/12 of the population will inexplicably “meet the woman of your dreams” the same day')]

In [40]:
def get_argument_quality(model, arguments, sentences, encoded_sentences):
    """Score every detected argument with the given model.

    Each score is printed together with its sentence and is also returned,
    so the scores can be used by later cells.

    Parameters
    ----------
    model : object
        Model whose ``predict`` is called with one encoded sentence reshaped
        to (1, n_words, n_features).
    arguments : iterable of tuple
        Tuples whose first element is the index of the sentence.
    sentences : str
        The full text; it is split on "." and indexed with the same indices.
    encoded_sentences
        Encoded sentences, indexable by sentence index.

    Returns
    -------
    list of tuple
        ``(score[0], sentence)`` for every argument, in input order.
    """
    sentences = sentences.split(".")
    scored_arguments = []

    # score each argument
    for arg in arguments:
        # take the index from the tuple
        idx = arg[0]
        # take the encoded sentence corresponding to this argument
        encoded_sentence = encoded_sentences[idx]

        n_words = encoded_sentence.shape[0]
        n_features = encoded_sentence.shape[1]
        score = model.predict(
            encoded_sentence.reshape(1, n_words, n_features),
            batch_size=1,
            verbose=0,
        )
        print(score[0], sentences[idx])
        scored_arguments.append((score[0], sentences[idx]))

    return scored_arguments
    
    

In [41]:
# Print the quality score of each detected argument next to its sentence.
get_argument_quality(model3, arguments, OP_post, encoded_sentences)

[0.70816755]  It is statistically innacurate to describe people’s traits based on their birth date and time
[0.61725944]  There is no rational way to explain how 1/12 of the population will inexplicably “meet the woman of your dreams” the same day


In [None]:
arguments