In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import pickle
from sklearn.metrics import accuracy_score,roc_auc_score,precision_recall_fscore_support
import re
from nltk.util import ngrams
import nltk
import csv
import string

In [2]:
def split_test_validation(filename):
    raw_terms = pickle.load(open(filename,'rb')).drop(['TERM_VECTOR'], axis=1)
    terms = raw_terms
    #terms = raw_terms[raw_terms['OCCURENCES']>2]
    terms.drop(['NGRAM','OCCURENCES'], axis=1, inplace=True)
    bterm = terms[terms['BUSINESS']==1]['TERM']
    nbterm = terms[terms['BUSINESS']==0]['TERM']
    clean_term = terms[~((terms['BUSINESS']==0) & (terms['TERM'].isin(bterm))) & ~((terms['BUSINESS']==1) & (terms['TERM'].isin(nbterm)))]
    
    print(clean_term.shape)
    print(clean_term.groupby('BUSINESS')['BUSINESS'].count())
    
    business_terms = clean_term[clean_term['BUSINESS']==1]
    nonbusiness_terms = clean_term[clean_term['BUSINESS']==0]
    
    business_train, business_test_validation = train_test_split(business_terms, test_size = 0.3)
    business_test, business_validation = train_test_split(business_test_validation, test_size = 1/3)
    nonbusiness_train, nonbusiness_test_validation = train_test_split(nonbusiness_terms, test_size = 0.3)
    nonbusiness_test, nonbusiness_validation = train_test_split(nonbusiness_test_validation, test_size = 1/3)
    del business_test_validation
    del nonbusiness_test_validation
    
    #nonbusiness_train = nonbusiness_train.sample(n=len(business_train))
    #nonbusiness_validation = nonbusiness_validation.sample(n=len(business_validation))
    #nonbusiness_test = nonbusiness_test.sample(n=len(business_test))

    train_labels = pd.concat([business_train,nonbusiness_train]).drop(['BUSINESS'],axis=1)
    train_labels['SET'] = 'TRAIN'
    validation_labels = pd.concat([business_validation,nonbusiness_validation]).drop(['BUSINESS'],axis=1)
    validation_labels['SET'] = 'VALIDATION'
    test_labels = pd.concat([business_test,nonbusiness_test]).drop(['BUSINESS'],axis=1)
    test_labels['SET'] = 'TEST'
    
    train_validation_test = pd.concat([train_labels, validation_labels, test_labels])
    
    return train_validation_test

In [3]:
def create_features_labels(filename, vectortype, train_validation_test):
    raw_terms = pickle.load(open(filename,'rb')).rename(columns={'TERM_VECTOR':'FEATURE_VECTOR'})
    bterm = raw_terms[raw_terms['BUSINESS']==1]['TERM']
    nbterm = raw_terms[raw_terms['BUSINESS']==0]['TERM']
    raw_terms = raw_terms[~((raw_terms['BUSINESS']==0) & (raw_terms['TERM'].isin(bterm))) & ~((raw_terms['BUSINESS']==1) & (raw_terms['TERM'].isin(nbterm)))]
    
    if vectortype == 'tfidf':
        terms_count = raw_terms.groupby('TERM')['TERM'].count()
        terms =  raw_terms[raw_terms['TERM'].map(terms_count)>4]
        #terms['FEATURE_VECTOR'] = terms.apply(lambda x: [x['TERM_VECTOR{}'.format(y)] for y in range(0,10)],axis=1)
        terms.drop(['NGRAM','ROWNUM'], axis=1, inplace=True)
    else:
        terms = raw_terms
        #terms = raw_terms[raw_terms['OCCURENCES']>2]
        #terms['FEATURE_VECTOR'] = terms.apply(lambda x: [x['TERM_VECTOR{}'.format(y)] for y in range(0,10)],axis=1)
        #terms.drop(['TERM_VECTOR{}'.format(y) for y in range(0,10)]+['NGRAM','ROWNUM','TERMTYPE'], axis=1, inplace=True)
        terms.drop(['NGRAM','OCCURENCES'], axis=1, inplace=True)
    
    terms = terms.merge(train_validation_test, 'left', 'TERM')
    train_dataset =  terms[terms['SET']=='TRAIN'].drop(['SET'],axis=1)
    validation_dataset =  terms[terms['SET']=='VALIDATION'].drop(['SET'],axis=1)
    test_dataset =  terms[terms['SET']=='TEST'].drop(['SET'],axis=1)

    train_dataset = train_dataset.iloc[np.random.permutation(len(train_dataset))]
    validation_dataset = validation_dataset.iloc[np.random.permutation(len(validation_dataset))]
    test_dataset = test_dataset.iloc[np.random.permutation(len(test_dataset))]
    
    print("Train-")
    print(train_dataset.groupby('BUSINESS')['BUSINESS'].count())
    print("Validation-")
    print(validation_dataset.groupby('BUSINESS')['BUSINESS'].count())
    print("Test-")
    print(test_dataset.groupby('BUSINESS')['BUSINESS'].count())
    
    train_features = np.vstack(train_dataset['FEATURE_VECTOR'].map(lambda x : np.array([0.000001 if y==0 else y for y in x]) ).values)
    train_labels = np.vstack(train_dataset['BUSINESS'].values)
    validation_features = np.vstack(validation_dataset['FEATURE_VECTOR'].map(lambda x : np.array([0.000001 if y==0 else y for y in x]) ).values)
    validation_labels = np.vstack(validation_dataset['BUSINESS'].values)
    test_features = np.vstack(test_dataset['FEATURE_VECTOR'].map(lambda x : np.array([0.000001 if y==0 else y for y in x]) ).values)
    test_labels = np.vstack(test_dataset['BUSINESS'].values)
    
    return train_features, train_labels, validation_features, validation_labels,  test_features, test_labels


# Create Network Architecture

In [4]:
# Network Parameters
def create_architecture(n_input, layers):

    hidden_features = {
        10:[7,5,3,2],
        1000:[666,444,300,200]
    }

    n_hidden_1 = hidden_features[n_input][0] # 1st layer number of features
    n_hidden_2 = hidden_features[n_input][1] # 2nd layer number of features
    n_hidden_3 = hidden_features[n_input][2] # 3rd layer number of features
    n_hidden_4 = hidden_features[n_input][3] # 3rd layer number of features
    n_classes = 1 # total classes (binary)

    # Create model
    def multilayer_perceptron(layers, x, weights, biases):
        # Hidden layer with RELU activation
        layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
        layer_1 = tf.nn.relu(layer_1)

        layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
        layer_2 = tf.nn.relu(layer_2)

        layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
        layer_3 = tf.nn.relu(layer_3)

        layer_4 = tf.add(tf.matmul(layer_3, weights['h4']), biases['b4'])
        layer_4 = tf.nn.relu(layer_4)
            
        #Output layer with linear activation
        out_layer = {
            1: tf.matmul(layer_1, weights['out1']) + biases['out'],
            2: tf.matmul(layer_2, weights['out2']) + biases['out'],
            3: tf.matmul(layer_3, weights['out3']) + biases['out'],
            4: tf.matmul(layer_4, weights['out4']) + biases['out']
                     }
        return out_layer[layers]

    # Store layers weight & bias
    weights = {
        'h1': tf.Variable(tf.random_normal([n_input,n_hidden_1],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_input+
                                                             n_hidden_1+1)),
                                           name="h1")),
        'h2': tf.Variable(tf.random_normal([n_hidden_1,n_hidden_2],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_hidden_1+
                                                             n_hidden_2+1)),
                                           name="h2")),
        'h3': tf.Variable(tf.random_normal([n_hidden_2,n_hidden_3],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_hidden_2+
                                                             n_hidden_3+1)),
                                           name="h3")),
        'h4': tf.Variable(tf.random_normal([n_hidden_3,n_hidden_4],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_hidden_3+
                                                             n_hidden_4+1)),
                                           name="h4")),
        'out1': tf.Variable(tf.random_normal([n_hidden_1,n_classes],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_hidden_1+
                                                             n_classes+1)),
                                           name="out")),
        'out2': tf.Variable(tf.random_normal([n_hidden_2,n_classes],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_hidden_2+
                                                             n_classes+1)),
                                           name="out")),
        'out3': tf.Variable(tf.random_normal([n_hidden_3,n_classes],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_hidden_3+
                                                             n_classes+1)),
                                           name="out")),
        'out4': tf.Variable(tf.random_normal([n_hidden_4,n_classes],
                                           mean=0,
                                           stddev=(np.sqrt(6/n_hidden_4+
                                                             n_classes+1)),
                                           name="out"))
    }
    biases = {
        'b1': tf.Variable(tf.random_normal([n_hidden_1],
                                        mean=0,
                                        stddev=(np.sqrt(6/n_hidden_1+1)),
                                        name="b1")),
        'b2': tf.Variable(tf.random_normal([n_hidden_2],
                                        mean=0,
                                        stddev=(np.sqrt(6/n_hidden_2+1)),
                                        name="b2")),
        'b3': tf.Variable(tf.random_normal([n_hidden_3],
                                        mean=0,
                                        stddev=(np.sqrt(6/n_hidden_3+1)),
                                        name="b3")),
        'b4': tf.Variable(tf.random_normal([n_hidden_4],
                                        mean=0,
                                        stddev=(np.sqrt(6/n_hidden_4+1)),
                                        name="b4")),
        'out': tf.Variable(tf.random_normal([n_classes],
                                        mean=0,
                                        stddev=(np.sqrt(6/n_classes+1)),
                                        name="biasout"))
    }
    
    # Parameters
    learning_rate = 0.001 # the alpha

    # tf Graph input
    x = tf.placeholder("float", [None, n_input])
    y = tf.placeholder("float", [None, n_classes])

    # Construct model
    pred = multilayer_perceptron(layers, x, weights, biases)

    # Define loss and optimizer
    cost = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(pred, y))
    #cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
    #cost = tf.nn.l2_loss(pred-y, name="squared_error_cost")
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    predict_op = tf.nn.sigmoid(pred)
    
    return x, y, cost, optimizer, predict_op


In [5]:
from bokeh.io import push_notebook, show, output_notebook, save
from bokeh.plotting import figure
from bokeh.layouts import gridplot
output_notebook()

epoch_values=[]
cost_values=[]
training_accuracy_values=[]
validation_accuracy_values=[]

p_cost = figure(title="Average Training Cost Graph", plot_height=300, plot_width=600)
r_cost = p_cost.line(epoch_values, cost_values, color="navy", line_width=1, legend="Avg Cost")

p_accuracy = figure(title="Training/Validation Accuracy Graph", plot_height=300, plot_width=600)
r_train_accuracy = p_accuracy.line(epoch_values, training_accuracy_values, color="navy", line_width=1, legend="Training")
r_test_accuracy = p_accuracy.line(epoch_values, validation_accuracy_values, color="firebrick", line_width=1, legend="Validation")

grid = gridplot([[p_cost],[p_accuracy]])

def update(newx, newcost, newtrain, newtest):
    r_cost.data_source.data['x'] = newx
    r_cost.data_source.data['y'] = newcost
    
    r_train_accuracy.data_source.data['x'] = newx
    r_train_accuracy.data_source.data['y'] = newtrain
    
    r_test_accuracy.data_source.data['x'] = newx
    r_test_accuracy.data_source.data['y'] = newtest
    push_notebook()

show(grid, notebook_handle=True)

In [6]:
def run_training_testing(modelname, filename, vectortype, train_validation_test, layers):
    if vectortype == 'word2vec':
        n_input = 1000
    else:
        n_input = 10
    
    max_epochs = 1000
    display_epoch_step = 1
    batch_size = 100
    
    x, y, cost, optimizer, predict_op = create_architecture(n_input, layers)
    
    train_features, train_labels, validation_features, validation_labels,  test_features, test_labels = create_features_labels(filename, vectortype, train_validation_test)
    
    # Initializing the variables
    init = tf.initialize_all_variables()
    
    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # Launch the graph
    current_row = 0 
    num_examples = len(train_labels)
    print('number of samples - {}'.format(num_examples))

    prev_validation_accuracy = 0
    MAX_FAIL = 3
    
    classify_func = np.vectorize(lambda x: 1 if x >=0.5 else 0)

    with tf.Session() as sess:
        sess.run(init)

        # Training cycle
        print('running for maximum of {epochs} epochs or validation failed {MAX_FAIL} times'.format(
                epochs=max_epochs, 
                MAX_FAIL=MAX_FAIL))

        epoch = 0
        prev_cost = None
        epsilon = 999
        fail_count = 0
        while True:
            
            if epoch >= max_epochs:
                break
            epoch += 1

            epoch_cost = 0.
            total_batch = int(num_examples/batch_size)
            # Loop over all batches
            global current_row
            current_row = 0
            for i in range(total_batch):

                global current_row
                start = current_row
                current_row += batch_size
                end = current_row

                """
                print(start,end)
                if i >= 10:
                    break"""

                batch_x = train_features[start:end]
                batch_y = train_labels[start:end]

                # Run optimization op (backprop) and cost op (to get loss value)
                sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

                c = sess.run(cost, feed_dict = {x: batch_x, y: batch_y})
                #_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
                # Compute average loss
                epoch_cost += c
            # Display logs per epoch step
            current_cost = epoch_cost/total_batch
            epsilon = ((prev_cost - current_cost) if prev_cost else current_cost)
            prev_cost = current_cost
            
            training_prediction = sess.run(predict_op, feed_dict = {x: train_features, y: train_labels})
            training_accuracy = accuracy_score(train_labels, classify_func(training_prediction))
            validation_prediction = sess.run(predict_op, feed_dict = {x: validation_features, y: validation_labels})
            validation_accuracy = accuracy_score(validation_labels, classify_func(validation_prediction))
            
            if epoch % display_epoch_step == 0:
                print("Epoch:", '%04d' % (epoch), 
                      "cost=", "{:.9f}".format(current_cost),
                      "training accuracy= {}".format(training_accuracy),
                      "validation accuracy= {}".format(validation_accuracy)
                     )
            
            # Add values to graph lists
            epoch_values.append(epoch)
            cost_values.append(current_cost)
            training_accuracy_values.append(training_accuracy)
            validation_accuracy_values.append(validation_accuracy)
            update(epoch_values,cost_values,training_accuracy_values,validation_accuracy_values)
            
            
            if validation_accuracy > prev_validation_accuracy:
                print("saving new best validation model - {}".format(validation_accuracy))
                saver.save(sess, "bestmodel.ckpt")
                fail_count = 0
            else:
                print("reduced validation acc!")
                fail_count += 1
                if fail_count >= MAX_FAIL:
                    break
                    
            prev_validation_accuracy =  validation_accuracy
                    
        print("Optimization Finished!")
        
        saver.restore(sess, "bestmodel.ckpt")
        
        predicts, cost_ = sess.run([predict_op, cost], feed_dict = {x: test_features, y: test_labels})
        print(predicts)
        final_accuracy = accuracy_score(test_labels, classify_func(predicts))
        final_auc = roc_auc_score(test_labels, predicts)
        final_prfs = precision_recall_fscore_support(test_labels, classify_func(predicts), average ='binary')
        final_cost = cost_ / len(test_labels)
        
        print('final results:',
              'accuracy:', final_accuracy,
              'auc:', final_auc,
              'cost:', final_cost,
              'precision,recall,fscore,support',final_prfs,
              'epoch:', epoch
             )
        
        with open('test_results.txt','a') as wfile:
            wfile.write('modelname - {modelname}, accuracy: {accuracy}, auc: {auc}, prfs: {prfs}, cost: {cost}, epoch: {epoch}'.format(
                    modelname = modelname,
                    accuracy = final_accuracy,
                    auc = final_auc,
                    cost = final_cost,
                    prfs = final_prfs,
                    epoch = epoch
                ) + '\n')
        # Test model
        #correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
        # Calculate accuracy
        #accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        #print("Accuracy:", accuracy.eval({x: train_features, y: train_labels}))

  global current_row
  global current_row


In [7]:
train_validation_test = split_test_validation('space_ngram_30krows_word2vec_05_threshold_new.pickle')

epoch_values=[]
cost_values=[]
training_accuracy_values=[]
validation_accuracy_values=[]
update(epoch_values,cost_values,training_accuracy_values,validation_accuracy_values)
model_name = 'space_word2vec_model_05_layers3'

run_training_testing(model_name, 'space_ngram_30krows_word2vec_05_threshold_new.pickle', 'word2vec', train_validation_test, 3)
save(grid, 'chart_{}.html'.format(model_name))

(180023, 2)
BUSINESS
0    129253
1     50770
Name: BUSINESS, dtype: int64
Train-
BUSINESS
0    90477
1    35539
Name: BUSINESS, dtype: int64
Validation-
BUSINESS
0    12926
1     5077
Name: BUSINESS, dtype: int64
Test-
BUSINESS
0    25850
1    10154
Name: BUSINESS, dtype: int64
number of samples - 126016
running for maximum of 1000 epochs or validation failed 3 times
Epoch: 0001 cost= 26663110658.031745911 training accuracy= 0.7401917216861351 validation accuracy= 0.7412653446647781
saving new best validation model - 0.7412653446647781
Epoch: 0002 cost= 13329209258.869840622 training accuracy= 0.77554437531742 validation accuracy= 0.7730378270288285
saving new best validation model - 0.7730378270288285
Epoch: 0003 cost= 9335097841.168254852 training accuracy= 0.7929310563737938 validation accuracy= 0.7891462534022108
saving new best validation model - 0.7891462534022108
Epoch: 0004 cost= 7018162674.692063332 training accuracy= 0.8023901726764855 validation accuracy= 0.7963672721213131




'/mnt/chart_space_word2vec_model_05_layers3.html'

In [None]:
filename= 'space_ngram_30krows_word2vec_10_threshold.pickle'
x, y, cost, optimizer, predict_op = create_architecture(1000, 3)
train_validation_test = split_test_validation(filename)
train_features, train_labels, validation_features, validation_labels,  test_features, test_labels = create_features_labels(filename, 'word2vec', train_validation_test)
saver = tf.train.Saver()
classify_func = np.vectorize(lambda x: 1 if x >=0.5 else 0)

In [8]:
with tf.Session() as sess:
    saver.restore(sess, "bestmodel.ckpt")
    print("Model restored.")
    predicts, cost_ = sess.run([predict_op, cost], feed_dict = {x: test_features, y: test_labels})
    print(predicts)
    final_accuracy = accuracy_score(test_labels, classify_func(predicts))
    final_auc = roc_auc_score(test_labels, predicts)
    final_cost = cost_ / len(test_labels)

    print('final results:',
          'accuracy:', final_accuracy,
          'auc:', final_auc,
          'cost:', final_cost
         )

Model restored.
[[ 0.]
 [ 1.]
 [ 1.]
 ..., 
 [ 0.]
 [ 0.]
 [ 1.]]
final results: accuracy: 0.916266575217 auc: 0.916266575217 cost: 10831214.6173


In [8]:
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='punkt')
sw = set(stopwords.words("english")) | set(i.strip() for i in list(open('stopwordlist.txt'))[1:])
terms_df = pd.read_csv('finalterms_grouped_clean2.csv')
single_terms = set(b for a in terms_df['term'] for b in nltk.word_tokenize(a) if (b not in sw and len(b)> 1))
del stopwords
del terms_df
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
puncs = set(string.punctuation)
tokenizer = WhitespaceTokenizer()
def return_words_from_text(x):
    def return_tokens(fulltext,start,end):
        text = fulltext[start:end]
        return tuple( [
            tuple(t.strip(string.punctuation) for t in tokenizer.tokenize(re.sub(r'\d+','SOMENUM',text.lower()))), 
            tuple((t[0]+start,t[1]+start) for t in tokenizer.span_tokenize(text))
                ]  )
    return tuple(return_tokens(x.strip(), sent[0], sent[1]) for sent in sent_detector.span_tokenize(x.strip()))

def get_ngrams(x, ngram_start, ngram_end):
    def get_ngram(sent, i):
        span = [(s[0][0],s[-1][-1]) for s in ngrams(sent[1], i)]
        the_ngrams = list(ngrams(sent[0], i))
        #print(len(span))
        #print(len(the_ngrams))
        if len(the_ngrams) > 0:
            return zip(the_ngrams, span)
        else:
            return []
    
    def remove_punc(ngram):
        return tuple([tuple(gram for gram in ngram[0] if not all(j in puncs for j in gram)), ngram[1]])
    def check_num_only(ngram):
        #print(ngram)
        return set(ngram[0]) != set('SOMENUM')
          
    ngram_list = defaultdict(list)
    for sent in x:
        #print(sent)
        for i in range(ngram_start,ngram_end+1):
            for gram in map(remove_punc, get_ngram(sent, i)):
                if ~check_num_only(gram):
                    ngram_list[gram[0]].append(gram[1])
    return ngram_list

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
from gensim.models import Word2Vec
model = Word2Vec.load("space_100features_5minwords_10context_300karticles.bin")
allwords = set(model.index2word)
def get_vector(term):
    vector = [model[unigram] if unigram in allwords else np.zeros(100) for unigram in term.lower().split(' ') ]
    return np.concatenate(vector + [np.zeros(100)]*(10-len(vector)), axis=0)

INFO:gensim.utils:Pattern library is not installed, lemmatization won't be available.
INFO:gensim.corpora.sharded_corpus:Could not import Theano, will use standard float for default ShardedCorpus dtype.
INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English
INFO:gensim.utils:loading Word2Vec object from space_100features_5minwords_10context_300karticles.bin
INFO:gensim.utils:loading syn1neg from space_100features_5minwords_10context_300karticles.bin.syn1neg.npy with mmap=None
INFO:gensim.utils:loading syn0 from space_100features_5minwords_10context_300karticles.bin.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:setting ignored attribute syn0norm to None


In [10]:
new_text = """AT&T will pay $107.50 for each Time Warner share, in a combination of cash and stock, worth $85.4bn overall, according to a statement.
AT&T said it expected to close the deal to be completed by the end of 2017.
Other media company shares, including Discovery, AMC, Netflix and CBS, recently rose as investors speculated that a deal could spark a fresh wave of takeovers and mergers among media and technology companies.
AT&T, which has a market value of about $238bn, has already made moves to turn itself into a media powerhouse, buying satellite TV provider DirecTV last year for $48.5bn.
Time Warner chief executive Jeff Bewkes has, however, resisted selling in the past. The company rejected an $80bn offer from Twenty-First Century Fox Inc in 2014.
The deal gives AT&T access to a major producer of content as it seeks to diversify away from its core telecoms business. Rival Verizon is currently in negotiations to buy Yahoo and has already bought AOL, owner of Huffington Post.
Some analysts, however, question whether AT&T needs to mount a complete takeover of Time Warner.
"""
#new_text = new_text.replace("\n"," ")
ngram_of_article = get_ngrams(return_words_from_text(new_text), 1, 10)

In [11]:
tf.reset_default_graph()
x, y, cost, optimizer, predict_op = create_architecture(1000, 3)
saver = tf.train.Saver()
classify_func = np.vectorize(lambda x: 1 if x >=0.5 else 0)
with tf.Session() as sess:
    saver.restore(sess, "bestmodel_10_new3.ckpt")
    print("Model restored.")
    
    for i in ngram_of_article:
        vector = np.array([0.000001 if y==0 else y for y in get_vector(i[0]) ])
        #print(ngram_of_article[i][1])
        predicts = sess.run(predict_op, feed_dict = {x: [vector]})
        ngram_of_article[i] = (ngram_of_article[i], vector, classify_func(predicts)[0][0])

Model restored.


In [16]:
bold_range = np.zeros(len(new_text))
bold_range.shape
for gram in ngram_of_article:
    if ngram_of_article[gram][2]==1:
        print(gram)
        #print(ngram_of_article[gram][0])
        for span in ngram_of_article[gram][0]:
            bold_range[span[0]:span[1]] = 1

('turn', 'itself', 'into')
('pay', 'SOMENUM.SOMENUM', 'for', 'each', 'time')
('value', 'of', 'about', 'SOMENUMbn')
('deal', 'could', 'spark', 'a', 'fresh', 'wave', 'of', 'takeovers')
('pay', 'SOMENUM.SOMENUM', 'for', 'each', 'time', 'warner')
('share', 'in', 'a', 'combination', 'of', 'cash', 'and', 'stock', 'worth', 'SOMENUM.SOMENUMbn')
('investors', 'speculated', 'that', 'a')
('pay', 'SOMENUM.SOMENUM', 'for', 'each')
('negotiations', 'to', 'buy')
('end', 'of', 'SOMENUM')
('market',)
('buy', 'yahoo', 'and', 'has', 'already', 'bought')
('offer', 'from', 'twenty-first', 'century', 'fox', 'inc')
('SOMENUMbn', 'has', 'already', 'made', 'moves', 'to', 'turn', 'itself', 'into')
('value', 'of', 'about', 'SOMENUMbn', 'has', 'already', 'made', 'moves', 'to')
('expected', 'to', 'close')
('discovery', 'amc', 'netflix', 'and', 'cbs', 'recently', 'rose', 'as', 'investors', 'speculated')
('past',)
('takeover', 'of')
('moves', 'to', 'turn', 'itself', 'into')
('time', 'warner', 'share', 'in', 'a', 'co

In [17]:
len(bold_range)

1082

In [18]:
sum(bold_range)

825.0

In [19]:
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
html_string = ""
temp_bold = ''
for idx,i in enumerate(bold_range):
    if i==1:
        temp_bold += new_text[idx]
    else:
        if temp_bold:
            html_string += "<b>{}</b>".format(temp_bold)
            temp_bold = ""
        html_string += new_text[idx]
display(HTML(html_string))

In [62]:
html_string

"<b>South Korea registered a trade deficit of $101 million in October, reflecting the country's</b> <b>economic sluggishness, according to government figures released Wednesday.</b> Preliminary tallies by the <b>Trade and Industry Ministry showed another trade deficit in October, the fifth monthly setback this year, casting a cloud on South Korea's export-oriented economy.</b> Exports in October <b>stood at $5.29 billion, a mere 0.7% increase from a</b> <b>year earlier, while imports increased sharply to $5.39 billion, up</b> 20% from last October. <b>South Korea's economic boom, which began in 1986, stopped this year because of prolonged labor disputes, trade conflicts and sluggish exports.</b> <b>Government officials said exports at the end of the year would remain under a government target of $68 billion.</b> Despite the gloomy forecast, <b>South Korea has recorded a trade surplus of $71 million so far this year.</b> From January to October, the nation's <b>accumulated exports incre

In [None]:
for i in [3,4,5,6,7,8,9]:
    train_validation_test = split_test_validation('ngram_10krows_word2vec_0{}_threshold.pickle'.format(i))
    for n_layer in [3]:
        
        epoch_values=[]
        cost_values=[]
        training_accuracy_values=[]
        validation_accuracy_values=[]
        update(epoch_values,cost_values,training_accuracy_values,validation_accuracy_values)
        model_name = 'word2vec_model_0{}_layers{}'.format(i, n_layer)

        run_training_testing(model_name, 'ngram_10krows_word2vec_0{}_threshold.pickle'.format(i),'word2vec', train_validation_test, n_layer)
        save(grid, 'chart_{}.html'.format(model_name))

(407786, 2)
BUSINESS
0    319073
1     88713
Name: BUSINESS, dtype: int64
Train-
BUSINESS
0    62099
1    62099
Name: BUSINESS, dtype: int64
Validation-
BUSINESS
0    8872
1    8872
Name: BUSINESS, dtype: int64
Test-
BUSINESS
0    17742
1    17742
Name: BUSINESS, dtype: int64
number of samples - 124198
running for maximum of 1000 epochs or validation failed 6 times
Epoch: 0001 cost= 30735937039.265110016 training accuracy= 0.6617658899499187 validation accuracy= 0.6601104598737602
saving new best validation model - 0.6601104598737602
Epoch: 0002 cost= 16694745769.566478729 training accuracy= 0.6952044316333597 validation accuracy= 0.6946009918845807
saving new best validation model - 0.6946009918845807
Epoch: 0003 cost= 11850029181.421434402 training accuracy= 0.7112513889112546 validation accuracy= 0.7067741208295762
saving new best validation model - 0.7067741208295762
Epoch: 0004 cost= 9006856486.575342178 training accuracy= 0.7240132691347687 validation accuracy= 0.717143823264202


In [25]:
train_validation_test = split_test_validation('ngram_10krows_ml_kn.pickle')
for i in ['ngram_10krows_ml_kn.pickle','ngram_10krows_tfidf.pickle','ngram_10krows_word2vec_05_threshold.pickle']:
    for n_layer in [1,2,3,4]:
        epoch_values=[]
        cost_values=[]
        training_accuracy_values=[]
        validation_accuracy_values=[]
        
        update(epoch_values,cost_values,training_accuracy_values,validation_accuracy_values)
        model_name = '{}_layers{}'.format(i.replace('.pickle',''), n_layer)
        
        run_training_testing(model_name, i, re.search('ngram_10krows_([^_\.]*)[_\.]',i).groups(1)[0], train_validation_test, n_layer)
        save(grid, 'chart_{}.html'.format(model_name))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


number of samples - 59760
running for maximum of 1000 epochs or validation failed 6 times
Epoch: 0001 cost= 85.851437605 training accuracy= 0.5661646586345381 validation accuracy= 0.5124631888935633
saving new best validation model - 0.5124631888935633
Epoch: 0002 cost= 71.494304478 training accuracy= 0.5732597054886212 validation accuracy= 0.5154606647034077
saving new best validation model - 0.5154606647034077
Epoch: 0003 cost= 68.367247379 training accuracy= 0.5825635876840696 validation accuracy= 0.5298695835086243
saving new best validation model - 0.5298695835086243
Epoch: 0004 cost= 67.396349065 training accuracy= 0.5880354752342705 validation accuracy= 0.5378365586874211
saving new best validation model - 0.5378365586874211
Epoch: 0005 cost= 67.014964361 training accuracy= 0.5924196787148595 validation accuracy= 0.5497212873369793
saving new best validation model - 0.5497212873369793
Epoch: 0006 cost= 66.809565648 training accuracy= 0.5942938420348058 validation accuracy= 0.557



number of samples - 59760
running for maximum of 1000 epochs or validation failed 6 times
Epoch: 0001 cost= 82.922028316 training accuracy= 0.5880856760374833 validation accuracy= 0.5976020193521245
saving new best validation model - 0.5976020193521245
Epoch: 0002 cost= 68.867684363 training accuracy= 0.5918674698795181 validation accuracy= 0.6215555321834245
saving new best validation model - 0.6215555321834245
Epoch: 0003 cost= 67.415122404 training accuracy= 0.5932898259705489 validation accuracy= 0.6348601177955406
saving new best validation model - 0.6348601177955406
Epoch: 0004 cost= 66.974214238 training accuracy= 0.5937248995983936 validation accuracy= 0.6422749263777872
saving new best validation model - 0.6422749263777872
Epoch: 0005 cost= 66.690780966 training accuracy= 0.5939591700133868 validation accuracy= 0.6503733697938578
saving new best validation model - 0.6503733697938578
Epoch: 0006 cost= 66.478370852 training accuracy= 0.5956827309236947 validation accuracy= 0.657

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


number of samples - 300195
running for maximum of 1000 epochs or validation failed 6 times
Epoch: 0001 cost= 75.172118692 training accuracy= 0.5985276237112543 validation accuracy= 0.5965834077906838
saving new best validation model - 0.5965834077906838
Epoch: 0002 cost= 67.024654969 training accuracy= 0.6041206549076433 validation accuracy= 0.584101244215664
reduced validation acc!
Epoch: 0003 cost= 66.346618721 training accuracy= 0.605826212961575 validation accuracy= 0.5800805472948645
reduced validation acc!
Epoch: 0004 cost= 65.895954899 training accuracy= 0.6088209330601776 validation accuracy= 0.5736194273674103
reduced validation acc!
Epoch: 0005 cost= 65.532518701 training accuracy= 0.6117290427888539 validation accuracy= 0.5702588448664435
reduced validation acc!
Epoch: 0006 cost= 65.252282060 training accuracy= 0.6124785556055231 validation accuracy= 0.5672116500193367
reduced validation acc!
Epoch: 0007 cost= 65.055188416 training accuracy= 0.6140575292726395 validation acc

In [1]:
from gensim.models import Word2Vec
model = Word2Vec.load_word2vec_format('/media/sf_vmSharedFolder/GoogleNews-vectors-negative300.bin',binary=True)
model.similarity('king','queen')

### ML(smoothing) vs TFIDF vs Word2Vec , know the limit
### run till convergence
### grid search

In [38]:
from sklearn.svm import LinearSVC 

In [39]:
clf = LinearSVC()
clf.fit(train_features, train_labels.ravel()) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [40]:
clf.score(test_features, test_labels.ravel())

0.64104818853323953