# All notebooks as a single notebook

## PART 1 dataprep
reformat dataset for use with TensorFlow and API

In [1]:
!pip install pygithub
!pip install watson_developer_cloud
!wget https://github.com/PubChimps/think2019/blob/master/data/cloudclassifier.npy.zip?raw=true
!mv cloud* clouddata.npz.zip
!unzip clouddata.npz.zip

Collecting pygithub
  Downloading https://files.pythonhosted.org/packages/6d/13/6cf2d64c1de3a1d4892d69e4bea2039f920f373bc06b8962940ff5cde8bd/PyGithub-1.43.5.tar.gz (2.9MB)
[K    100% |████████████████████████████████| 2.9MB 352kB/s eta 0:00:01
[?25hRequirement not upgraded as not directly required: requests>=2.14.0 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from pygithub)
Collecting pyjwt (from pygithub)
  Downloading https://files.pythonhosted.org/packages/87/8b/6a9f14b5f781697e51259d81657e6048fd31a113229cf346880bb7545565/PyJWT-1.7.1-py2.py3-none-any.whl
Collecting Deprecated (from pygithub)
  Downloading https://files.pythonhosted.org/packages/a9/cb/c1a39ee51e3042df8b284e22c9c440ffad1c25f451bddd4bf9a8dc17cd75/Deprecated-1.2.4-py2.py3-none-any.whl
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests>=2.14.0->pygithub)
Requirement not upgraded as not directly required: idn

In [2]:
import re
import json
import nltk 
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.framework import ops
from watson_developer_cloud import NaturalLanguageClassifierV1

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/dsxuser/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
data = np.load('./cloudclassifier.npy')

#example data
print(data[0])

[ 'import subprocess print subprocess.check output . build and push.sh superradiance from scipy import sparse import numpy as np import os import pickle import matplotlib.pyplot as plt import boto3 from sagemaker import get execution role import sagemaker as sage steps 500 Number of simulated time steps N 8 Number of nuclear spins dim 2 N 1 Dimension of Hilbert space of N nuclear and 1 electronic spin Define Nuclear and Electron spin states as density matrices rhoI sparse.csr matrix 1. 0 0 shape int dim 2 int dim 2 All carbon atoms magnetically excited rhoS sparse.csr matrix 1. 1 1 shape 2 2 Electron magnetically excited sparse rho sparse.kron rhoS rhoI Build the system density matrix tempfile tmp tmp.pckl pickle.dump sparse rho open tempfile wb Upload serialized initial state to S3 resource boto3.resource s3 my bucket resource.Bucket sagemaker kessle31 subsitute this for your s3 bucket name. my bucket.upload file tempfile Key superradiance initial state init.pckl Clean up temporary fi

In [4]:
np.random.shuffle(data)
trainset, testset = np.array(data)[:int(len(data)*.8),:], np.array(data)[int(len(data)*.8):,:]

Build out dataset for neural network. Data needs to be tokenized and dictionary collected for bag of words encoding

In [5]:
words = []

labeledlines = []
ignore_words = ['?', ',']

for line in trainset:
    text = str(re.split(r'[.,]', line[0])).replace("'","").replace('[','')
    text = re.sub(r'\b\w{1,1}\b', '', text)
    w = nltk.word_tokenize(text)
    w = [ele for ele in w if ele not in ignore_words]
    words.extend(w)
    labeledlines.append([w, line[1]])
    
testlines = []

for line in testset:
    text = str(re.split(r'[.,]', line[0])).replace("'","").replace('[','')
    text = re.sub(r'\b\w{1,1}\b', '', text)
    w = nltk.word_tokenize(text)
    w = [ele for ele in w if ele not in ignore_words]
    words.extend(w)
    testlines.append([w, line[1]])
    

words = list(set(words))

print('each data example will be transformed into a feature vector of size')
print(len(words))

each data example will be transformed into a feature vector of size
13358


### Translate feature vector via bag of words encoding and label into one-hot encoding

In [6]:
traindata = []

for line in labeledlines:
    bag = []
    code = line[0]
    for w in words:
        bag.append(code.count(w)) 
        
    classes = [0,0,0,0]
    if line[1] == 'amazon':
        classes[0] = 1
    elif line[1] == 'ibm':
        classes[1] = 1
    elif line[1] == 'microsoft':
        classes[2] = 1
    elif line[1] == 'google':
        classes[3] = 1
    else:
        print(line[1])

    traindata.append([bag,classes])
    
testdata = []
for line in testlines:
    bag = []
    code = line[0]
    for w in words:
        bag.append(code.count(w)) 
        
    classes = [0,0,0,0]
    if line[1] == 'amazon':
        classes[0] = 1
    elif line[1] == 'ibm':
        classes[1] = 1
    elif line[1] == 'microsoft':
        classes[2] = 1
    elif line[1] == 'google':
        classes[3] = 1
    else:
        print(line[1])

    testdata.append([bag,classes])
    
x_train = np.array([row[0] for row in traindata]).T
y_train = np.array([row[1] for row in traindata]).T
x_test = np.array([row[0] for row in testdata]).T
y_test = np.array([row[1] for row in testdata]).T

np.save('x_train.npy',x_train)
np.save('y_train.npy',y_train)
np.save('x_test.npy',x_test)
np.save('y_test.npy',y_test)
np.save('testset.npy',testset)

Convert dataset to .csv file for API

In [7]:
d = []
for i in labeledlines:
    for j in re.findall('.{1,1024}', str(i[0]).replace(',', ' ').replace("'",'').replace('[','').replace(']','')):
        d.append({'text': j, 'cloud': i[1]})
        
df = pd.DataFrame(d, columns = ['text', 'cloud'])
df['text'].replace(' ', np.nan, inplace=True)
df = df.dropna()
df.to_csv('cloudtrainingdata.csv', header=False,index=False)

List files via !ls 
There should be a total of eight

In [8]:
!ls

cloudclassifier.npy  cloudtrainingdata.csv  x_test.npy	 y_test.npy
clouddata.npz.zip    testset.npy	    x_train.npy  y_train.npy


## PART 2 NAUTAL LANGUAGE CLASSIFIER API 

In [11]:
natural_language_classifier = NaturalLanguageClassifierV1(
    iam_apikey='YOUR API KEY',
    url='https://gateway.watsonplatform.net/natural-language-classifier/api')

In [12]:
with open('cloudtrainingdata.csv', 'rb') as training_data:
    classifier = natural_language_classifier.create_classifier(
        training_data=training_data,
        metadata='{"name": "pretrained think classifier","language": "en"}'
        ).get_result()
print(json.dumps(classifier, indent=2))

{
  "created": "2019-02-13T20:46:38.552Z",
  "language": "en",
  "status": "Training",
  "name": "pretrained think classifier",
  "url": "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/befe0fx502-nlc-641",
  "classifier_id": "befe0fx502-nlc-641",
  "status_description": "The classifier instance is in its training phase, not yet ready to accept classify requests"
}


In [18]:
CLASSIFIER_ID = 'YOUR CLASSIFIER'

## PART 3 TENSORFLOW

In [15]:
#initializing placeholders and parameters allows TensorFlow to build a dataflow graph
def placeholders(n_x, n_y):
    X = tf.placeholder(tf.float32, shape=(n_x, None), name=None)
    Y = tf.placeholder(tf.float32, shape=(n_y, None), name=None)
    
    return X, Y

def init_parameters():
    
    W1 = tf.get_variable("W1", [1024,13358], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
    b1 = tf.get_variable("b1", [1024,1], initializer = tf.zeros_initializer())
    W2 = tf.get_variable("W2", [512,1024], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
    b2 = tf.get_variable("b2", [512,1], initializer = tf.zeros_initializer())
    W3 = tf.get_variable("W3", [64,512], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
    b3 = tf.get_variable("b3", [64,1], initializer = tf.zeros_initializer())
    W4 = tf.get_variable("W4", [4,64], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
    b4 = tf.get_variable("b4", [4,1], initializer = tf.zeros_initializer())

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3,
                  "W4": W4,
                  "b4": b4}
    
    return parameters

#data is fed through a neural network via forward propagation in order to form a prediction
def forward_prop(X, parameters):
    
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    W3 = parameters['W3']
    b3 = parameters['b3']
    W4 = parameters['W4']
    b4 = parameters['b4']
    
    Z1 = tf.add(tf.matmul(W1,X), b1)                      
    A1 = tf.nn.relu(Z1)                                   
    Z2 = tf.add(tf.matmul(W2,A1), b2)                   
    A2 = tf.nn.relu(Z2)                                   
    Z3 = tf.add(tf.matmul(W3,A2), b3)  
    A3 = tf.nn.relu(Z3)
    Z4 = tf.add(tf.matmul(W4,A3), b4)
    
    return Z4

#measure probability error in predictions in order to alter and optimize weights
def compute_cost(Z4, Y):
   
    logits = tf.transpose(Z4)
    labels = tf.transpose(Y)
   
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels))
    
    return cost

In [16]:
def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.0001,
          num_epochs = 200, print_cost = True):
    
    ops.reset_default_graph()                         
    (n_x, m) = X_train.shape                          
    n_y = Y_train.shape[0]                            
    costs = []                                        
    
    #put the above functions together, and optimize weights' value and minimize cost via Adam optimizer and save trained parameters
    X, Y = placeholders(n_x, n_y)
    parameters = init_parameters()
    Z4 = forward_prop(X, parameters)
    cost = compute_cost(Z4, Y)
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    with tf.Session() as sess:

        sess.run(init)
        for epoch in range(num_epochs):
            _ , epoch_cost = sess.run([optimizer, cost], feed_dict={X: X_train, Y: Y_train})
            if print_cost == True and epoch % 20 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            if print_cost == True and epoch % 5 == 0:
                costs.append(epoch_cost)
                
        parameters = sess.run(parameters)
        print ("Parameters have been trained!")

        correct_prediction = tf.equal(tf.argmax(Z4), tf.argmax(Y))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
        
        save_path = saver.save(sess, "params.ckpt",global_step=epoch)
        
        return parameters

In [17]:
parameters = model(x_train, y_train, x_test, y_test)

Cost after epoch 0: 1.743071
Cost after epoch 20: 0.161911
Cost after epoch 40: 0.066941
Cost after epoch 60: 0.042267
Cost after epoch 80: 0.030463
Cost after epoch 100: 0.022968
Cost after epoch 120: 0.017734
Cost after epoch 140: 0.013947
Cost after epoch 160: 0.011237
Cost after epoch 180: 0.009228
Parameters have been trained!
Train Accuracy: 0.998267


In [20]:
natural_language_classifier.get_classifier(CLASSIFIER_ID).get_result()

{'classifier_id': 'bef9eax500-nlc-99',
 'created': '2019-02-06T22:32:00.904Z',
 'language': 'en',
 'name': 'pretrained think classifier',
 'status': 'Available',
 'status_description': 'The classifier instance is now available and is ready to take classifier requests.',
 'url': 'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/bef9eax500-nlc-99'}

## PART 4 PREDICTIONS

In [21]:
watsonpred = []
for i in testset:
    x = natural_language_classifier.classify(CLASSIFIER_ID,re.sub(' +',' '," ".join(re.split(r'[^\w]', re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,i[0]))))[0:2048])
    watsonpred.append(x.get_result()['top_class'])

In [23]:
def compute_accuracy(pred, testdata):
    count = 0
    for i in range(len(pred)):
        if pred[i]==testdata[i][1]:
            count += 1
    return float(count)/len(pred)

In [24]:
with tf.Session() as sess:    
    X, Y = placeholders(x_test.shape[0], y_test.shape[0])
    saver = tf.train.import_meta_graph('params.ckpt-199.meta')
    saver.restore(sess,tf.train.latest_checkpoint('./'))
    parameters = {"W1": sess.run('W1:0'),
                  "b1": sess.run('b1:0'),
                  "W2": sess.run('W2:0'),
                  "b2": sess.run('b2:0'),
                  "W3": sess.run('W3:0'),
                  "b3": sess.run('b3:0'),
                  "W4": sess.run('W4:0'),
                  "b4": sess.run('b4:0')}
    Z4 = forward_prop(X, parameters)
    
    correct_prediction = tf.equal(tf.argmax(Z4), tf.argmax(Y))
    
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Test Accuracy Watson NLC API: " + str(compute_accuracy(watsonpred,testset)))
    print ("Test Accuracy TensorFlow:", accuracy.eval({X: x_test, Y: y_test}))

INFO:tensorflow:Restoring parameters from ./params.ckpt-199
Test Accuracy Watson NLC API: 0.9724137931034482
Test Accuracy TensorFlow: 0.855172
