In [667]:
import numpy as np
import json
import sys
import re
from collections import *
import operator
import nltk

In [None]:
def parsenb(dataset):
    parseddataset = []
   
    for i in dataset:
        code = ''
        try:
            parsednb = (json.loads(i[1]))
        except:
            print(sys.exc_info())
        for j in range(len(parsednb['cells'])):
            if parsednb['cells'][j]['cell_type'] == 'code':
                code = code + ''.join(parsednb['cells'][j]['source'])
        parseddataset.append([code, i[2]])
    
    for i in range(len(parseddataset)):
        parseddataset[i][0] = re.sub('[^a-zA-Z0-9 \n\.]', ' ', parseddataset[i][0]).replace('\n', ' ')
    return parseddataset

In [731]:
def bayes_train(pldict, samples):
    plprobs = {}
    counts = Counter()
    for i in pldict.keys():
        plprobs[i] = float(len(pldict[i]))/samples
        
    plwordprobs = {}
    plwordcounts = {}
    for pl in pldict.keys():
        plwordprobs[pl] = {}
        plwordcounts[pl] = 0
    
    for pl in pldict.keys():
        for i in pldict[pl]:
            counts.update(filter(None, re.split(r'[^\w]', re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,i))))
            for word in counts.keys():
                if word not in plwordprobs[pl]:
                    plwordprobs[pl][word] = counts[word]
                else:
                    plwordprobs[pl][word] += counts[word]
                plwordcounts[pl] += counts[word]
            plwordcount = 0
            counts = Counter()
    for pl in plwordprobs.keys():   
        for word in plwordprobs[pl]:
            plwordprobs[pl][word] = float(plwordprobs[pl][word])/plwordcounts[pl]
        
    return plprobs, plwordprobs
    
def bayes_test(testdata,plprob,plwordprob):
    Ypred = []

    for row in testdata:
        testcounter = Counter()
        testcounter.update(filter(None, re.split(r'[^\w]', re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,str(row[0])))))

        prob = {}
        for key in plprob.keys():
            prob[key] = 0
        for key in prob.keys():
            for i in testcounter:
                if i not in plwordprobs[key]:
                    plwordprob[key][i] = 1e-4
                else:
                    plwordprob[key][i] += 1e-4
                prob[key] += testcounter[i]*np.log(plwordprob[key][i])
            prob[key] += np.log(plprob[key])
        Ypred.append(max(prob.items(), key=operator.itemgetter(1))[0])
    
    return Ypred

In [468]:
def initialize_parameters_deep(layer_dims):
    
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)            # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * 0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

        
    return parameters

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def forward_propagation(X, parameters):
    A = X

    cache = {}
    for i in range(1,len(params)//2):
        Aprev = A
        Z = np.dot(parameters['W' + str(i)], Aprev) + parameters['b' + str(i)]
        A = np.tanh(Z)
        cache['Z' + str(i)] = Z
        cache['A' + str(i)] = A
        
    ZL = np.dot(parameters['W' + str(len(params)//2)], A) + parameters['b' + str(len(params)//2)]
    AL = softmax(ZL)
    cache['Z' + str(len(params)//2)] = ZL
    cache['A' + str(len(params)//2)] = AL
    
    return AL, cache

def cross_entropy(A,Y):
    m = Y.shape[1]
    logprobs = np.multiply(np.log(A), Y) + np.multiply((1 - Y), np.log(1 - A))
    cost = - np.sum(logprobs) / m 
    
    return np.squeeze(cost)

def backward_propagation(parameters, cache, X, Y):
    
    grads = {}
    
    cache['A0'] = X
    m = X.shape[1]
    
    dZ = cache['A' + str(len(params)//2)] - Y
    dWL = (1 / m) * np.dot(dZ, cache['A' + str(len(params)//2 - 1)].T)
    dbL = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    grads['dW' + str(len(params)//2)] = dWL
    grads['db' + str(len(params)//2)] = dbL
    
    for i in reversed(range(len(params)//2)[1::]):
        dZprev = dZ
        dZ = np.multiply(np.dot(parameters['W' + str(i+1)].T, dZprev), 1 - np.power(cache['A' + str(i)], 2))
        dW = (1 / m) * np.dot(dZ, cache['A' + str(i-1)].T)
        db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
        
        grads['dW' + str(i)] = dW
        grads['db' + str(i)] = db
        
    return grads

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
      
    return parameters

In [745]:
def compute_accuracy(pred, testdata):
    count = 0
    for i in range(len(pred)):
        if pred[i]==testdata[i][1]:
            count += 1
    return float(count)/len(pred)


def entropy(predlist):
    ent = 0
    if predlist.count('aws') != 0:
        ent -= predlist.count('aws')/len(predlist)*np.log2(predlist.count('aws')/len(predlist))
    if predlist.count('ibm') != 0:
        ent -= predlist.count('ibm')/len(predlist)*np.log2(predlist.count('ibm')/len(predlist))
    if predlist.count('ms') != 0:
        ent -= predlist.count('ms')/len(predlist)*np.log2(predlist.count('ms')/len(predlist))
    return ent

def caticc(lista, listb, listc):
    diff = []
    for i in range(len(lista)):
        if lista[i] == listb[i] == listc[i]:
            diff.append(0)
        elif lista[i] == listb[i] or listb[i] == listc[i] or lista[i] == listc[i]:
            diff.append(1)
        else:
            diff.append(4)
    
    return np.sum(diff)/(np.sum(diff) + entropy(lista) + entropy(listb) + entropy(listc))

In [290]:
dataset2 = np.load('cloudata.npy')
dataset2 = parsenb(dataset2)
np.random.shuffle(dataset2)
trainset2, testset2 = np.array(dataset2)[:int(len(dataset2)*.8),:], np.array(dataset2)[int(len(dataset2)*.8):,:]

(<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 2 column 1 (char 1)',), <traceback object at 0x1a3cd9c188>)


In [738]:
words2 = []

labeledlines2 = []
ignore_words = ['?', ',']

for line in trainset2:
    text = str(re.split(r'[.,]', line[0])).replace("'","").replace('[','')
    text = re.sub(r'\b\w{1,1}\b', '', text)
    w = nltk.word_tokenize(text)
    w = [ele for ele in w if ele not in ignore_words]
    words2.extend(w)
    labeledlines2.append([w, line[1]])
    
testlines2 = []

for line in testset2:
    text = str(re.split(r'[.,]', line[0])).replace("'","").replace('[','')
    text = re.sub(r'\b\w{1,1}\b', '', text)
    w = nltk.word_tokenize(text)
    w = [ele for ele in w if ele not in ignore_words]
    words2.extend(w)
    testlines2.append([w, line[1]])
    

words2 = list(set(words2))



In [581]:
data2 = []

for line in labeledlines2:
    bag = []
    code = line[0]
    for w in words2:
        bag.append(code.count(w)) 
        
    classes = [0,0,0]
    if line[1] == 'aws':
        classes[0] = 1
    elif line[1] == 'watson':
        classes[1] = 1
    elif line[1] == 'azure':
        classes[2] = 1

    data2.append([bag,classes])
    
testdata2 = []
for line in testlines2:
    bag = []
    code = line[0]
    for w in words2:
        bag.append(code.count(w)) 
        
    classes = [0,0,0]
    if line[1] == 'aws':
        classes[0] = 1
    elif line[1] == 'watson':
        classes[1] = 1
    elif line[1] == 'azure':
        classes[2] = 1

    testdata2.append([bag,classes])

In [598]:
x_train = np.array([row[0] for row in data2]).T
y_train = np.array([row[1] for row in data2]).T
x_test = np.array([row[0] for row in testdata2]).T
y_test = np.array([row[1] for row in testdata2]).T

In [681]:
pls = {}
for row in range(len(trainset2)):
    if trainset2[row][1] not in pls:
        pls[trainset2[row][1]] = []
    pls[trainset2[row][1]].append(trainset2[row][0])

plprobs, plwordprobs = bayes_train(pls, len(trainset2))

In [696]:
plrobs

{'aws': 0.6862068965517242,
 'azure': 0.15689655172413794,
 'watson': 0.15689655172413794}

In [272]:
sorted(plwordprobs['watson'].items(), key=operator.itemgetter(1) ,reverse=True)[:25]


[('tf', 0.027677463818056514),
 ('df', 0.02263266712611992),
 ('train', 0.01386629910406616),
 ('data', 0.012598208132322536),
 ('import', 0.011936595451412818),
 ('test', 0.011853893866299104),
 ('print', 0.01155065472088215),
 ('in', 0.009317711922811854),
 ('for', 0.009235010337698139),
 ('the', 0.008766368022053756),
 ('model', 0.008656099241902136),
 ('cost', 0.008435561681598897),
 ('sess', 0.00647829083390765),
 ('to', 0.00647829083390765),
 ('from', 0.006340454858718126),
 ('shape', 0.005403170227429359),
 ('run', 0.005375603032391455),
 ('as', 0.005265334252239835),
 ('get', 0.005155065472088215),
 ('batch', 0.004824259131633356),
 ('if', 0.004796691936595451),
 ('and', 0.004769124741557547),
 ('self', 0.0047139903514817364),
 ('values', 0.004631288766368022),
 ('random', 0.004383184011026878)]

In [742]:
bayespred = bayes_test(plprobs,plwordprobs,testset2)

In [618]:
params = initialize_parameters_deep([len(words2),1024,512,64,3])

for i in range(0, 10000):
    AL, cache = forward_propagation_L(x_train, params)
    cost = cross_entropy(AL, y_train)
    grads = backward_propagation_L(params, cache, x_train, y_train)
    params = update_parameters(params, grads, .1)
    
    if cost and i % 100 == 0 and i < 1000 or i % 1000 == 0:
        print ("Cost after iteration %i: %f" % (i, cost))

Cost after iteration 0: 1.909213
Cost after iteration 100: 0.488878
Cost after iteration 200: 0.210986
Cost after iteration 300: 0.123227
Cost after iteration 400: 0.102383
Cost after iteration 500: 0.091373
Cost after iteration 600: 0.083705
Cost after iteration 700: 0.077960
Cost after iteration 800: 0.073648
Cost after iteration 900: 0.070461
Cost after iteration 1000: 0.068141
Cost after iteration 2000: 0.062132
Cost after iteration 3000: 0.061410
Cost after iteration 4000: 0.061163
Cost after iteration 5000: 0.061042
Cost after iteration 6000: 0.060970
Cost after iteration 7000: 0.060924
Cost after iteration 8000: 0.060891
Cost after iteration 9000: 0.060866


In [None]:
AL, cache = forward_propagation_L(x_train, params)
nnpred = []

for i in AL.T:
    if np.argmax(i) == 0:
        nnpred.append('aws')
    if np.argmax(i) == 1:
        nnpred.append('watson')
    if np.argmax(i) == 2:
        nnpred.append('azure')

In [329]:
from watson_developer_cloud import NaturalLanguageClassifierV1

natural_language_classifier = NaturalLanguageClassifierV1(
    username="YOUR USERNAME",
    password="YOUR PASSWORD")


In [391]:
d = []
for i in labeledlines2:
    for j in re.findall('.{1,1024}', str(i[0]).replace(',', ' ').replace("'",'').replace('[','').replace(']','')):
        d.append({'text': j, 'cloud': i[1]})
        
df = pd.DataFrame(d, columns = ['text', 'cloud'])
df['text'].replace(' ', np.nan, inplace=True)
df = df.dropna()
df.to_csv('newcloudtraindata.csv', header=['text','cloud'],index=False)

In [397]:
with open('newcloudtraindata.csv', 'rb') as training_data:
    natural_language_classifier.create_classifier(training_data=training_data, metadata='{"name": "new Cloud Classifier","language": "en"}')

In [None]:
natural_language_classifier.list_classifiers() #get your classifier

In [403]:
natural_language_classifier.get_classifier('YOUR CLASSIFIER ID').get_result()

{'classifier_id': 'f33041x451-nlc-2892',
 'created': '2018-10-15T23:09:57.475Z',
 'language': 'en',
 'name': 'new Cloud Classifier',
 'status': 'Available',
 'status_description': 'The classifier instance is now available and is ready to take classifier requests.',
 'url': 'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/f33041x451-nlc-2892'}

In [574]:
watsonpred = []
for i in testset2:
    x = natural_language_classifier.classify(,re.sub(' +',' '," ".join(re.split(r'[^\w]', re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,i[0]))))[0:1024])
    watsonpred.append(x.get_result()['top_class'])

In [747]:
print(compute_accuracy(bayespred,testset2))
print(compute_accuracy(nnpred,testset2))
print(compute_accuracy(watsonpred, testset2))

0.8344827586206897
0.8896551724137931
0.9448275862068966


In [627]:
caticc(nnpred,predictions,watsonpred)

0.9643151120180458