In [None]:
import re
import sys
import json
import nltk
import base64
import github
import zipfile
import operator
import requests
import numpy as np
from keras import layers
from timeit import Timer
from collections import *
from keras.optimizers import SGD
from github import GithubException
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# NOTE(review): `tf` is not imported in the visible import cell, so on a fresh
# kernel this line raises NameError; it needs `import tensorflow as tf` first.
print(tf.__version__) #this project uses TensorFlow version 2.0.0-beta1

In [None]:
#file extensions that download_directory collects
targetlangs = ['.ipynb']
def parsenb(dataset):
    """Reduce each downloaded notebook to one cleaned string of its code cells.

    Parameters
    ----------
    dataset : sequence
        Rows where ``row[1]`` is the notebook's JSON text and ``row[2]`` is its
        label (the row layout that ``download_directory`` appends).

    Returns
    -------
    list
        One ``[code, label]`` pair per notebook that could be parsed. ``code``
        is the concatenated source of all code cells with every character other
        than ASCII letters, digits, spaces and dots replaced by a space.

    Notes
    -----
    Rows whose JSON cannot be decoded, or that have no ``cells`` entry, are
    reported with ``print`` and skipped. Previously such a row either raised
    NameError or silently reused the cells of the previous notebook.
    """
    parseddataset = []

    for i in range(len(dataset)):
        try:
            parsednb = json.loads(dataset[i][1])
            cells = parsednb['cells']
        except (ValueError, TypeError, KeyError):
            print(str(i) + ' ' + str(sys.exc_info()))
            continue

        # ''.join accepts both forms of 'source': a list of lines or one string
        code = ''.join(
            ''.join(cell.get('source', ''))
            for cell in cells
            if cell.get('cell_type') == 'code'
        )
        # newlines are outside the allowed set, so they become spaces as well
        code = re.sub(r'[^a-zA-Z0-9 .]', ' ', code)
        parseddataset.append([code, dataset[i][2]])

    return parseddataset

def download_directory(repository, path, framework):
    """Walk ``path`` of ``repository`` recursively and collect target files.

    Every file whose name ends with one of the extensions in the module-level
    ``targetlangs`` is base64-decoded as UTF-8 text and appended to the
    module-level ``dataset`` list as ``[repository, text, framework]``.

    Parameters
    ----------
    repository : object
        Object exposing ``get_contents(path)`` (a PyGithub repository in this
        notebook).
    path : str
        Directory inside the repository to start from; ``''`` is the root.
    framework : str
        Label stored with every file collected from this repository.

    Returns
    -------
    None
        Results are appended to ``dataset``. GitHub, I/O and decoding errors
        are printed and not re-raised.
    """
    global dataset
    try:
        contents = repository.get_contents(path)
        for content in contents:
            if content.type == 'dir':
                download_directory(repository, content.path, framework)
                continue
            # Filter on the name first. endswith also accepts names that
            # contain extra dots ("lesson.1.ipynb"), which the former
            # split-on-dot test rejected.
            if not str(content.name).endswith(tuple(targetlangs)):
                continue
            try:
                # content.content is read only for matching files
                # (presumably PyGithub fetches it lazily - TODO confirm)
                if not content.content:
                    continue
                text = str(base64.b64decode(content.content), 'utf-8')
            except (GithubException, IOError, ValueError) as exc:
                # ValueError covers invalid base64 and non-UTF-8 payloads
                print('Error processing %s: %s' % (content.path, exc))
                continue
            dataset.append([repository, text, framework])
    except (GithubException, IOError) as exc:
        print('error in dir %s: %s' % (path, exc))
        
def fleiss_kappa(lists, classes):
    """Return Fleiss' kappa for several raters labelling the same subjects.

    Parameters
    ----------
    lists : sequence of sequences
        One sequence of labels per rater; position ``j`` of every sequence
        refers to the same subject. The number of subjects is taken from the
        first rater.
    classes : list
        The possible labels. Each rating is located with ``classes.index``.

    Returns
    -------
    float
        ``(Pbar - PbarE) / (1 - PbarE)``, where ``Pbar`` is the mean
        per-subject agreement and ``PbarE`` the agreement expected by chance.
    """
    num_raters = len(lists)
    num_subjects = len(lists[0])
    num_classes = len(classes)

    # counts[s][c] = how many raters put subject s into class c
    counts = np.zeros([num_subjects, num_classes])
    for ratings in lists:
        for subject, label in enumerate(ratings):
            counts[subject][classes.index(label)] += 1

    # share of all assignments that went to each class
    class_share = np.sum(counts, axis=0) / (num_subjects * num_raters)
    # per-subject agreement among rater pairs
    agreement = (np.sum(counts * counts, axis=1) - num_raters) / (num_raters * (num_raters - 1))

    observed = np.sum(agreement) / num_subjects
    expected = np.sum(class_share * class_share)

    return (observed - expected) / (1 - expected)

## Part 1 - Collect Dataset

In [None]:
#authenticate github
#NOTE(review): replace the placeholders locally and never commit real credentials
login = '****'
password = '****'

g = github.Github(login,password)

#get list of repos referencing deep learning frameworks
tensorflowrepos = list(g.search_repositories('tensorflow language:"Jupyter Notebook" created:"2019-01-01 .. 2019-01-31"'))
pytorchrepos = list(g.search_repositories('pytorch language:"Jupyter Notebook" created:"2019-01-01 .. 2019-01-31"'))

#github limits the number of returns for a search call, 
#add to list of repos by making another call in a different 'created:' range
#tensorflowrepos += list(g.search_repositories('tensorflow language:"Jupyter Notebook" created:"2019-02-01 .. 2019-02-28"'))

#label repos: one [repository, framework] pair per search hit, pytorch hits first
repolist = []
for framework, found in (('pytorch', pytorchrepos), ('tensorflow', tensorflowrepos)):
    for repo in found:
        #full_name is 'owner/name'; read the attribute instead of parsing str(repo)
        repolist.append([g.get_repo(repo.full_name), framework])

#filled by download_directory
dataset = []

In [None]:
#download notebooks, check limit with g.get_rate_limit() and rerun when available
#`last` is the index of the repo currently being processed; to resume an
#interrupted run, set resume_from to the value `last` had when it stopped
resume_from = 0
for last in range(resume_from, len(repolist)):
    download_directory(repolist[last][0],'',repolist[last][1])

## Part 1* - Load Dataset


In [None]:
#if you'd prefer to load a dataset rather than build one, run this cell
url = 'https://github.com/PubChimps/dlclassifier/blob/master/dlzip.npz.zip?raw=true'
r = requests.get(url)
#stop here on an HTTP error instead of saving an error page as the archive
r.raise_for_status()
with open('./dlzip.npz.zip', 'wb') as archive_file:
    archive_file.write(r.content)
with zipfile.ZipFile('./dlzip.npz.zip') as zippedfile:
    zippedfile.extractall()
#NOTE(review): allow_pickle=True unpickles objects from the downloaded file,
#which can run arbitrary code - only load archives from a source you trust
with np.load('dlzip.npz', allow_pickle = True) as archive:
    dataset = archive['arr_0']

## Part 2 - Data Preprocessing

### 2.1 - Bag of Words 

In [None]:
words = []

labeledlines = []
#tokens dropped from the vocabulary (punctuation and the framework names themselves)
ignore_words = ['?', ',', 'tensorflow', 'tf', 'TensorFlow', 'pytorch']


#pass 1: tokenize every sample (line[0] is the code text, line[1] its label)
for line in dataset:
    text = str(re.split(r'[.,]', line[0])).replace("'","").replace('[','')
    #drop single-character words
    text = re.sub(r'\b\w{1,1}\b', '', text)
    w = nltk.word_tokenize(text)
    w = [ele for ele in w if ele not in ignore_words]
    words.extend(w)
    labeledlines.append([w, line[1]])

#sorted so every run produces the same feature order; a bare set iterates
#strings in an order that changes between interpreter sessions
words = sorted(set(words))

#pass 2: one count vector per sample
data = []
for line in range(len(dataset)):
    #this loop takes a while (>20 minutes)
    #track progress by uncommenting the line below and comparing it to len(dataset)
    #print(line)
    code = dataset[line][0]
    #NOTE(review): str.count counts substring occurrences in the raw text,
    #not whole-token occurrences
    bag = [code.count(w) for w in words]

    #binary target: 1 for tensorflow, 0 for anything else (pytorch)
    classes = [1] if labeledlines[line][1] == 'tensorflow' else [0]

    data.append([bag,classes])

#rows before this index are used for training, the remainder for testing
train_size = 9180
trainset = data[:train_size]
testset = data[train_size:]
x_train = np.array([row[0] for row in trainset])
y_train = np.array([row[1] for row in trainset])
x_test = np.array([row[0] for row in testset])
y_test = np.array([row[1] for row in testset])

## Part 3 - Train Models

In [None]:
#bag-of-words MLP; bound to `mlpmodel` because the evaluation cell at the end
#of the notebook refers to it by that name
mlpmodel = Sequential()
mlpmodel.add(layers.Dense(1024, activation='relu', input_dim= len(words)))
mlpmodel.add(layers.Dense(512, activation='relu'))
mlpmodel.add(layers.Dense(512, activation='relu'))
mlpmodel.add(layers.Dense(64, activation='relu'))
mlpmodel.add(layers.Dense(1, activation='sigmoid'))
#original name kept as an alias for any code that still uses it
model = mlpmodel

sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
mlpmodel.compile(loss='binary_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

mlpmodel.fit(x_train, y_train,
          batch_size=500,
          epochs=20,
          verbose=1,
          validation_data=(x_test, y_test))
#[loss, accuracy] on the test split
score = mlpmodel.evaluate(x_test, y_test, verbose=0)

### 2.2 - Embeddings

In [None]:
#copy the column: dataset[:,0] is a view, so cleaning it in place would also
#rewrite the text stored in dataset
code = dataset[:,0].copy()
#longest first, so 'tensorflow' is removed whole before 'tensor' can cut it
#down to 'flow'
stopwords = sorted(['tf', 'the', 'torch', 'keras', 'tensor', 'tensorflow'], key=len, reverse=True)
for i in range(len(code)):
    #drop single-character words
    code[i] = re.sub(r'\b\w{1,1}\b', '', code[i])
    #stopwords are removed as substrings, wherever they occur
    for word in stopwords:
        code[i] = code[i].replace(word,'')

#only the 10000 most frequent words are kept when texts are converted to sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(code)
#+1 because index 0 is reserved and never assigned to a word
vocab_size = len(tokenizer.word_index) + 1 
#every sequence is padded/truncated to this many tokens
maxlen = 1000

In [None]:
#preview the first 20 word -> index entries instead of dumping the whole vocabulary
dict(list(tokenizer.word_index.items())[:20])

In [None]:
#split the cleaned code at row 9180: everything before it trains, the rest tests
code_train, code_test = code[:9180], code[9180:]

#words -> integer ids, then pad (at the end) to a fixed length of maxlen
X_train = pad_sequences(tokenizer.texts_to_sequences(code_train),
                        padding='post', maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(code_test),
                       padding='post', maxlen=maxlen)

In [None]:
embedding_dim = 100

#embedding baseline: embed the tokens, flatten, one small hidden layer, sigmoid output
embmodel = Sequential()
for emb_layer in (
    layers.Embedding(input_dim=vocab_size,
                     output_dim=embedding_dim,
                     input_length=maxlen),
    layers.Flatten(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
):
    embmodel.add(emb_layer)

embmodel.compile(loss='binary_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy'])

embmodel.fit(X_train, y_train,
             batch_size=10,
             epochs=20,
             validation_data=(X_test, y_test),
             verbose=True)

#report accuracy on the training split, then on the test split
for report_format, inputs, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = embmodel.evaluate(inputs, targets, verbose=False)
    print(report_format.format(accuracy))

In [None]:
#LeNet-style 1D CNN on the embedded tokens:
#two (conv -> average pool) stages followed by dense layers of 120, 84 and 1 units
lenetmodel = Sequential()
lenetmodel.add(layers.Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                input_length=maxlen))

for n_filters in (6, 16):
    lenetmodel.add(layers.Conv1D(filters=n_filters, kernel_size=3, activation='relu'))
    lenetmodel.add(layers.AveragePooling1D())

lenetmodel.add(layers.Flatten())

for n_units in (120, 84):
    lenetmodel.add(layers.Dense(units=n_units, activation='relu'))

lenetmodel.add(layers.Dense(units=1, activation='sigmoid'))

lenetmodel.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

lenetmodel.fit(X_train, y_train,
               batch_size=10,
               epochs=20,
               validation_data=(X_test, y_test),
               verbose=True)

#report accuracy on the training split, then on the test split
for report_format, inputs, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = lenetmodel.evaluate(inputs, targets, verbose=False)
    print(report_format.format(accuracy))

In [None]:
#VGG-style 1D CNN on the embedded tokens: five stages of
#(conv, conv, max pool) with 64, 128, 256, 512 and 512 filters, then dense layers
vgg = Sequential()
vgg.add(layers.Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         input_length=maxlen))

for n_filters in (64, 128, 256, 512, 512):
    #two same-padded convolutions, then halve the sequence length
    for _ in range(2):
        vgg.add(layers.Conv1D(filters=n_filters, kernel_size=3,
                              activation='relu', padding='same'))
    vgg.add(layers.MaxPooling1D(pool_size=2, strides=2))

vgg.add(layers.Flatten())

for n_units in (120, 84):
    vgg.add(layers.Dense(units=n_units, activation='relu'))

vgg.add(layers.Dense(units=1, activation='sigmoid'))

vgg.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

vgg.fit(X_train, y_train,
        batch_size=10,
        epochs=20,
        validation_data=(X_test, y_test),
        verbose=True)

#report accuracy on the training split, then on the test split
for report_format, inputs, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = vgg.evaluate(inputs, targets, verbose=False)
    print(report_format.format(accuracy))

In [None]:
def report_model(name, model, train_x, train_y, test_x, test_y, time_label):
    """Print size, accuracies and one-epoch training time; return the results.

    Parameters
    ----------
    name : str
        Prefix for every printed line (e.g. ``'NN'``).
    model : compiled Keras model
        Model to evaluate; it is trained for one more epoch by the timing step.
    train_x, train_y, test_x, test_y :
        The splits the model is evaluated on.
    time_label : str
        Wording of the timing line, printed between ``name`` and the seconds.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy, test_predictions)``.

    Notes
    -----
    The timing step fits one extra epoch after the accuracies are measured, so
    the returned predictions come from a model trained one epoch longer than
    the one the printed accuracies describe.
    """
    t = Timer(lambda: model.fit(train_x, train_y,
                    epochs=1,
                    verbose=False,
                    validation_data=(test_x, test_y),
                    batch_size=10))

    print(name + ' number of parameters ' + str(model.count_params()))
    loss, train_accuracy = model.evaluate(train_x, train_y, verbose=False)
    print(name + " Training Accuracy: " + "{:.4f}".format(train_accuracy))
    loss, test_accuracy = model.evaluate(test_x, test_y, verbose=False)
    print(name + " Testing Accuracy:  " + "{:.4f}".format(test_accuracy))
    print(name + ' ' + time_label + ' ' + str(t.timeit(number=1)) + '\n\n')

    return train_accuracy, test_accuracy, model.predict_classes(test_x)


#NOTE(review): make sure the bag-of-words MLP built in Part 3 is bound to the
#name `mlpmodel` before running this cell
#the MLP uses the bag-of-words features (x_*), the other models the padded sequences (X_*)
mlpaccuracy, mlptestaccuracy, mlppreds = report_model(
    'NN', mlpmodel, x_train, y_train, x_test, y_test, 'epoch training time')
print()

emdtrainacc, emdtestacc, emdpreds = report_model(
    'Embedded Model', embmodel, X_train, y_train, X_test, y_test, 'epoch training time')

cnntrainaccuracy, cnntestaccuracy, textcnnpreds = report_model(
    'Text CNN', lenetmodel, X_train, y_train, X_test, y_test, 'training time')

vggtrainaccuracy, vggtestaccuracy, vggpreds = report_model(
    'Text VGG', vgg, X_train, y_train, X_test, y_test, 'training time')


#agreement between the four models' test-set predictions
preds = [mlppreds, emdpreds, textcnnpreds, vggpreds ]
print("Fleiss' Kappa " + str(fleiss_kappa(preds, [0,1])))

In [None]:
#agreement between the models' test-set predictions collected in `preds`
print("Fleiss' Kappa ", fleiss_kappa(preds, [0, 1]), sep='')