## ODSC Workshop Part 1 - Data

In [None]:
import re
import pickle
import zipfile
import requests
import numpy as np
import tensorflow as tf

In [None]:
# Print the installed TensorFlow version. This workshop was written for
# TensorFlow 2.0.0, so check that the output matches before continuing.
print(tf.__version__)

#### Download examples of TensorFlow and PyTorch code in the cell below. One example is printed.

In [None]:
# Fetch the zipped .npz archive of code samples from the workshop repository.
url = 'https://github.com/PubChimps/ODSC-Europe-Workshop/blob/master/data/dlzip.npz.zip?raw=true'
r = requests.get(url)
# Fail loudly on an HTTP error instead of writing an error page to disk and
# hitting a confusing zip error further down.
r.raise_for_status()

# Context managers guarantee the download is flushed and closed before it is
# reopened as a zip archive, and that the archive handle is released afterwards.
with open('./dlzip.npz.zip', 'wb') as archive_file:
    archive_file.write(r.content)
with zipfile.ZipFile('./dlzip.npz.zip') as zippedfile:
    zippedfile.extractall()

# NOTE(review): allow_pickle=True unpickles objects from a downloaded file, which
# can execute arbitrary code -- only run this against a source you trust.
with np.load('dlzip.npz', allow_pickle=True) as npz_archive:
    # 'arr_0' is the name the original cell read from the archive.
    dataset = npz_archive['arr_0']

# Show one raw sample.
print(dataset[100])

#### Here we strip stopwords from the code and encode the labels. The previous example is printed again after cleaning.

In [None]:
# Work on a copy of the snippet column. dataset[:, 0] is a view into `dataset`,
# so cleaning it in place would overwrite the raw data and make a re-run of this
# cell clean already-cleaned text a second time.
code = dataset[:, 0].copy()
stopwords = ['tf', 'the', 'torch', 'keras', 'tensor', 'tensorflow', 'pytorch']
for i in range(len(code)):
    # Drop every single-character word.
    snippet = re.sub(r'\b\w{1,1}\b', '', code[i])
    # Stopwords are removed as plain substrings, in list order (so 'tensor' is
    # stripped before 'tensorflow' is tried). str.replace is a no-op when the
    # word is absent, so no membership check is needed.
    for word in stopwords:
        snippet = snippet.replace(word, '')
    code[i] = snippet

# Binary labels: 1 when the second column is 'tensorflow', 0 otherwise.
labels = [1 if example[1] == 'tensorflow' else 0 for example in dataset]

# Show the cleaned version of the sample printed earlier, with its label.
print(code[100],'\n',labels[100])

#### TensorFlow 2.0's Keras library has functions to easily tokenize and encode data for a neural network

In [None]:
# Value passed to the tokenizer's num_words argument.
NUM_WORDS = 54483
# Number of leading samples used for training; everything after is the test set.
TRAIN_SIZE = 9180

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(code)
# NOTE(review): the +1 presumably accounts for index 0, which Keras does not
# assign to any word -- confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
# Every sequence is padded (or cut) to this many tokens.
maxlen = 5000

code_train = code[:TRAIN_SIZE]
code_test = code[TRAIN_SIZE:]

# Convert each snippet into a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(code_train)
test_sequences = tokenizer.texts_to_sequences(code_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=maxlen)

# reshape(-1, 1) yields one label per row whatever the dataset size, instead of
# raising when the row counts differ from the hard-coded 9180 / 2295.
y_train = np.array(labels[:TRAIN_SIZE]).reshape(-1, 1)
y_test = np.array(labels[TRAIN_SIZE:]).reshape(-1, 1)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

##### Keras' tokenizer maps each word in the code samples to an integer index. In the example below, the first of the 20 values printed from the tokenized sample is 7, and the first word of that sample is 'import', which is mapped to 7 in `tokenizer.word_index`.

In [None]:
# Show the first 20 token indices of training sample 100, followed by its label.
print(X_train[100][:20],'\n',y_train[100])

In [None]:
# Print cleaned sample 100, then the integer indices the tokenizer assigned to
# the words 'import' and 'flow'.
# NOTE(review): 'flow' is presumably what remains of 'tensorflow' once the
# 'tensor' stopword has been stripped as a substring -- confirm.
print(code[100], '\n\n',tokenizer.word_index['import'], '\n',tokenizer.word_index['flow'])

##### Save the preprocessed data for part 2!

In [None]:
# Write the train/test arrays first, then the cleaned snippets and their labels,
# as .npy files in the working directory. A tuple of pairs keeps the write order
# explicit.
for filename, array in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
    ('code.npy', code),
    ('labels.npy', labels),
):
    np.save(filename, array)