In [11]:
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import text_dataset_from_directory

In [3]:
# Root folder holding the `train/` and `test/` splits. The default is the original
# dev-container location; set the DATASETS_DIR environment variable to point the
# notebook at a different machine's copy without editing this cell.
data_root = os.environ.get('DATASETS_DIR', '/home/devcontainers/Datasets')
train_dir = os.path.join(data_root, 'train')
test_dir = os.path.join(data_root, 'test')
# List the entries of the training folder (one sub-directory per class).
os.listdir(train_dir)

['csharp', 'python', 'javascript', 'java']

In [14]:
# So, we have 4 classes. Let's look at a few samples from the 'python' class.

python_dir = os.path.join(train_dir, 'python')
# Only the first five entries returned by the directory listing are previewed.
python_txt = os.listdir(python_dir)[:5]
for txt in python_txt:
    # Decode explicitly as UTF-8 so the preview does not depend on the
    # platform's default locale encoding.
    with open(os.path.join(python_dir, txt), 'r', encoding='utf-8') as f:
        print(f.read())

"blank - install modules from source i downloaded a repo from github and installed it in a virtualenv using $blank setup.py install in the project dir...this works fine. now, when i open a file and induce an error, (like changing return to lol) it still installs fine. what is going on ? i wished to assert that my changes do not break the module and was hoping to install each time i make a change."

"how do i run an executable, then continue after it stops? what i'm wanting to do is save files written by the executable i'm running to a dropbox folder. the files are saved at random times, so i can't use a timer...the only ways i know of executing files are non-blocking, and when i run the functions, they simply just execute and continue (for example, using subprocess to run a start command with the argument being the executable)...i can't modify the executable, i can only run it...here's what i've tried so far: i have looked up the question and haven't found anything. the only way i know

In [57]:
# Hyperparameters shared by the input pipeline and the model.

# -- dataset loading --
seed = 42            # seed passed to the train/validation split
batch_size = 32      # examples per batch produced by the dataset loader

# -- text vectorization / embedding --
vocab_size = 10000   # maximum number of tokens kept in the vocabulary
maxLen = 300         # every vectorized text is padded/truncated to this length
embedding_dim = 50   # size of each learned token embedding

In [58]:
# Options used by every split: batch size and one-hot ('categorical') labels.
common_args = dict(batch_size=batch_size, label_mode='categorical')
# Training and validation are both carved out of train_dir; sharing the seed and
# the 0.2 fraction keeps the two subsets complementary.
split_args = dict(validation_split=0.2, seed=seed)

train_ds_raw = text_dataset_from_directory(train_dir, subset='training', **split_args, **common_args)
val_ds_raw = text_dataset_from_directory(train_dir, subset='validation', **split_args, **common_args)
test_ds_raw = text_dataset_from_directory(test_dir, **common_args)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


In [59]:
# The raw text contains HTML line-break tags and punctuation that contribute little
# to the vocabulary, so both are removed before tokenization.

def standardize_texts(data):
    """Normalize raw text for the vectorization layer.

    Steps, in order: lowercase `data`, replace each '<br />' with a single
    space, then delete every character listed in `string.punctuation`.
    Returns the result of the final `tf.strings.regex_replace` call.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

# Maps text to fixed-length sequences of integer token ids.
vectorize_layer = layers.TextVectorization(
    standardize=standardize_texts,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=maxLen,
)

In [60]:
# Fit the vectorization layer's vocabulary on the training texts only
# (the labels are dropped from each (text, label) pair first).
train_texts = train_ds_raw.map(lambda text, _label: text)
vectorize_layer.adapt(train_texts)

def vectorize_text(text, label):
    """Turn a (text, label) element into (token ids, label).

    A trailing axis is added to `text` before it is passed through
    `vectorize_layer`; `label` is returned unchanged.
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# Run every split through the same fitted vectorization layer.
train_ds = train_ds_raw.map(vectorize_text)
val_ds = val_ds_raw.map(vectorize_text)
test_ds = test_ds_raw.map(vectorize_text)

In [49]:
# Spot-check a few vocabulary entries. The vocabulary is fetched once and reused:
# every get_vocabulary() call builds the complete token list again.
vocabulary = vectorize_layer.get_vocabulary()
for index in (9, 99, 999, 9999):
    print(vocabulary[index])

of
user
taking
essential


In [61]:
# Let tf.data pick the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Cache each vectorized split, then prefetch so that input preparation
# can overlap with model execution.
train_ds, val_ds, test_ds = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

In [62]:
# Bag-of-embeddings classifier: embed the token ids, average over the sequence,
# then classify with a small dense head.
# The input tensor is named `inputs` (not `input`) so that Python's builtin
# `input()` is not shadowed for the rest of the kernel session.
inputs = layers.Input(shape=(maxLen,))
emb = layers.Embedding(vocab_size, embedding_dim)(inputs)
x = layers.Dropout(0.2)(emb)
# Collapse the sequence axis by averaging the token embeddings.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(256, activation='relu')(x)
# Four softmax units, one per class found in the dataset directories.
output = layers.Dense(4, activation='softmax')(x)
model = tf.keras.Model(inputs, output)

In [63]:
# Display a summary of the assembled model before training.
model.summary()

In [64]:
# Categorical cross-entropy and categorical accuracy match the one-hot labels
# produced with label_mode='categorical'.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.metrics.CategoricalAccuracy()],
)

# Save the model only when the validation loss reaches a new minimum.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'text_classifier_stack_overflow.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# `batch_size` is deliberately not passed: train_ds is a tf.data.Dataset that is
# already batched when it is created, and Keras documents that batch_size must
# not be specified for dataset inputs (tf.keras 2.x raises ValueError if it is).
history = model.fit(train_ds, epochs=10, validation_data=val_ds, callbacks=[checkpoint])

Epoch 1/10





[1m195/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - categorical_accuracy: 0.2878 - loss: 1.3801
Epoch 1: val_loss improved from inf to 1.25539, saving model to text_classifier_stack_overflow.h5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - categorical_accuracy: 0.2886 - loss: 1.3795 - val_categorical_accuracy: 0.3919 - val_loss: 1.2554
Epoch 2/10
[1m197/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.4689 - loss: 1.1874
Epoch 2: val_loss improved from 1.25539 to 0.99912, saving model to text_classifier_stack_overflow.h5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - categorical_accuracy: 0.4698 - loss: 1.1858 - val_categorical_accuracy: 0.5294 - val_loss: 0.9991
Epoch 3/10
[1m196/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.6054 - loss: 0.9169
Epoch 3: val_loss improved from 0.99912 to 0.83476, saving model to text_classifier_stack_overflow.h5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.6062 - loss: 0.9156 - val_categorical_accuracy: 0.6269 - val_loss: 0.8348
Epoch 4/10
[1m199/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.6900 - loss: 0.7603
Epoch 4: val_loss improved from 0.83476 to 0.79109, saving model to text_classifier_stack_overflow.h5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.6901 - loss: 0.7600 - val_categorical_accuracy: 0.6612 - val_loss: 0.7911
Epoch 5/10
[1m199/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - categorical_accuracy: 0.7271 - loss: 0.6715
Epoch 5: val_loss improved from 0.79109 to 0.72878, saving model to text_classifier_stack_overflow.h5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.7273 - loss: 0.6712 - val_categorical_accuracy: 0.6931 - val_loss: 0.7288
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.7554 - loss: 0.6094
Epoch 6: val_loss did not improve from 0.72878
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.7555 - loss: 0.6093 - val_categorical_accuracy: 0.6963 - val_loss: 0.7511
Epoch 7/10
[1m198/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.7814 - loss: 0.5580
Epoch 7: val_loss improved from 0.72878 to 0.61963, saving model to text_classifier_stack_overflow.h5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - categorical_accuracy: 0.7816 - loss: 0.5575 - val_categorical_accuracy: 0.7462 - val_loss: 0.6196
Epoch 8/10
[1m196/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.8132 - loss: 0.4902
Epoch 8: val_loss did not improve from 0.61963
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.8134 - loss: 0.4897 - val_categorical_accuracy: 0.7525 - val_loss: 0.6328
Epoch 9/10
[1m197/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - categorical_accuracy: 0.8292 - loss: 0.4527
Epoch 9: val_loss improved from 0.61963 to 0.58766, saving model to text_classifier_stack_overflow.h5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - categorical_accuracy: 0.8295 - loss: 0.4521 - val_categorical_accuracy: 0.7725 - val_loss: 0.5877
Epoch 10/10
[1m194/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - categorical_accuracy: 0.8462 - loss: 0.4029
Epoch 10: val_loss did not improve from 0.58766
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.8466 - loss: 0.4020 - val_categorical_accuracy: 0.7744 - val_loss: 0.6046


In [65]:
# Score the in-memory model on the held-out test split.
# NOTE(review): this uses the weights left after the last training epoch; the
# best-val_loss checkpoint written to disk during training is not reloaded here.
loss, accuracy = model.evaluate(test_ds)
# One value per line: loss first, then categorical accuracy.
print(loss, accuracy, sep='\n')

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - categorical_accuracy: 0.7452 - loss: 0.6723
0.6696485280990601
0.7484999895095825
