In [1]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import metrics
import string 
import spacy
import requests
import json

  from .autonotebook import tqdm as notebook_tqdm


## Custom word embedding using Transformers

In [2]:
import tensorflow as tf

In [3]:
df = pd.read_csv('Database_C.csv')
df

Unnamed: 0,code,error_check
0,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
1,"#include <stdio.h>\nint main(){\n int n,i,b...",0
2,#include <stdio.h>\n#include <stdlib.h>\n#incl...,0
3,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
4,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
...,...,...
53473,#include<stdio.h>\n#include<stdlib.h>\nint mai...,1
53474,#include<stdio.h>\n#include<stdlib.h>\nint mai...,1
53475,"#include<stdio.h>\nint main()\n{\n int n,k,...",1
53476,"#include<stdio.h>\nint main()\n{\n int n,k;...",1


In [4]:
def get_training_corpus():
    dataset = df[df['error_check']!=1]
    for start_idx in range(0, len(dataset), 300):
        samples = dataset[start_idx : start_idx + 300]
        yield samples["code"]

In [5]:
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [6]:
example = """#include <stdio.h>
#include <stdlib.h>
int main() {
	int i,siz1,siz2;
	scanf(""%d"",siz1);
	scanf(""%d"",siz2);
	char ch1[siz1];
	char ch2[siz2];
	for(i=0;i<siz1;i++)
	ch1[i]= getchar();
	for(i=0;i<siz1;i++)
	putchar(ch1[i]);
	return 0;
}"""

tokens = old_tokenizer.tokenize(example)
tokens

['#',
 'include',
 'Ġ<',
 'std',
 'io',
 '.',
 'h',
 '>',
 'Ċ',
 '#',
 'include',
 'Ġ<',
 'std',
 'lib',
 '.',
 'h',
 '>',
 'Ċ',
 'int',
 'Ġmain',
 '()',
 'Ġ{',
 'Ċ',
 'ĉ',
 'int',
 'Ġi',
 ',',
 's',
 'iz',
 '1',
 ',',
 's',
 'iz',
 '2',
 ';',
 'Ċ',
 'ĉ',
 'scan',
 'f',
 '("',
 '"',
 '%',
 'd',
 '"',
 '",',
 's',
 'iz',
 '1',
 ');',
 'Ċ',
 'ĉ',
 'scan',
 'f',
 '("',
 '"',
 '%',
 'd',
 '"',
 '",',
 's',
 'iz',
 '2',
 ');',
 'Ċ',
 'ĉ',
 'char',
 'Ġch',
 '1',
 '[',
 's',
 'iz',
 '1',
 '];',
 'Ċ',
 'ĉ',
 'char',
 'Ġch',
 '2',
 '[',
 's',
 'iz',
 '2',
 '];',
 'Ċ',
 'ĉ',
 'for',
 '(',
 'i',
 '=',
 '0',
 ';',
 'i',
 '<',
 's',
 'iz',
 '1',
 ';',
 'i',
 '++)',
 'Ċ',
 'ĉ',
 'ch',
 '1',
 '[',
 'i',
 ']=',
 'Ġget',
 'char',
 '();',
 'Ċ',
 'ĉ',
 'for',
 '(',
 'i',
 '=',
 '0',
 ';',
 'i',
 '<',
 's',
 'iz',
 '1',
 ';',
 'i',
 '++)',
 'Ċ',
 'ĉ',
 'put',
 'char',
 '(',
 'ch',
 '1',
 '[',
 'i',
 ']);',
 'Ċ',
 'ĉ',
 'return',
 'Ġ0',
 ';',
 'Ċ',
 '}']

In [7]:
training_corpus = get_training_corpus()

In [8]:
for x in training_corpus:
    print(x)

0      #include <stdio.h>\n#include <stdlib.h>\nint m...
1      #include <stdio.h>\nint main(){\n    int n,i,b...
2      #include <stdio.h>\n#include <stdlib.h>\n#incl...
3      #include <stdio.h>\n#include <stdlib.h>\nint m...
4      #include <stdio.h>\n#include <stdlib.h>\nint m...
                             ...                        
295    #include <stdio.h>\n#include <stdlib.h>\nint m...
296    #include <stdio.h>\nint main()\n{\n    int i,n...
297    #include <stdio.h>\nint main()\n{\n    int n ,...
298    #include <stdio.h>\n#include <stdlib.h>\nint d...
299    #include <stdio.h>\n#include <stdlib.h>\nint m...
Name: code, Length: 300, dtype: object
300    #include<stdio.h>\nint arr[100];\nint getInver...
301    #include <stdio.h>\r\nint main()\r\n{\r\n    i...
302    #include <stdio.h>\nint main()\n{\n    int h;\...
303    #include <stdio.h>\nint main()\n{\n    int n, ...
304    #include <stdio.h>\nint arr[100]={0};\nint n,k...
                             ...                 

In [9]:
for x in training_corpus:
    print(x)

In [10]:
# after printing training corpus data will be erased we run same code 
#two time once it will be displaced and another time it will be erased
# so we should declare training corpus again


In [11]:
training_corpus = get_training_corpus()

In [12]:
new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 46499)

In [13]:
new_tokenizer.save_pretrained("shirshak-tokenizer")

('shirshak-tokenizer\\tokenizer_config.json',
 'shirshak-tokenizer\\special_tokens_map.json',
 'shirshak-tokenizer\\vocab.json',
 'shirshak-tokenizer\\merges.txt',
 'shirshak-tokenizer\\added_tokens.json',
 'shirshak-tokenizer\\tokenizer.json')

In [14]:
s_tokenizer = AutoTokenizer.from_pretrained("shirshak-tokenizer")

In [15]:
s_tokenizer

PreTrainedTokenizerFast(name_or_path='shirshak-tokenizer', vocab_size=11669, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [18]:
tokens = s_tokenizer.tokenize(example)
tokens

['#',
 'include',
 'Ġ<',
 'stdio',
 '.',
 'h',
 '>',
 'Ċ',
 '#',
 'include',
 'Ġ<',
 'stdlib',
 '.',
 'h',
 '>',
 'Ċ',
 'int',
 'Ġmain',
 '()',
 'Ġ{',
 'Ċ',
 'ĉ',
 'int',
 'Ġi',
 ',',
 'siz',
 '1',
 ',',
 'siz',
 '2',
 ';',
 'Ċ',
 'ĉ',
 'scanf',
 '("',
 '"%',
 'd',
 '"",',
 'siz',
 '1',
 ');',
 'Ċ',
 'ĉ',
 'scanf',
 '("',
 '"%',
 'd',
 '"",',
 'siz',
 '2',
 ');',
 'Ċ',
 'ĉ',
 'char',
 'Ġch',
 '1',
 '[',
 'siz',
 '1',
 '];',
 'Ċ',
 'ĉ',
 'char',
 'Ġch',
 '2',
 '[',
 'siz',
 '2',
 '];',
 'Ċ',
 'ĉ',
 'for',
 '(',
 'i',
 '=',
 '0',
 ';',
 'i',
 '<',
 'siz',
 '1',
 ';',
 'i',
 '++)',
 'Ċ',
 'ĉ',
 'ch',
 '1',
 '[',
 'i',
 ']=',
 'Ġgetchar',
 '();',
 'Ċ',
 'ĉ',
 'for',
 '(',
 'i',
 '=',
 '0',
 ';',
 'i',
 '<',
 'siz',
 '1',
 ';',
 'i',
 '++)',
 'Ċ',
 'ĉ',
 'putchar',
 '(',
 'ch',
 '1',
 '[',
 'i',
 ']);',
 'Ċ',
 'ĉ',
 'return',
 'Ġ0',
 ';',
 'Ċ',
 '}']

In [19]:
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

124
142


In [20]:
tokenized = s_tokenizer.tokenize(example)
tokenized

['#',
 'include',
 'Ġ<',
 'stdio',
 '.',
 'h',
 '>',
 'Ċ',
 '#',
 'include',
 'Ġ<',
 'stdlib',
 '.',
 'h',
 '>',
 'Ċ',
 'int',
 'Ġmain',
 '()',
 'Ġ{',
 'Ċ',
 'ĉ',
 'int',
 'Ġi',
 ',',
 'siz',
 '1',
 ',',
 'siz',
 '2',
 ';',
 'Ċ',
 'ĉ',
 'scanf',
 '("',
 '"%',
 'd',
 '"",',
 'siz',
 '1',
 ');',
 'Ċ',
 'ĉ',
 'scanf',
 '("',
 '"%',
 'd',
 '"",',
 'siz',
 '2',
 ');',
 'Ċ',
 'ĉ',
 'char',
 'Ġch',
 '1',
 '[',
 'siz',
 '1',
 '];',
 'Ċ',
 'ĉ',
 'char',
 'Ġch',
 '2',
 '[',
 'siz',
 '2',
 '];',
 'Ċ',
 'ĉ',
 'for',
 '(',
 'i',
 '=',
 '0',
 ';',
 'i',
 '<',
 'siz',
 '1',
 ';',
 'i',
 '++)',
 'Ċ',
 'ĉ',
 'ch',
 '1',
 '[',
 'i',
 ']=',
 'Ġgetchar',
 '();',
 'Ċ',
 'ĉ',
 'for',
 '(',
 'i',
 '=',
 '0',
 ';',
 'i',
 '<',
 'siz',
 '1',
 ';',
 'i',
 '++)',
 'Ċ',
 'ĉ',
 'putchar',
 '(',
 'ch',
 '1',
 '[',
 'i',
 ']);',
 'Ċ',
 'ĉ',
 'return',
 'Ġ0',
 ';',
 'Ċ',
 '}']

In [21]:
df_0 = df[df['error_check']!=1]
df_0

Unnamed: 0,code,error_check
0,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
1,"#include <stdio.h>\nint main(){\n int n,i,b...",0
2,#include <stdio.h>\n#include <stdlib.h>\n#incl...,0
3,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
4,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
...,...,...
46495,"#include <stdio.h>\nint main()\n{\n int n, ...",0
46496,#include <stdio.h>\n#include <stdlib.h>\nint B...,0
46497,#include<stdio.h>\nint arr[100];\nint max;\nin...,0
46498,#include <stdio.h>\nint a[100000];\nvoid swap(...,0


In [22]:
df_1 = df[df['error_check']!=0]
df_1

Unnamed: 0,code,error_check
46500,"#include <stdio.h>\nint main(){\n int k,n;\...",1
46501,"#include <stdio.h>\nint main()\n{\n int k,n...",1
46502,"#include <stdio.h>\nint main(){\n int k,n,i...",1
46503,"#include <stdio.h>\nint main(){\nint i,k,n,a;\...",1
46504,"#include <stdio.h>\nint main(){int i,c,b,d=0,a...",1
...,...,...
53473,#include<stdio.h>\n#include<stdlib.h>\nint mai...,1
53474,#include<stdio.h>\n#include<stdlib.h>\nint mai...,1
53475,"#include<stdio.h>\nint main()\n{\n int n,k,...",1
53476,"#include<stdio.h>\nint main()\n{\n int n,k;...",1


In [27]:
df_0.drop(index=df_0.index[100:], axis=0, inplace=True)
df_0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_0.drop(index=df_0.index[100:], axis=0, inplace=True)


Unnamed: 0,code,error_check
0,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
1,"#include <stdio.h>\nint main(){\n int n,i,b...",0
2,#include <stdio.h>\n#include <stdlib.h>\n#incl...,0
3,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
4,#include <stdio.h>\n#include <stdlib.h>\nint m...,0
...,...,...
95,#include <stdio.h>\nint arr[100]={0};\nint n;\...,0
96,"#include<stdio.h>\nint main()\n{int e,f,g;\nsc...",0
97,#include <stdio.h>\r\nint main()\r\n{\r\n i...,0
98,#include <stdio.h>\n#include <stdlib.h>\nint m...,0


In [28]:
df_1.drop(index=df_1.index[100:], axis=0, inplace=True)
df_1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1.drop(index=df_1.index[100:], axis=0, inplace=True)


Unnamed: 0,code,error_check
46500,"#include <stdio.h>\nint main(){\n int k,n;\...",1
46501,"#include <stdio.h>\nint main()\n{\n int k,n...",1
46502,"#include <stdio.h>\nint main(){\n int k,n,i...",1
46503,"#include <stdio.h>\nint main(){\nint i,k,n,a;\...",1
46504,"#include <stdio.h>\nint main(){int i,c,b,d=0,a...",1
...,...,...
46595,"#include <stdio.h>\n\nint main(){\n int n,m...",1
46596,#include <stdio.h>\n\nint main()\n{\n int n...,1
46597,#include <stdio.h>\n\nint main()\n{\n int ...,1
46598,#include <stdio.h>\n\nint main()\n{\n int n...,1


In [29]:
df_0.squeeze()
df_1.squeeze() 

Unnamed: 0,code,error_check
46500,"#include <stdio.h>\nint main(){\n int k,n;\...",1
46501,"#include <stdio.h>\nint main()\n{\n int k,n...",1
46502,"#include <stdio.h>\nint main(){\n int k,n,i...",1
46503,"#include <stdio.h>\nint main(){\nint i,k,n,a;\...",1
46504,"#include <stdio.h>\nint main(){int i,c,b,d=0,a...",1
...,...,...
46595,"#include <stdio.h>\n\nint main(){\n int n,m...",1
46596,#include <stdio.h>\n\nint main()\n{\n int n...,1
46597,#include <stdio.h>\n\nint main()\n{\n int ...,1
46598,#include <stdio.h>\n\nint main()\n{\n int n...,1


In [30]:
df_new = pd.concat([df_0,df_1])

In [31]:
df_new['error_check'].value_counts()

0    100
1    100
Name: error_check, dtype: int64

In [32]:
def my_tokenizer(code):
    tokens = s_tokenizer.tokenize(code)
    print(tokens)
    return tokens

In [33]:
tokens = my_tokenizer(example)

['#', 'include', 'Ġ<', 'stdio', '.', 'h', '>', 'Ċ', '#', 'include', 'Ġ<', 'stdlib', '.', 'h', '>', 'Ċ', 'int', 'Ġmain', '()', 'Ġ{', 'Ċ', 'ĉ', 'int', 'Ġi', ',', 'siz', '1', ',', 'siz', '2', ';', 'Ċ', 'ĉ', 'scanf', '("', '"%', 'd', '"",', 'siz', '1', ');', 'Ċ', 'ĉ', 'scanf', '("', '"%', 'd', '"",', 'siz', '2', ');', 'Ċ', 'ĉ', 'char', 'Ġch', '1', '[', 'siz', '1', '];', 'Ċ', 'ĉ', 'char', 'Ġch', '2', '[', 'siz', '2', '];', 'Ċ', 'ĉ', 'for', '(', 'i', '=', '0', ';', 'i', '<', 'siz', '1', ';', 'i', '++)', 'Ċ', 'ĉ', 'ch', '1', '[', 'i', ']=', 'Ġgetchar', '();', 'Ċ', 'ĉ', 'for', '(', 'i', '=', '0', ';', 'i', '<', 'siz', '1', ';', 'i', '++)', 'Ċ', 'ĉ', 'putchar', '(', 'ch', '1', '[', 'i', ']);', 'Ċ', 'ĉ', 'return', 'Ġ0', ';', 'Ċ', '}']


In [34]:
from sklearn.model_selection import train_test_split
X = df_new['code']
y = df_new['error_check']

In [35]:
X.shape,y.shape

((200,), (200,))

In [36]:
y.value_counts()

0    100
1    100
Name: error_check, dtype: int64

In [57]:
X_train,X_rem, y_train,y_rem = train_test_split(X,y,train_size=0.9,shuffle=True,stratify=y)

In [58]:
X_val,X_test, y_val,y_test = train_test_split(X_rem,y_rem,test_size=0.5,shuffle = True)

In [59]:
tfidf_vector = TfidfVectorizer(tokenizer=my_tokenizer)

In [62]:
X_train_vec = tfidf_vector.fit_transform(X_train)
X_val_vec = tfidf_vector.transform(X_val)
X_test_vec = tfidf_vector.transform(X_test)

['#', 'include', 'Ġ<', 'stdio', '.', 'h', '>', 'Ċ', 'int', 'Ġmain', '(){', 'ĊĠĠ', 'Ġint', 'Ġa', '[', '1000', '];', 'ĊĠĠĠ', 'Ġint', 'Ġi', ',', 'j', ',', 'n', ',', 'k', ',', 'm', '=', '0', ',', 'flag', '=', '0', ';', 'ĊĠĠĠ', 'Ġscanf', '("%', 'd', '",&', 'k', ');', 'ĊĠĠĠ', 'Ġscanf', '("%', 'd', '",&', 'n', ');', 'ĊĠĠĠ', 'Ġfor', 'Ġ(', 'i', '=', '0', ';', 'i', '<', 'n', ';', 'i', '++){', 'ĊĠĠĠĠĠĠĠ', 'Ġscanf', '("%', 'd', '",&', 'a', '[', 'i', ']);', 'ĊĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġ}', 'ĊĠĠĠ', 'Ġfor', 'Ġ(', 'i', '=', '0', ';', 'i', '<', 'n', ';', 'i', '++){', 'ĊĠĠĠĠĠĠĠ', 'Ġj', '=', 'k', '-', 'a', '[', 'i', '];', 'ĊĠĠĠĠĠĠĠ', 'Ġwhile', 'Ġ(', 'm', '<', 'n', '){', 'ĊĠĠĠĠĠĠĠĠĠĠĠ', 'Ġif', 'Ġ(', 'j', '==', 'a', '[', 'm', ']){', 'ĊĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġprintf', '("', 'lucky', '");', 'ĊĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġflag', '=', '1', ';', 'ĊĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġbreak', ';', 'ĊĠĠĠĠĠĠĠĠĠĠĠ', 'Ġ}', 'ĊĠĠĠĠĠĠĠĠĠĠĠ', 'Ġelse', 'Ġ{', 'flag', '==', '2', ';}', 'ĊĠĠĠĠĠĠĠĠĠĠĠ', 'Ġm', '++;', 'ĊĠĠĠĠĠĠĠ', 'Ġ}', 'ĊĠĠĠĠĠĠĠ', 'Ġif', 'Ġ(', 'flag'

In [63]:
X_train_vec.shape,X_val_vec.shape,X_test_vec.shape

((180, 946), (10, 946), (10, 946))

In [82]:
X_train_vec.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04093205, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [83]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=2000)

In [85]:
history = classifier.fit(X_train_vec,y_train)

In [86]:
predicted = classifier.predict(X_test_vec)

In [87]:
metrics.accuracy_score(y_test,predicted)

0.9

In [88]:
metrics.precision_score(y_test, predicted)

0.8333333333333334

In [89]:
metrics.recall_score(y_test, predicted)

1.0

# USING LSTM

In [65]:
import tensorflow as tf

In [66]:
X_train_tf = tf.constant(X_train_vec.toarray())
X_val_tf = tf.constant(X_val_vec.toarray())
X_test_tf = tf.constant(X_test_vec.toarray())

In [67]:
X_train_tf.shape,X_val_tf.shape,X_test_tf.shape

(TensorShape([180, 946]), TensorShape([10, 946]), TensorShape([10, 946]))

In [68]:
y_train_tf = tf.constant(y_train)
y_val_tf = tf.constant(y_val)
y_test_tf = tf.constant(y_test)

In [69]:
y_train_tf.shape

TensorShape([180])

In [70]:
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(32,input_shape=(None,1)),
    tf.keras.layers.Dense(1,activation='sigmoid')]
)

In [71]:
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4), metrics=['accuracy'])

In [73]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 32)                4352      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 4,385
Trainable params: 4,385
Non-trainable params: 0
_________________________________________________________________


In [76]:
history = model.fit(X_train_tf,y_train_tf,batch_size=64,epochs=10,validation_data=(X_val_tf,y_val_tf))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [77]:
predicted = model.predict(X_test_tf)

In [79]:
from sklearn.metrics import accuracy_score,precision_score,recall_score
accuracy_score(y_test_tf,predicted.round())

0.5

In [90]:
#Tensorflow implementation

In [92]:
Model_tf = tf.keras.Sequential([
    tf.keras.layers.Dense(73, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(700, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer=tf.keras.optimizers.Adam(lr=0.03),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)

history = model.fit(X_train_tf, y_train_tf, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [93]:
predictions = model.predict(X_test_tf)

In [97]:
metrics.accuracy_score(y_test, predictions.round())

0.5

In [99]:
pad_token = '<PAD>'
if s_tokenizer.pad_token is None:
    s_tokenizer.add_special_tokens({'pad_token': pad_token})

In [101]:
bert_input = s_tokenizer.encode_plus(
                        example,                      
                        add_special_tokens = True, # add [CLS], [SEP]
                        max_length = 300, # max length of the text that can go to BERT
                        padding='longest', # add [PAD] tokens
                        return_attention_mask = True, # add attention mask to not focus on pad tokens
              )


In [102]:
print('encoded', bert_input)

encoded {'input_ids': [3, 153, 155, 177, 14, 72, 30, 101, 3, 153, 155, 196, 14, 72, 30, 101, 114, 176, 175, 123, 101, 100, 114, 118, 12, 1499, 17, 12, 1499, 18, 27, 101, 100, 197, 117, 1578, 68, 6743, 1499, 17, 116, 101, 100, 197, 117, 1578, 68, 6743, 1499, 18, 116, 101, 100, 311, 364, 17, 59, 1499, 17, 172, 101, 100, 311, 364, 18, 59, 1499, 18, 172, 101, 100, 131, 8, 73, 29, 16, 27, 73, 28, 1499, 17, 27, 73, 135, 101, 100, 251, 17, 59, 73, 193, 1960, 744, 101, 100, 131, 8, 73, 29, 16, 27, 73, 28, 1499, 17, 27, 73, 135, 101, 100, 5177, 8, 251, 17, 59, 73, 183, 101, 100, 146, 161, 27, 101, 93], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [115]:
from transformers import GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(s_tokenizer),
    n_ctx=128,
    bos_token_id=s_tokenizer.bos_token_id,
    eos_token_id=s_tokenizer.eos_token_id,
    output_attentions=True
)

In [116]:
model = GPT2LMHeadModel(config= config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 94.8M parameters


In [117]:
from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [118]:
learning_rate = 2e-5

# optimizer Adam
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model
model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])

# Tokenize the input (takes some time)
x_train = s_tokenizer(
          text=X_train.to_list(),
          add_special_tokens=True,
          max_length=300,
          truncation=False,
          padding=True, 
          return_tensors='tf',
          return_token_type_ids = False,
          return_attention_mask = True,
          verbose = True)

x_val = s_tokenizer(
          text=X_test.to_list(),
          add_special_tokens=True,
          max_length=300,
          truncation=True,
          padding=True, 
          return_tensors='tf',
          return_token_type_ids = False,
          return_attention_mask = True,
          verbose = True)



In [119]:
# Fit the model
history = model.fit(
    x={'input_ids': x_train['input_ids']},
    y={'error_check': y_train},
    validation_data=({'input_ids': x_val['input_ids']},{'error_check': y_test}),
    batch_size=100,
    epochs=2,
    verbose=1)

Epoch 1/2


ValueError: in user code:

    File "C:\Users\Hp\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Hp\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Hp\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Hp\anaconda3\lib\site-packages\keras\engine\training.py", line 864, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "C:\Users\Hp\anaconda3\lib\site-packages\keras\engine\training.py", line 957, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "C:\Users\Hp\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 438, in update_state
        self.build(y_pred, y_true)
    File "C:\Users\Hp\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 358, in build
        self._metrics = tf.__internal__.nest.map_structure_up_to(y_pred, self._get_metric_objects,

    ValueError: The two structures don't have the same sequence length. Input structure has length 2, while shallow structure has length 1.


In [None]:
model_eval = model.evaluate(
    x={'input_ids': x_val['input_ids']},
    y={'error_check': y_test}
)

In [None]:
y_val_predicted = model.predict(
    x={'input_ids': x_val['input_ids']},
)

In [None]:
y_val_pred_max=[np.argmax(i) for i in y_val_predicted['error_check']]

In [None]:
y_val_actual_max=[np.argmax(i) for i in y_val]

In [None]:
from sklearn.metrics import classification_report
report = classification_report(y_val_pred_max, y_val_actual_max)
print(report)