In [None]:
!pip install pandas numpy tqdm scikit-learn tensorflow tokenizers transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip list

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import  matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from tokenizers import BertWordPieceTokenizer

In [None]:
# Load the labelled dataset; later cells read its `text` and `intent` columns.
# NOTE(review): the file carries an .xlsx extension but is parsed as CSV. If it is a
# real Excel workbook this call will fail and pd.read_excel is needed — confirm the format.
data = pd.read_csv('tun.xlsx')

In [None]:
# Show the loaded DataFrame as this cell's output.
data


In [None]:
# Training configuration shared by the cells below.
# Number of epochs passed to model.fit for every fold.
EPOCHS = 10
# Batch size used by the train / validation / test tf.data pipelines.
BATCH_SIZE = 100
# Token length every text is truncated/padded to; also the model's input length.
MAX_LEN = 192
# Passed to Dataset.prefetch() so tf.data picks the prefetch buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
def encode_text(texts, tokenizer, chunk_size=256, maxlen=512):
    """Tokenize ``texts`` chunk by chunk and return the token ids as one array.

    The tokenizer is configured in place -- and stays configured after the
    call -- to truncate and pad every encoding to exactly ``maxlen`` tokens.

    Args:
        texts: Sliceable sequence of strings. NumPy arrays and pandas Series
            are converted with ``.tolist()`` as before; plain lists and tuples
            are accepted as well.
        tokenizer: Object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch`` whose encodings have an ``ids`` attribute.
        chunk_size: Number of texts sent to ``encode_batch`` per call.
        maxlen: Fixed length (in tokens) of every returned row.

    Returns:
        ``np.ndarray`` with one row of token ids per input text. Empty input
        yields an integer array of shape ``(0, maxlen)``.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # ndarray / Series slices expose .tolist() (yielding plain Python str);
        # list and tuple slices do not, so fall back to list().
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    if not all_ids:
        # np.array([]) would be a float array of shape (0,); keep the 2-D
        # integer layout callers get for non-empty input.
        return np.empty((0, maxlen), dtype=int)

    return np.array(all_ids)

In [None]:
# WordPiece tokenizer built from a local vocabulary file, with lower-casing disabled.
# NOTE(review): presumably this vocab.txt is the vocabulary of the
# 'distilbert-base-multilingual-cased' checkpoint loaded for the model — verify,
# since the produced ids index that checkpoint's embedding table.
fast_tokenizer = BertWordPieceTokenizer('/content/vocab.txt', lowercase=False)

In [None]:
# from keras.optimizers_v1 import Adam
# from tensorflow.compat.v1.keras import Adam
from tensorflow.keras.optimizers.legacy import Adam

def build_model(transformer, max_len=512, learning_rate=1e-6):
    """Wrap ``transformer`` in a compiled binary classifier.

    The model feeds ``max_len`` int32 token ids to ``transformer``, takes the
    first element of its output, slices position 0 of the sequence axis
    (presumably the [CLS] token -- confirm for the tokenizer in use) and maps
    it through a single sigmoid unit.

    Args:
        transformer: Callable Keras-compatible encoder; its output must be
            indexable and element 0 must be a (batch, seq, hidden) tensor.
        max_len: Number of token ids per example.
        learning_rate: Step size for the legacy Adam optimizer. Defaults to
            the previously hard-coded 1e-6.

    Returns:
        Model compiled with binary cross-entropy loss and the ``accuracy``
        and ``AUC`` metrics.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(
        # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', 'AUC'],
    )

    return model

In [None]:
# Despite its name, `texts` holds token-id rows (one per example, MAX_LEN ids each),
# not raw strings; the `text` column is cast to str before tokenization.
texts = encode_text(data.text.values.astype(str), fast_tokenizer, maxlen=MAX_LEN)
# Labels taken from the `intent` column.
# NOTE(review): the sigmoid / binary_crossentropy model assumes these are 0/1 — confirm.
ys = data.intent.values

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def create_train(x_train, y_train):
    """Return the training pipeline for ``(x_train, y_train)``.

    The dataset is repeated without end, shuffled with a 2048-element buffer,
    split into batches of ``BATCH_SIZE`` and prefetched with ``AUTO``. Because
    it never ends, callers must bound each epoch themselves.
    """
    pairs = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    # Stage order is kept as-is: repeat -> shuffle -> batch -> prefetch.
    endless = pairs.repeat()
    shuffled = endless.shuffle(2048)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.prefetch(AUTO)

def create_valid(x_valid, y_valid):
    """Return the validation pipeline for ``(x_valid, y_valid)``.

    Examples keep their order, are grouped into batches of ``BATCH_SIZE``,
    cached after batching and prefetched with ``AUTO``.
    """
    pairs = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    batched = pairs.batch(BATCH_SIZE)
    # Cache sits after batching, so whole batches are what gets cached.
    cached = batched.cache()
    return cached.prefetch(AUTO)

def create_test(x_test):
    """Return an inference pipeline: ``x_test`` in order, in batches of ``BATCH_SIZE``."""
    examples = tf.data.Dataset.from_tensor_slices(x_test)
    return examples.batch(BATCH_SIZE)

In [None]:
# K-fold cross-validation: train one model per fold and plot its learning curves.
FOLDS = 5
SEED  = 42
PRETRAINED_NAME = 'distilbert-base-multilingual-cased'
skf = KFold(n_splits=FOLDS,shuffle=True,random_state=SEED)

for fold,(train_indices,valid_indices) in enumerate(skf.split(texts,ys)) :
    print('Fold' , fold+1)

    # Start every fold from the pretrained weights. Reusing one model across
    # folds would let fold k validate on rows that folds < k already trained
    # on, which inflates the validation metrics. clear_session() releases the
    # previous fold's graph state before the new model is built.
    # After the loop, `model` is the model trained on the last fold only.
    tf.keras.backend.clear_session()
    transformer_layer = transformers.TFDistilBertModel.from_pretrained(PRETRAINED_NAME)
    model = build_model(transformer_layer, max_len=MAX_LEN)

    # Keep only the weights of the epoch with the lowest validation loss.
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')

    # The training dataset repeats forever, so one epoch must be bounded
    # explicitly. A step consumes one *batch*, hence samples // BATCH_SIZE
    # (at least 1) -- not the raw sample count.
    n_steps = max(1, train_indices.shape[0] // BATCH_SIZE)
    history = model.fit(
        create_train(texts[train_indices],ys[train_indices]),
        steps_per_epoch=n_steps,
        validation_data=create_valid(texts[valid_indices],ys[valid_indices]),
        epochs=EPOCHS,
        callbacks=[sv],
    )

    hist = history.history
    # Look the AUC key up instead of hard-coding 'auc': the metric can be
    # logged under a suffixed name (e.g. 'auc_1') when models are rebuilt.
    auc_key = next(k for k in hist if k.lower().startswith('auc'))
    val_auc_key = 'val_' + auc_key
    # Use the number of epochs actually recorded rather than EPOCHS.
    epochs_ran = np.arange(len(hist['loss']))

    # Left axis: train / validation AUC, with the best validation epoch marked.
    fig, ax_auc = plt.subplots(figsize=(15,5))
    ax_auc.plot(epochs_ran,hist[auc_key],'-o',label='Train AUC',color='#ff7f0e')
    ax_auc.plot(epochs_ran,hist[val_auc_key],'-o',label='Val AUC',color='#1f77b4')
    best_auc_epoch = np.argmax(hist[val_auc_key])
    best_auc = np.max(hist[val_auc_key])
    xdist = ax_auc.get_xlim()[1] - ax_auc.get_xlim()[0]
    ydist = ax_auc.get_ylim()[1] - ax_auc.get_ylim()[0]
    ax_auc.scatter(best_auc_epoch,best_auc,s=200,color='#1f77b4')
    ax_auc.text(best_auc_epoch-0.03*xdist,best_auc-0.13*ydist,'max auc\n%.2f'%best_auc,size=14)
    ax_auc.set_ylabel('AUC',size=14)
    ax_auc.set_xlabel('Epoch',size=14)
    ax_auc.legend(loc=2)

    # Right axis: train / validation loss, with the lowest validation loss marked.
    ax_loss = ax_auc.twinx()
    ax_loss.plot(epochs_ran,hist['loss'],'-o',label='Train Loss',color='#2ca02c')
    ax_loss.plot(epochs_ran,hist['val_loss'],'-o',label='Val Loss',color='#d62728')
    best_loss_epoch = np.argmin(hist['val_loss'])
    best_loss = np.min(hist['val_loss'])
    ydist = ax_loss.get_ylim()[1] - ax_loss.get_ylim()[0]
    ax_loss.scatter(best_loss_epoch,best_loss,s=200,color='#d62728')
    ax_loss.text(best_loss_epoch-0.03*xdist,best_loss+0.05*ydist,'min loss',size=14)
    ax_loss.set_ylabel('Loss',size=14)
    ax_loss.set_title('FOLD %i Distilbert-base-multilingual-cased'%
                (fold+1),size=18)
    ax_loss.legend(loc=3)
    plt.show()
    # Free the figure so five folds do not accumulate open figures.
    plt.close(fig)

### **Tests:**

In [None]:
# Load the held-out test set; its `text` and `intent` columns are read below.
# NOTE(review): this is a Kaggle-style relative path while the rest of the notebook
# uses Colab paths (/content/...) — confirm the file is reachable from here.
test = pd.read_csv('../input/vneuron/extra_test_data.csv')

# Tokenize the test texts to MAX_LEN ids per row, exactly like the training data.
test_texts = encode_text(test.text.values.astype(str), fast_tokenizer, maxlen=MAX_LEN)

# Ground-truth labels for the test rows.
test_ys = test.intent.values

In [None]:
# Predicted probabilities for the test rows (one sigmoid output per row).
probabilities = model.predict(create_test(test_texts))
# Vectorized threshold: strictly above 0.5 -> class 1, otherwise class 0.
# Flattened to a 1-D integer array so it lines up with `test_ys`.
results = (probabilities > 0.5).astype(int).ravel()

In [None]:
# Confusion matrix of true labels (first argument) against thresholded predictions.
confusion_matrix(test_ys, results)

In [None]:
# Text report of the classification metrics for the thresholded predictions.
print(classification_report(test_ys, results))

In [None]:
# Smoke test on a few hand-written sentences.
# Encode through encode_text so truncation/padding to MAX_LEN is applied
# explicitly, instead of relying on whatever state earlier cells left on the
# tokenizer. A NumPy array is passed because encode_text slices its input and
# calls .tolist() on each chunk.
sample_texts = np.array(['Yaatikom el saha','Wena mechi fel tari9','service khayeb'])
sample_ids = encode_text(sample_texts, fast_tokenizer, maxlen=MAX_LEN)
test_data = create_test(sample_ids)
predictions = model.predict(test_data)
for prediction in predictions :
    print(prediction)