In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Make the project modules under ./src importable from this notebook.
# Guarded so that re-running the cell does not keep appending the same entry.
# NOTE(review): appended entries are searched last, so a project module whose
# name matches an already-importable module (e.g. `test`) may be shadowed by
# that other module -- confirm `from test import *` resolves to ./src/test.py.
if './src' not in sys.path:
    sys.path.append('./src')

In [None]:
# Standard library
import gc
import sys

# Third-party
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import ConvBertTokenizer, TFConvBertModel

# Project modules (found through the './src' entry added to sys.path)
from model import bert_bigru_cnn_model, bert_bilstm_attention_model, bert_bilstm_model
from train_model import train_model, fetch_results
from test import *
from data import create_tensor_dataset, create_test_dataset
from text_cleaning import clean_text

# scikit-learn is kept after `from test import *` so that, as before, its star
# import is the last one applied if the two happen to export the same name.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

## Data Loading and Model Preparation

In [None]:
# --- Pretrained checkpoint -------------------------------------------------
MODEL_NAME = 'dbmdz/convbert-base-turkish-mc4-uncased'

# --- Data columns ----------------------------------------------------------
TEXT_COLUMN = 'number_to_text'  # input text column
LABEL_COLUMN = 'label'          # target column

# --- Tokenization / training hyper-parameters ------------------------------
MAX_LENGTH = 32   # passed as max_length to the dataset and model builders
BATCH_SIZE = 256  # 128 was the previously tried value
EPOCHS = 20

In [None]:
# Read the train / validation / test splits from ./data (in that order).
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/augmented_train.csv',
        './data/cleaned_valid_df.csv',
        './data/cleaned_test_df.csv',
    )
)

In [None]:
# Run the project's text-cleaning step over every split, in train/valid/test order.
# Per the original note, clean_text works on the 'text' column by default.
# NOTE(review): the model is later fed TEXT_COLUMN ('number_to_text') -- confirm
# that clean_text is what produces/updates that column.
train_df, valid_df, test_df = map(clean_text, (train_df, valid_df, test_df))

In [None]:
# Remove the cross-split duplicates that data augmentation can introduce.
def remove_common_rows(df1, df2, column=None):
    """Return the rows of `df1` whose text does not also occur in `df2`.

    Args:
        df1: DataFrame to filter; it is not modified.
        df2: DataFrame whose values in `column` must be excluded from `df1`.
        column: name of the column to compare. Defaults to the notebook-level
            TEXT_COLUMN when omitted, which is what existing two-argument
            calls rely on.

    Returns:
        A new DataFrame holding the rows of `df1` (original index labels kept)
        whose `column` value is absent from `df2[column]`.
    """
    if column is None:
        column = TEXT_COLUMN
    # Filter with a boolean mask instead of dropping by index label: label-based
    # dropping also removes non-matching rows when df1's index is not unique.
    is_shared = df1[column].isin(df2[column])
    return df1.loc[~is_shared].copy()

# Keep the splits disjoint on the text column, then drop fully identical rows.
# Priority is test > valid > train: a text present in a "higher" split is
# removed from the "lower" one.
train_df = (
    train_df
    .pipe(remove_common_rows, valid_df)
    .pipe(remove_common_rows, test_df)
    .drop_duplicates()
)
valid_df = valid_df.pipe(remove_common_rows, test_df).drop_duplicates()
test_df = test_df.drop_duplicates()

#### CREATE BATCH DATASET

In [None]:
# Tokenizer that belongs to the pretrained checkpoint.
tokenizer = ConvBertTokenizer.from_pretrained(MODEL_NAME)

# Train / validation datasets built by data.py. The column names come from the
# config constants (instead of repeated string literals) so this cell stays in
# sync with the K-fold section, which already uses TEXT_COLUMN / LABEL_COLUMN.
train_dataset, dev_dataset = create_tensor_dataset(train_df, valid_df, tokenizer, TEXT_COLUMN, LABEL_COLUMN, max_length=MAX_LENGTH)

# Test dataset plus the second value returned by create_test_dataset
# (presumably the raw tokenizer encodings -- confirm in data.py).
test_dataset, test_encoded = create_test_dataset(test_df[TEXT_COLUMN].values, tokenizer, max_length=MAX_LENGTH)

## TRAINING AND TESTING

In [None]:
# Load the pretrained ConvBERT encoder identified by MODEL_NAME
# (dbmdz/convbert-base-turkish-mc4-uncased).
bert_encoder = TFConvBertModel.from_pretrained(MODEL_NAME)

In [None]:
# Build the three candidate classifiers, in the same order as before.
# NOTE(review): all three are built around the same `bert_encoder` object; if
# the builders leave the encoder trainable, training one network also changes
# the encoder seen by the other two -- confirm in model.py.
bert_bilstm_net, bert_bigru_cnn_net, bert_bilstm_attention_net = [
    build_model(bert_encoder, max_length=MAX_LENGTH)
    for build_model in (bert_bilstm_model, bert_bigru_cnn_model, bert_bilstm_attention_model)
]

In [None]:
# Train one model on the fixed train/validation split.
# Only the BiLSTM variant is trained here; the two commented-out calls are the
# equivalent invocations for the other architectures (enable one at a time).
train_model(bert_bilstm_net, train_dataset, dev_dataset, batch_size=BATCH_SIZE, epochs=EPOCHS)
#train_model(bert_bigru_cnn_net, train_dataset, dev_dataset, batch_size=BATCH_SIZE, epochs=EPOCHS)
#train_model(bert_bilstm_attention_net, train_dataset, dev_dataset, batch_size=BATCH_SIZE, epochs=EPOCHS)

In [None]:
# Report the trained BiLSTM model's performance on the validation (dev) split.
# fetch_results comes from train_model.py and returns two values, kept here as
# `prediction` and `outputs` (presumably predicted labels and raw model
# outputs -- confirm in train_model.py).
prediction, outputs = fetch_results(bert_bilstm_net, dev_dataset=dev_dataset, true_labels=valid_df.label)

#### WITH STRATIFIEDKFOLD

In [None]:
# Pool train + validation rows (text and label columns only) into one frame
# with a fresh 0..n-1 index; the K-fold loop re-splits this pool itself.
df = pd.concat(
    [split[[TEXT_COLUMN, LABEL_COLUMN]] for split in (train_df, valid_df)],
    ignore_index=True,
)

# Features / targets handed to StratifiedKFold.
X, Y = df[TEXT_COLUMN], df[LABEL_COLUMN]

# Test dataset that every fold model predicts on.
test_dataset, test_encoded = create_test_dataset(
    test_df[TEXT_COLUMN].values,
    tokenizer,
    max_length=MAX_LENGTH,
)

In [None]:
N_SPLITS = 5
KFOLD_SEED = 42


def run_stratified_kfold(build_model, n_splits=N_SPLITS, seed=KFOLD_SEED, keep_models=True):
    """Train one architecture with stratified K-fold CV and sum its test predictions.

    Uses the notebook-level X / Y (pooled train + validation data), tokenizer
    and test_dataset.

    Args:
        build_model: model builder imported from model.py; called as
            build_model(encoder, max_length=MAX_LENGTH).
        n_splits: number of stratified folds.
        seed: random_state of the shuffled split, so every architecture is
            trained on the same folds.
        keep_models: when True the trained fold models are returned; when
            False they are released after predicting, to limit memory use.

    Returns:
        (test_pred_sum, fold_models): the element-wise sum over folds of
        model.predict(test_dataset), and the list of kept fold models
        (empty when keep_models is False).
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_models = []
    test_pred_sum = 0
    for fold_num, (train_index, val_index) in enumerate(splitter.split(X, Y), start=1):
        print("Results for fold", fold_num)
        x_train, x_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = Y.iloc[train_index], Y.iloc[val_index]

        fold_train_df = pd.concat([x_train, y_train], axis=1)
        fold_valid_df = pd.concat([x_val, y_val], axis=1)
        fold_train_ds, fold_dev_ds = create_tensor_dataset(fold_train_df, fold_valid_df, tokenizer, TEXT_COLUMN, LABEL_COLUMN, max_length=MAX_LENGTH)

        # Load a fresh pretrained encoder for every fold. Reusing one encoder
        # object across folds means that, if the builder leaves it trainable,
        # fold k starts from weights already tuned on earlier folds' training
        # data -- which contains fold k's validation rows.
        fold_encoder = TFConvBertModel.from_pretrained(MODEL_NAME)
        model = build_model(fold_encoder, max_length=MAX_LENGTH)
        train_model(model, fold_train_ds, fold_dev_ds, batch_size=BATCH_SIZE, epochs=EPOCHS)

        # This score is computed on the fold's held-out validation rows
        # (the message previously called it a TEST score).
        valid_pred = model.predict(fold_dev_ds)
        f1 = f1_score(y_val, np.argmax(valid_pred, 1), average='macro')
        print(f'{fold_num}. FOLD VALIDATION F1 SCORE: {f1}')

        test_pred_sum = test_pred_sum + model.predict(test_dataset)
        if keep_models:
            fold_models.append(model)

        del fold_train_df, fold_valid_df, fold_train_ds, fold_dev_ds, model, fold_encoder
        gc.collect()
    return test_pred_sum, fold_models


# BiLSTM: `oof` and `models` keep their original names. Note that `oof` holds
# the summed *test-set* predictions of the fold models, not out-of-fold ones.
oof, models = run_stratified_kfold(bert_bilstm_model)
bert_bilstm_test_predictions = np.argmax(oof / N_SPLITS, 1)

# The ensemble cell needs the same quantity for the other two architectures;
# previously these names only existed after editing and re-running this cell
# by hand, so a fresh "Run All" failed with a NameError.
bigru_cnn_test_sum, _ = run_stratified_kfold(bert_bigru_cnn_model, keep_models=False)
bert_bigru_cnn_test_predictions = np.argmax(bigru_cnn_test_sum / N_SPLITS, 1)

bilstm_attention_test_sum, _ = run_stratified_kfold(bert_bilstm_attention_model, keep_models=False)
bert_bilstm_attention_test_predictions = np.argmax(bilstm_attention_test_sum / N_SPLITS, 1)

### ENSEMBLE 3 DIFFERENT KFOLD MODELS

In [None]:
# show performance results on test dataset
#prediction_bert_bilstm, outputs_bert_bilstm = fetch_results(bert_bilstm_net, dev_dataset=test_dataset, true_labels=None)
#prediction_bert_bigru_cnn, outputs_bert_bigru_cnn = fetch_results(bert_bigru_cnn_net, dev_dataset=test_dataset, true_labels=None)
#prediction_bert_bilstm_attention, outputs_bert_bilstm_attention = fetch_results(bert_bilstm_attention_net, dev_dataset=test_dataset, true_labels=None)

In [None]:
# Weighted blend of the three K-fold models' test predictions
# (weights 0.39 / 0.38 / 0.23 sum to 1).
# NOTE(review): each *_test_predictions array holds argmax class indices, so
# this blends label ids rather than class probabilities -- confirm that is
# intended (e.g. a binary task with a later threshold/round step).
pred_ensemble = (
    0.39 * bert_bilstm_test_predictions
    + 0.38 * bert_bigru_cnn_test_predictions
    + 0.23 * bert_bilstm_attention_test_predictions
)