In [38]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import pandas as pd

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
from sklearn.utils import class_weight

## Model that combines the Naive Bayes classifier's output with the pooled output of the BERT model

In [39]:
from transformers import BertTokenizer, TFBertModel

# Single source of truth for the checkpoint name, so the tokenizer's
# vocabulary and the model's weights always come from the same checkpoint.
BERT_CHECKPOINT = 'bert-base-cased'

bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [40]:
# Load the questions table from a CSV located in the working directory.
# The 'SPLIT' column of this frame is used further down to separate the
# training rows from the test rows.
questions_data = pd.read_csv("questions_nb_predictions.csv")

In [41]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed: without one, the row order (and so the
# small subset taken from the top of the frame afterwards) would differ on
# every Restart & Run All, making results impossible to reproduce.
questions_data = shuffle(questions_data, random_state=42)

In [42]:
# Work on the first 50 rows only -- a small subset to get the pipeline
# running end to end before using everything.
questions_data = questions_data.iloc[:50]

In [48]:
# Partition the rows by the value stored in the 'SPLIT' column.
train_df = questions_data[questions_data['SPLIT'].eq('TRAIN')]
test_df = questions_data[questions_data['SPLIT'].eq('TEST')]

In [49]:
def make_input(title, nb_prob):
    """Build one feature vector from a question title and its Naive Bayes score.

    The title is encoded with the notebook-level ``bert_tokenizer`` and
    ``bert_model``; element 1 of the model result (used in this notebook as
    the pooled output) is flattened to a plain Python list and ``nb_prob`` is
    appended as the last element.

    Args:
        title: Question title text to encode.
        nb_prob: Value produced for the same question by the Naive Bayes
            classifier (presumably a probability -- confirm against the CSV).

    Returns:
        list: the pooled BERT values followed by ``nb_prob``.
    """
    # Encode the `title` argument itself. The previous body read the undefined
    # names `row` and `text`, so every call raised NameError.
    # No max_length is passed, so padding='max_length' pads to the model's
    # maximum input length.
    # NOTE(review): consider an explicit max_length to keep the inputs short.
    tokenized_output = bert_tokenizer([title],
        truncation=True,
        padding='max_length',
        return_tensors='tf')

    bert_encoding = bert_model(
        tokenized_output
    )

    # Element 1 is the pooled output; [0] selects the single item in the batch.
    pooled_output = bert_encoding[1].numpy()[0].tolist()

    # Combine both signals into one flat vector (previously nothing was
    # returned and `nb_prob` was never used).
    return pooled_output + [nb_prob]

In [29]:
# Titles and labels of the training rows as plain lists.
# Reads from `train_df` (the frame built from the 'SPLIT' == 'TRAIN' rows);
# the name `train` used here before is not defined anywhere in the notebook
# and fails with NameError on a fresh kernel.
train_titles = list(train_df['Title'])
train_labels = list(train_df['has_positive_answer'])

In [30]:
# Titles and labels of the test rows as plain lists.
# Reads from `test_df` (the frame built from the 'SPLIT' == 'TEST' rows);
# the name `test` used here before is not defined anywhere in the notebook
# and fails with NameError on a fresh kernel.
test_titles = list(test_df['Title'])
test_labels = list(test_df['has_positive_answer'])

In [31]:
# Weight per class, keyed by label value.
# NOTE(review): these numbers are hard-coded -- presumably derived from the
# label distribution in an earlier run; confirm they still match the data.
class_weights_dict = {
    0: 1.5157873389285825,
    1: 0.7461145068648831,
}

In [32]:
# Sequence-length setting for tokenization.
# NOTE(review): get_pooled_output spells out 20 itself instead of reading
# this variable, so changing the value here alone has no effect there.
max_length = 20

In [35]:
def get_pooled_output(text, max_length=20):
    """Encode one text with BERT and return its pooled output as a list.

    Uses the notebook-level ``bert_tokenizer`` and ``bert_model``.

    Args:
        text: The string to encode. It is wrapped in a one-element batch.
        max_length: Number of tokens the input is truncated/padded to.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        list: element 1 of the model result (the pooled output) for the
        single input, converted from a tensor to a plain Python list.
    """
    tokenized_output = bert_tokenizer(
        [text],
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

    bert_encoding = bert_model(tokenized_output)

    # [1] picks the pooled output; [0] picks the only row of the batch.
    return bert_encoding[1].numpy()[0].tolist()

In [37]:
# Two separate, initially empty dicts for the train and test data.
train_data, test_data = {}, {}