bert ann

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub
from matplotlib import pyplot as plt
import seaborn as sn
import numpy as np
import nltk
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dropout
from keras.layers import Embedding
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import KNeighborsClassifier

# NLTK resources needed further down: the stop-word corpus for
# `stopwords.words('english')` and the Punkt sentence tokenizer data that
# `word_tokenize` loads. Newer NLTK releases ship the tokenizer data as
# 'punkt_tab', older ones as 'punkt'; requesting both keeps a fresh
# environment working on either (an unknown id is reported, not raised).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
def plot_train_val_data(model,labels,validation=False):
    """Plot one subplot per metric with its per-epoch training curve.

    Args:
        model: Object exposing a ``history`` mapping from metric name to a
            sequence of per-epoch values (the callers in this notebook pass
            the value returned by ``fit``).
        labels: Metric names to plot; each one gets its own subplot.
        validation: When truthy, the ``"val_" + label`` series is drawn on
            the same subplot as a dotted line.

    Raises:
        ValueError: If ``labels`` is empty (there would be nothing to plot).
        KeyError: If a requested series is missing from ``model.history``.
    """
    if len(labels) == 0:
        raise ValueError("labels must contain at least one metric name")

    n_plots = len(labels)
    train_curves = [model.history[name] for name in labels]
    val_curves = [model.history["val_" + name] for name in labels] if validation else []

    epochs = range(1, len(train_curves[0]) + 1)

    # squeeze=False always returns a 2-D array of axes, so indexing also
    # works when a single label is requested.
    fig, axes = plt.subplots(1, n_plots, figsize=(20, 5), squeeze=False)
    for i, name in enumerate(labels):
        ax = axes[0][i]
        ax.plot(epochs, train_curves[i], '-', label=name)
        if validation:
            ax.plot(epochs, val_curves[i], ':', label="Validation " + name)
        ax.set_title(name, fontsize=20)
        ax.legend(loc='lower right')

    # Lay out after drawing so titles and legends are taken into account.
    fig.tight_layout()
    plt.show()

In [None]:
# Model handles for the BERT text preprocessor and the matching encoder.
preprocess_link = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
encoder_link = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4"

# Wrap both handles as Keras layers; they are reused by the embedding helper
# and by the BERT classifier defined later in the notebook.
preprocessor = hub.KerasLayer(handle=preprocess_link)
encoder = hub.KerasLayer(handle=encoder_link)

def get_embedings_sentences(sentences):
    """Run ``sentences`` through the BERT preprocessor and encoder.

    Args:
        sentences: Batch of raw strings accepted by ``preprocessor``.

    Returns:
        The entry stored under ``"pooled_output"`` in the encoder's result.
        (The encoder result is also indexed by ``"sequence_output"`` in a
        variant the original author left disabled.)
    """
    bert_inputs = preprocessor(sentences)
    bert_outputs = encoder(bert_inputs)
    return bert_outputs["pooled_output"]


# Smoke test: embed a single sentence and display the result.
get_embedings_sentences(["Hello fellow humans!"])

In [None]:
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Load the NLTK English stop-word list once and reuse it afterwards."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(text):
    """Lower-case ``text`` and drop stop words and non-alphabetic tokens.

    Args:
        text: Input string.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    # The stop-word set is cached: this function is mapped over every row of
    # the dataset, and rebuilding the set per call is pure overhead.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    return ' '.join(tok for tok in tokens if tok.isalpha() and tok not in stop_words)

def find_max_list(list):
    """Return the length of the longest element of ``list``.

    Args:
        list: Iterable of sized items (e.g. token sequences). The parameter
            name shadows the builtin; it is kept so existing keyword callers
            keep working.

    Returns:
        The maximum ``len(item)`` over all items, or 0 when ``list`` is empty
        (previously an empty input raised ``ValueError``).
    """
    return max((len(item) for item in list), default=0)

In [None]:
def label_distorsion_binary(row):
    """Map a row to a binary label based on its "Dominant Distortion" value.

    Args:
        row: Mapping-like object with a "Dominant Distortion" entry.

    Returns:
        0 when the entry equals "No Distortion", otherwise 1.
    """
    has_distortion = row["Dominant Distortion"] != "No Distortion"
    return 1 if has_distortion else 0

# Load both CSV files and join them on the shared "Id_Number" column.
therapis_responses = pd.read_csv("data/Therapist_responses.csv", sep=",")
annotated_data = pd.read_csv("data/Annotated_data.csv", sep=",")
data = therapis_responses.merge(annotated_data, on='Id_Number').drop(columns=["Question"])

# Binary target: 0 for "No Distortion", 1 for anything else.
data["Distortion"] = data.apply(label_distorsion_binary, axis="columns")

# Plain list-of-rows view of the merged frame.
data_list = data.values.tolist()
# Column positions inside each `data_list` row, as noted by the original author
# (NOTE(review): not verifiable from this cell -- check against `data.columns`):
# 0 = ANSWER   ,   1 = ID   ,   2 = QUESTION   ,   3 = DISTORTED PART   ,
# 4 = DOMINANT DISTORTION   ,   5 = SECONDARY DISTORTION   ,   6 = DISTORTION 1/0

# ----------------------------------------------------------------------

In [None]:
# Train/test split of the raw patient questions. The fixed seed makes the
# split (and every metric computed from it) reproducible after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(
    data["Patient Question"],
    data["Distortion"],
    random_state=42,
)

In [None]:
# Train/test split of the questions after stop-word removal. The fixed seed
# makes the split reproducible after a kernel restart.
X_train_sw, X_test_sw, Y_train_sw, Y_test_sw = train_test_split(
    [remove_stop_words(question) for question in data["Patient Question"]],
    data["Distortion"],
    random_state=42,
)

In [None]:
# Metrics passed to every `compile` call in this notebook. The names given
# here are the keys later looked up in the training history when plotting.
# NOTE(review): the same metric instances are handed to several models --
# confirm that sharing them is intended.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
]

# ------------------------------------------------

In [None]:


# BERT backbone: raw strings -> preprocessor -> encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
preprocessed_text = preprocessor(text_input)
outputs = encoder(preprocessed_text)

# Classification head on the pooled output: light dropout followed by a
# single sigmoid unit. (A variant with Dense(10, relu) -> Dropout(0.25) ->
# Dense(1, sigmoid) was tried by the original author and left disabled.)
dropout = tf.keras.layers.Dropout(0.05, name="dropout")
output_unit = tf.keras.layers.Dense(1, activation="sigmoid", name="output")
layer = output_unit(dropout(outputs["pooled_output"]))

# Assemble the end-to-end model and show its structure.
model = tf.keras.Model(inputs=[text_input], outputs=[layer])

model.summary()




In [None]:
# Compile the BERT classifier for binary classification.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=METRICS,
)

# Train for two epochs, evaluating on the held-out split after each one.
history = model.fit(
    x=X_train,
    y=Y_train,
    epochs=2,
    validation_data=(X_test, Y_test),
)

In [None]:
# Training vs. validation curves for the BERT classifier.
plot_train_val_data(
    history,
    labels=["loss", "accuracy", "precision", "recall"],
    validation=True,
)


# ------------------------------------------------

In [None]:
# Stop-word-filtered questions, one string per row of `data`.
sentences = data["Patient Question"].tolist()
sentences = list(map(remove_stop_words,sentences))

# Word-level integer encoding of the filtered questions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# +1 because the tokenizer's word indices start at 1 (0 is used for padding).
VOCAB_LEN = len(tokenizer.word_index) + 1
#MAX_LEN = find_max_list(sequences)
MAX_LEN = 2000

# Pad (at the end) every sequence to MAX_LEN.
padded_sequences = pad_sequences(sequences,maxlen=MAX_LEN, padding='post')

#tokenizer.sequences_to_texts(padded_sequences)[:3]

# The fixed seed makes the split reproducible after a kernel restart.
X_train_sw_tokenizer, X_test_sw_tokenizer, Y_train_sw_tokenizer, Y_test_sw_tokenizer = train_test_split(
    padded_sequences,
    np.array(data["Distortion"]),
    random_state=42,
)

# ANN: embedding -> flatten -> one hidden dense layer -> sigmoid output.
n_dim = 2
model = Sequential([
    # Vocabulary size of the tokenizer / embedding dimensions / padded sequence length
    Embedding(VOCAB_LEN, n_dim, input_length=MAX_LEN),
    Flatten(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)
model.summary()
hist = model.fit(
    X_train_sw_tokenizer,
    Y_train_sw_tokenizer,
    epochs=2,
    validation_data=(X_test_sw_tokenizer, Y_test_sw_tokenizer),
)


# CNN: embedding -> conv -> max-pool -> conv -> global max-pool -> sigmoid output.
n_dim = 2    # embedding width, also used as the number of convolution filters
seq_len = 3  # convolution kernel size
model2 = Sequential([
    Embedding(VOCAB_LEN, n_dim, input_length=MAX_LEN),
    Conv1D(n_dim, seq_len, activation='relu'),
    MaxPooling1D(5),
    Conv1D(n_dim, seq_len, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid'),
])
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)
model2.summary()
hist2 = model2.fit(
    X_train_sw_tokenizer,
    Y_train_sw_tokenizer,
    epochs=2,
    validation_data=(X_test_sw_tokenizer, Y_test_sw_tokenizer),
)

In [None]:
# Training vs. validation curves: first the ANN run, then the CNN run.
for run_history in (hist, hist2):
    plot_train_val_data(
        run_history,
        labels=["loss", "accuracy", "precision", "recall"],
        validation=True,
    )


# ------------------------------------------------

In [None]:
# Sentence-BERT embeddings of the raw questions, classified with k-NN.
questions = data["Patient Question"].tolist()
distortions = data["Distortion"].tolist()

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
#sbert_model = SentenceTransformer('all-mpnet-base-v2')

questions_embeded = sbert_model.encode(questions)

# The fixed seed makes the split reproducible after a kernel restart.
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(
    questions_embeded, distortions, test_size=0.2, random_state=42
)

# NOTE(review): `p` is the Minkowski power parameter; with metric='euclidean'
# it presumably has no effect -- confirm and drop it.
classifier = KNeighborsClassifier(n_neighbors=7, p=13, metric='euclidean')
classifier.fit(X_train_bert,y_train_bert)

# A bare expression in the middle of a cell is not displayed, so print the score.
print("KNN test accuracy:", classifier.score(X_test_bert,y_test_bert))

# `predict` already returns one class label (0/1) per sample. Taking
# np.argmax of each scalar label always yields 0, which made the confusion
# matrix show an all-zero prediction column.
y_predicted = classifier.predict(X_test_bert)
predicted = np.asarray(y_predicted)
cm = confusion_matrix(y_test_bert, predicted)
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

In [None]:
# Same k-NN experiment, but on questions with stop words removed.
questions_stopword = list(map(remove_stop_words,questions))

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
#sbert_model = SentenceTransformer('all-mpnet-base-v2')

# Embed the stop-word-filtered text. The cell previously encoded the
# unfiltered `questions`, so the filtered variant was never evaluated.
questions_stopword_embeded = sbert_model.encode(questions_stopword)

# The fixed seed makes the split reproducible after a kernel restart.
X_train_sw_bert, X_test_sw_bert, y_train_sw_bert, y_test_sw_bert = train_test_split(
    questions_stopword_embeded, distortions, test_size=0.2, random_state=42
)

# NOTE(review): `p` is the Minkowski power parameter; with metric='euclidean'
# it presumably has no effect -- confirm and drop it.
classifier2 = KNeighborsClassifier(n_neighbors=7, p=13, metric='euclidean')
classifier2.fit(X_train_sw_bert,y_train_sw_bert)

# A bare expression in the middle of a cell is not displayed, so print the score.
print("KNN (stop words removed) test accuracy:", classifier2.score(X_test_sw_bert,y_test_sw_bert))

# `predict` already returns one class label (0/1) per sample. Taking
# np.argmax of each scalar label always yields 0.
y_predicted = classifier2.predict(X_test_sw_bert)
predicted = np.asarray(y_predicted)
cm = confusion_matrix(y_test_sw_bert, predicted)
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

In [None]:
# Small feed-forward classifier on top of the Sentence-BERT embeddings.
model = Sequential()
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(), metrics=METRICS)

# Build with an unspecified batch dimension. Passing the full array shape
# (n_samples, dim) would bake the number of training samples in as the
# batch size.
input_shape = (None, X_train_bert.shape[1])
model.build(input_shape)

model.summary()

hist = model.fit(
    X_train_bert,
    np.array(y_train_bert),
    validation_data=(X_test_bert, np.array(y_test_bert)),
    epochs=10,
)

In [None]:
# Training vs. validation curves for the most recent run stored in `hist`.
plot_train_val_data(
    hist,
    labels=["loss", "accuracy", "precision", "recall"],
    validation=True,
)

In [None]:
# The network ends in a single sigmoid unit, so `predict` yields one
# probability per sample. np.argmax over a one-element row is always 0,
# which made every prediction "class 0"; threshold the probability instead.
y_predicted = model.predict(X_test_bert)
predicted = (np.asarray(y_predicted).ravel() > 0.5).astype(int)
cm = confusion_matrix(y_test_bert, predicted)
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')