In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from matplotlib import pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

In [None]:
# Load the therapist responses and the annotated patient questions, then join
# the two tables on their shared "Id_Number" column.
therapis_responses = pd.read_csv("data/Therapist_responses.csv")
annotated_data = pd.read_csv("data/Annotated_data.csv")
# Working copy used for labelling. It is cloned in memory instead of parsing the
# same CSV a second time, so it is guaranteed to match `annotated_data` while
# leaving that frame untouched by later cells.
annotated_data_copy = annotated_data.copy()
merged_data = pd.merge(therapis_responses, annotated_data, on='Id_Number')

In [None]:
def label_distorsion_binary(row):
    """Map one annotated row to a binary distortion label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing a
            "Dominant Distortion" entry.

    Returns:
        0 when the dominant distortion is exactly "No Distortion", otherwise 1.
    """
    has_distortion = row["Dominant Distortion"] != "No Distortion"
    return int(has_distortion)

# Attach the binary "Distortion" target, then discard the raw annotation columns
# it was derived from.
# NOTE(review): this cell rebinds `annotated_data_copy`, so re-running it on its
# own fails (the "Dominant Distortion" column is already gone) — re-run the
# data-loading cell first.
annotated_data_copy = annotated_data_copy.assign(
    Distortion=lambda frame: frame.apply(label_distorsion_binary, axis=1)
).drop(
    columns=['Distorted part', 'Dominant Distortion', 'Secondary Distortion (Optional)']
)

In [None]:
# Summary statistics per label (0 = "No Distortion", 1 = any distortion).
annotated_data_copy.groupby(by="Distortion").describe()

In [None]:
# Hold out a test split of the patient questions and their binary labels.
# A fixed seed makes the split (and every downstream metric) reproducible across
# kernel restarts; stratifying on the label keeps the class ratio the same in
# the train and test portions.
X_train, X_test, Y_train, Y_test = train_test_split(
    annotated_data_copy["Patient Question"],
    annotated_data_copy["Distortion"],
    random_state=42,
    stratify=annotated_data_copy["Distortion"],
)

In [None]:
# Text preprocessing layer that pairs with the BERT encoder below.
preprocess_link = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
preprocessor = hub.KerasLayer(preprocess_link)

# BERT encoder. `trainable=False` is the hub.KerasLayer default, spelled out
# here so it is obvious the encoder weights stay frozen during training.
encoder_link = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4"
encoder = hub.KerasLayer(encoder_link, trainable=False)

In [None]:
def get_embedings_sentences(sentences):
    """Embed a batch of raw sentences with the preprocessor/encoder pair.

    Args:
        sentences: Batch of raw strings to feed to `preprocessor`.

    Returns:
        The "pooled_output" entry of the encoder's output for the batch.
    """
    # The original author also experimented with the encoder's "sequence_output"
    # entry; only the pooled output is returned here.
    encoder_outputs = encoder(preprocessor(sentences))
    return encoder_outputs["pooled_output"]
 
# Smoke test: embed one sentence to confirm the hub layers load and run.
get_embedings_sentences(sentences=["Hello fellow humans!"])

In [None]:

# BERT backbone: raw string input -> preprocessor -> encoder outputs.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
preprocessed_text = preprocessor(text_input)
outputs = encoder(preprocessed_text)

# Classification head: light dropout on the pooled output, then a single
# sigmoid unit for the binary label.
pooled_output = outputs["pooled_output"]
dropout_layer = tf.keras.layers.Dropout(rate=0.05, name="dropout")
output_layer = tf.keras.layers.Dense(units=1, activation="sigmoid", name="output")
layer = output_layer(dropout_layer(pooled_output))

# Assemble the end-to-end model (string in, probability out) and show it.
model = tf.keras.Model(inputs=[text_input], outputs=[layer])
model.summary()

In [None]:
# Metrics tracked during training and evaluation; the `name` of each one is the
# key under which it appears in `history.history`.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=METRICS,
)

In [None]:
# Fit for 5 epochs, tracking metrics on the held-out split after each epoch.
# NOTE(review): the test split doubles as validation data here, so the later
# `model.evaluate` on the same split is not an independent estimate if these
# validation curves are used to tune the model.
history = model.fit(
    x=X_train,
    y=Y_train,
    epochs=5,
    validation_data=(X_test, Y_test),
)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
 
acc = history.history['accuracy']
val = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)
 
plt.plot(epochs, acc, '-', label='Training accuracy')
plt.plot(epochs, val, '-', label='Validation accuracy')
plt.plot(epochs,loss,':',label="Training loss")
plt.plot(epochs,val_loss,':',label="Validation loss")
plt.title('Training and Validation Accuracy/Loss')
plt.xlabel('Epoch')
plt.legend(loc='lower right')
plt.plot()

In [None]:
# Training-only loss and accuracy curves.
train_loss = history.history['loss']
train_acc = history.history['accuracy']
# 1-based epoch numbers so the x-axis does not start at 0.
epoch_numbers = range(1, len(train_loss) + 1)

fig, ax = plt.subplots()
# Line handles get their own names instead of rebinding `loss` / `acc`, which
# hold the metric lists in the previous plotting cell.
loss_line, = ax.plot(epoch_numbers, train_loss, label='loss')
acc_line, = ax.plot(epoch_numbers, train_acc, label='acc')
ax.legend([loss_line, acc_line], ['Train Loss', 'Train Acc'])
# `history.history` stores one value per epoch, so the axis is epochs, not batches.
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss & acc')
plt.show()

In [None]:
# Loss and compiled metrics on the held-out split.
model.evaluate(x=X_test, y=Y_test)

In [None]:
# Predicted probabilities for the held-out questions, flattened to 1-D.
predicted_probabilities = model.predict(X_test).flatten()
# Hard 0/1 labels: anything strictly above 0.5 counts as a distortion.
y_predicted = np.where(predicted_probabilities > 0.5, 1, 0)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(Y_test, y_predicted)
ax = sn.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')