In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from matplotlib import pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

In [None]:
def plot_train_val_data(model,labels,validation=False):
    """Plot one training curve per metric, side by side in a single figure.

    Args:
        model: Either an object exposing a ``history`` mapping from metric
            name to per-epoch values (read as ``model.history[name]``), or a
            list, which is first combined into one such mapping by
            ``merge_history``.
        labels: Sequence of metric names; one subplot is drawn per name.
        validation: When truthy, the ``"val_" + name`` series of every metric
            is drawn as a dotted line on the same subplot.

    Raises:
        ValueError: If ``labels`` is empty (there would be nothing to plot).

    Note:
        A metric name (or its ``val_`` counterpart) missing from the history
        fails on the lookup itself -- presumably with ``KeyError``, assuming
        the history is a plain dict.
    """
    if len(labels) == 0:
        raise ValueError("labels must contain at least one metric name")

    # merge_history is not defined in this cell; it is expected to be
    # provided elsewhere in the notebook before a list is passed in.
    records = merge_history(model) if isinstance(model, list) else model.history

    to_plot = [records[el] for el in labels]
    val_to_plot = [records["val_" + el] for el in labels] if validation else None

    epochs = range(1, len(to_plot[0]) + 1)

    # squeeze=False always returns a 2-D grid of axes, so a single metric
    # no longer yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(1, len(labels), figsize=(20, 5), squeeze=False)
    for i, (ax, label) in enumerate(zip(axes[0], labels)):
        ax.plot(epochs, to_plot[i], '-', label=label)
        if validation:
            ax.plot(epochs, val_to_plot[i], ':', label="Validation " + label)
        ax.set_title(label, fontsize=20)
        ax.legend(loc='lower right')

    # Lay out after drawing so titles and legends are taken into account.
    fig.tight_layout()
    plt.show()

In [None]:
# Read both CSV exports, join them on the shared Id_Number column and
# discard the "Question" column from the joined frame.
therapis_responses = pd.read_csv("../data/Therapist_responses.csv", sep=",")
annotated_data = pd.read_csv("../data/Annotated_data.csv", sep=",")
annotated_data_copy = (
    therapis_responses
    .merge(annotated_data, on='Id_Number')
    .drop(columns=['Question'])
)

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch every NLTK resource this notebook uses so it runs on a fresh
# environment: the stop-word corpus plus the Punkt models that word_tokenize
# loads. Newer NLTK releases look the models up as "punkt_tab", older ones as
# "punkt"; requesting a name the installed version does not know only prints
# a message.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

# Stop-word sets keyed by language, filled on first use so the corpus is
# read once instead of once per cleaned sentence.
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stop_words(text):
    """Lower-case ``text`` and strip stop words and non-alphabetic tokens.

    Args:
        text: The string to clean.

    Returns:
        The tokens produced by ``word_tokenize`` on the lower-cased text that
        are purely alphabetic and not English stop words, joined by single
        spaces. An input with no surviving token yields ``""``.
    """
    stop_words = _get_stop_words()
    tokens = word_tokenize(text.lower())
    return ' '.join(tok for tok in tokens if tok.isalpha() and tok not in stop_words)

# Pull the raw patient questions out of the frame and clean each of them.
texts = annotated_data_copy["Patient Question"].tolist()
sentences = [remove_stop_words(question) for question in texts]

def label_distorsion_binary(row):
    """Turn a row's dominant distortion into a binary label.

    Returns 0 when ``row["Dominant Distortion"]`` equals "No Distortion"
    and 1 for any other value.
    """
    return 0 if row["Dominant Distortion"] == "No Distortion" else 1

# Compute the binary target one row at a time, store it as a new column and
# keep a plain-list copy for the train/test split.
annotated_data_copy["Distortion"] = annotated_data_copy.apply(
    label_distorsion_binary,
    axis="columns",
)
distortions = annotated_data_copy["Distortion"].to_list()

In [None]:
# Fix the shuffling seed so the train/test partition, and every metric
# computed from it, is identical on each "Restart & Run All".
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    sentences, distortions, random_state=SPLIT_SEED
)

In [None]:
# Model handles for the BERT text preprocessor and the encoder that goes with it.
preprocess_link = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
encoder_link = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4"

# Wrap both handles as Keras layers (preprocessor first, then encoder).
preprocessor, encoder = hub.KerasLayer(preprocess_link), hub.KerasLayer(encoder_link)

In [None]:
def get_embedings_sentences(sentences):
    """Return the encoder's "pooled_output" entry for ``sentences``.

    The input is passed through the notebook-level ``preprocessor`` and the
    result is fed to the notebook-level ``encoder``.
    """
    encoder_outputs = encoder(preprocessor(sentences))
    # "sequence_output" is the alternative key the original author left
    # commented out here.
    return encoder_outputs["pooled_output"]
 
# Smoke test: embed one sentence; the returned value is shown as the cell output.
get_embedings_sentences(["Hello fellow humans!"]) 

In [None]:
# NOTE: the former `base_model = MobileNet(weights='imagenet')` / `x = base_model.output`
# lines were dropped: MobileNet is never imported in the visible notebook
# (NameError on a fresh kernel) and nothing consumed its output.

# One raw string per example; the preprocessor turns it into encoder inputs.
text_input = tf.keras.layers.Input(shape=(),dtype=tf.string,name="text")
preprocessed_text = preprocessor(text_input)
outputs = encoder(preprocessed_text)

# Classification head on top of the encoder's pooled output:
# Dense(10, relu) -> Dropout(0.25) -> Dense(1, sigmoid).
layer = tf.keras.layers.Dense(10,activation="relu")(outputs["pooled_output"])
layer = tf.keras.layers.Dropout(0.25,name="dropout")(layer)
preds = tf.keras.layers.Dense(1,activation="sigmoid",name="output")(layer)

# The model has to end at the sigmoid `preds` tensor. Ending at `layer`
# (the 10-unit dropout output) would leave the "output" layer out of the
# graph and would not give one probability per sentence.
model = tf.keras.Model(inputs=[text_input],outputs=[preds])

model.summary()

In [None]:
# Metrics reported during training; the names given here are the history
# keys that the plotting cell looks up later.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=METRICS,
)

In [None]:
# Train for 10 epochs, passing (X_test, Y_test) as the validation data.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=10,
)

In [None]:
# Draw the training and validation curves of the three tracked metrics.
plot_train_val_data(
    history,
    ["accuracy", "precision", "recall"],
    validation=True,
)

In [None]:
# Flatten the model outputs and threshold them at 0.5 to get hard 0/1 labels.
y_predicted = np.where(model.predict(X_test).flatten() > 0.5, 1, 0)

# Confusion matrix of the true labels against the thresholded predictions,
# drawn as an annotated heatmap with integer cell counts.
cm = confusion_matrix(Y_test, y_predicted)
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')