In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,f1_score
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import re
from sklearn.neighbors import KNeighborsClassifier
from keras.preprocessing.text import Tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
therapis_responses = pd.read_csv("data/Therapist_responses.csv",delimiter=",")
annotated_data = pd.read_csv("data/Annotated_data.csv",delimiter=",")
annotated_data_copy = pd.merge(therapis_responses,annotated_data,on='Id_Number')
annotated_data_copy = annotated_data_copy.drop(['Question'],axis=1)

In [3]:
def label_distorsion_binary(row):
    if row["Dominant Distortion"] == "No Distortion":
        return 0
    return 1

annotated_data_copy["Distortion"] = annotated_data_copy.apply(label_distorsion_binary, axis=1)

In [4]:
data_list = annotated_data_copy.values.tolist()
# 0 = ANSWER   ,   1 = ID   ,   2 = QUESTION   ,   3 = DISTORTED PART   ,
# 4 = DOMINANT DISTORTION   ,   5 = SECONDARY DISTORTION   ,   6 = DISTORTION 1/0
data_list[0]

['Thank you for writing. You did nothing wrong! You did not put this woman in prison. She did it to herself. She befriended, manipulated and abused a vulnerable 14-year-old (actually, two young teens). She is a sexual offender who was grooming you, not loving you. She belongs in jail.This was never an equal or appropriate relationship. As a teacher, she used her position of authority and the fact you were needy and looked up to her as means to draw you in. She was so good at gaslighting you that you believed and still believe that her abuse was love. She did all the classic moves of an abuser: She gained control of you by making you think you were special. She isolated you, making you more and more dependent on her. She created a relationship where you were always on eggshells, trying not to say or do anything that would cause a fight. She created fights anyway and then made you feel like you were at fault. Apologies and presents followed — which only confused you more. Any time that y

In [5]:
questions = [' '.join((re.sub('[^a-zA-Z]', ' ',el[2].lower())).split()) for el in data_list]
distortions = [el[6] for el in data_list]

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
#sbert_model = SentenceTransformer('all-mpnet-base-v2')

questions_embeded = sbert_model.encode(questions)

X_train, X_test, y_train, y_test = train_test_split(questions_embeded, distortions, test_size=0.2)




In [20]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [17]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense


METRICS = [
	tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]

model = Sequential() 
model.add(tf.keras.layers.Dense(128, activation='relu',input_shape=(768,)))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(), metrics=METRICS)


model.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 128)               98432     
                                                                 
 dropout_10 (Dropout)        (None, 128)               0         
                                                                 
 dense_17 (Dense)            (None, 128)               16512     
                                                                 
 dropout_11 (Dropout)        (None, 128)               0         
                                                                 
 dense_18 (Dense)            (None, 1)                 129       
                                                                 
Total params: 115,073
Trainable params: 115,073
Non-trainable params: 0
_________________________________________________________________


In [21]:
hist = model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
distortions.count(0)

933

In [26]:
distortions.count(1)

1597

In [60]:
questions = [el[2] for el in data_list]
distortions = [el[6] for el in data_list]

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
#sbert_model = SentenceTransformer('all-mpnet-base-v2')

questions_embeded = sbert_model.encode(questions)

X_train, X_test, y_train, y_test = train_test_split(questions_embeded, distortions, test_size=0.2)



In [59]:
classifiers = []
for i in range(1,100):
	classifier = KNeighborsClassifier(n_neighbors=i, p=13, metric='euclidean')
	classifier.fit(X_train,y_train)
	classifiers.append(classifier)
	classifier = KNeighborsClassifier(n_neighbors=i, p=13, metric='cosine')
	classifier.fit(X_train,y_train)
	classifiers.append(classifier)
 
for model in classifiers:
    pred = model.predict(X_test)
    print(f"acc {model.n_neighbors}/{model.metric}: ",accuracy_score(y_test,pred))

acc 1/euclidean:  0.6264822134387352
acc 1/cosine:  0.6284584980237155
acc 2/euclidean:  0.6067193675889329
acc 2/cosine:  0.6007905138339921
acc 3/euclidean:  0.6600790513833992
acc 3/cosine:  0.6600790513833992
acc 4/euclidean:  0.6403162055335968
acc 4/cosine:  0.658102766798419
acc 5/euclidean:  0.6798418972332015
acc 5/cosine:  0.6640316205533597
acc 6/euclidean:  0.6640316205533597
acc 6/cosine:  0.66600790513834
acc 7/euclidean:  0.658102766798419
acc 7/cosine:  0.6640316205533597
acc 8/euclidean:  0.6600790513833992
acc 8/cosine:  0.6620553359683794
acc 9/euclidean:  0.6699604743083004
acc 9/cosine:  0.6699604743083004
acc 10/euclidean:  0.6778656126482213
acc 10/cosine:  0.6758893280632411
acc 11/euclidean:  0.6679841897233202
acc 11/cosine:  0.6699604743083004
acc 12/euclidean:  0.66600790513834
acc 12/cosine:  0.6600790513833992
acc 13/euclidean:  0.6620553359683794
acc 13/cosine:  0.6620553359683794
acc 14/euclidean:  0.6640316205533597
acc 14/cosine:  0.6600790513833992
ac

In [61]:
classifiers = []
for i in range(1,100):
	classifier = KNeighborsClassifier(n_neighbors=i, p=13, metric='euclidean')
	classifier.fit(X_train,y_train)
	classifiers.append(classifier)
	classifier = KNeighborsClassifier(n_neighbors=i, p=13, metric='cosine')
	classifier.fit(X_train,y_train)
	classifiers.append(classifier)
 
for model in classifiers:
    pred = model.predict(X_test)
    print(f"acc {model.n_neighbors}/{model.metric}: ",accuracy_score(y_test,pred))

acc 1/euclidean:  0.6245059288537549
acc 1/cosine:  0.616600790513834
acc 2/euclidean:  0.5790513833992095
acc 2/cosine:  0.5830039525691699
acc 3/euclidean:  0.6304347826086957
acc 3/cosine:  0.6245059288537549
acc 4/euclidean:  0.6245059288537549
acc 4/cosine:  0.6284584980237155
acc 5/euclidean:  0.6541501976284585
acc 5/cosine:  0.6482213438735178
acc 6/euclidean:  0.6403162055335968
acc 6/cosine:  0.642292490118577
acc 7/euclidean:  0.642292490118577
acc 7/cosine:  0.6541501976284585
acc 8/euclidean:  0.6442687747035574
acc 8/cosine:  0.6561264822134387
acc 9/euclidean:  0.6462450592885376
acc 9/cosine:  0.6442687747035574
acc 10/euclidean:  0.6363636363636364
acc 10/cosine:  0.6462450592885376
acc 11/euclidean:  0.6403162055335968
acc 11/cosine:  0.6363636363636364
acc 12/euclidean:  0.6343873517786561
acc 12/cosine:  0.6343873517786561
acc 13/euclidean:  0.6442687747035574
acc 13/cosine:  0.6403162055335968
acc 14/euclidean:  0.6462450592885376
acc 14/cosine:  0.642292490118577


In [75]:
def remove_stop_words(text):
    text = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word.isalpha() and not word in stop_words]
    return ' '.join(text)

texts = [el[2] for el in data_list]

distortions = [el[6] for el in data_list]
questions = list(map(remove_stop_words,texts))

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
#sbert_model = SentenceTransformer('all-mpnet-base-v2')

questions_embeded = sbert_model.encode(questions)

X_train, X_test, y_train, y_test = train_test_split(questions_embeded, distortions, test_size=0.2)



In [76]:
classifiers = []
for i in range(1,100):
	classifier = KNeighborsClassifier(n_neighbors=i, p=13, metric='euclidean')
	classifier.fit(X_train,y_train)
	classifiers.append(classifier)
	classifier = KNeighborsClassifier(n_neighbors=i, p=13, metric='cosine')
	classifier.fit(X_train,y_train)
	classifiers.append(classifier)
 
for model in classifiers:
    pred = model.predict(X_test)
    print(f"acc {model.n_neighbors}/{model.metric}: ",accuracy_score(y_test,pred))

acc 1/euclidean:  0.6640316205533597
acc 1/cosine:  0.6719367588932806
acc 2/euclidean:  0.6225296442687747
acc 2/cosine:  0.6245059288537549
acc 3/euclidean:  0.6699604743083004
acc 3/cosine:  0.6778656126482213
acc 4/euclidean:  0.6818181818181818
acc 4/cosine:  0.6877470355731226
acc 5/euclidean:  0.6719367588932806
acc 5/cosine:  0.6719367588932806
acc 6/euclidean:  0.6956521739130435
acc 6/cosine:  0.6857707509881423
acc 7/euclidean:  0.6818181818181818
acc 7/cosine:  0.6778656126482213
acc 8/euclidean:  0.7015810276679841
acc 8/cosine:  0.6818181818181818
acc 9/euclidean:  0.6679841897233202
acc 9/cosine:  0.6620553359683794
acc 10/euclidean:  0.66600790513834
acc 10/cosine:  0.6798418972332015
acc 11/euclidean:  0.66600790513834
acc 11/cosine:  0.6620553359683794
acc 12/euclidean:  0.6837944664031621
acc 12/cosine:  0.6837944664031621
acc 13/euclidean:  0.6699604743083004
acc 13/cosine:  0.6778656126482213
acc 14/euclidean:  0.6818181818181818
acc 14/cosine:  0.6778656126482213


In [31]:
predicted

array([[0.4849271 ],
       [0.52953875],
       [0.8346922 ],
       [0.62169254],
       [0.7272236 ],
       [0.7676399 ],
       [0.96699345],
       [0.6933755 ],
       [0.6765701 ],
       [0.8144874 ],
       [0.45595375],
       [0.728874  ],
       [0.7913272 ],
       [0.9781425 ],
       [0.78560567],
       [0.5860719 ],
       [0.96007365],
       [0.7512484 ],
       [0.89301556],
       [0.62584496],
       [0.8760661 ],
       [0.00391083],
       [0.55173826],
       [0.9548556 ],
       [0.8965072 ],
       [0.8040059 ],
       [0.6184691 ],
       [0.77206016],
       [0.71879756],
       [0.9488163 ],
       [0.44598526],
       [0.63923395],
       [0.60754454],
       [0.24079126],
       [0.5175791 ],
       [0.85308504],
       [0.5913644 ],
       [0.9240685 ],
       [0.25043783],
       [0.99402255],
       [0.9916551 ],
       [0.37664092],
       [0.93840325],
       [0.9803491 ],
       [0.7134445 ],
       [0.14731732],
       [0.00482014],
       [0.495

In [30]:
predicted = model.predict(X_test)

#print("acc: ", accuracy_score(y_test, np.array(predicted)))
#print("f1: ",f1_score(y_test, predicted,average=None))

cm = confusion_matrix(y_test,predicted)
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()



ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [74]:
distortions.count(0)

933