In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import re
import numpy as np
import pandas as pd
from chart_studio import plotly as py
import plotly.figure_factory as ff
from scipy import stats

import gensim
import json


import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import itertools

import keras

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, Activation, GlobalAveragePooling1D, Flatten, Concatenate, Conv1D, MaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate
from keras.optimizers import SGD, RMSprop, Adagrad, Adam
from keras.preprocessing.text import one_hot, text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.vis_utils import plot_model

import fnmatch

import warnings

import string
from pathlib import Path
from random import shuffle
from ast import literal_eval

warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:

WINDOWS_SIZE = 10
labels=['none','mild','moderate','moderately severe', 'severe']
num_classes = len(labels)

In [4]:
def plot_acc(history, title="Model Accuracy"):
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='upper left')
    plt.show()
    
def plot_loss(history, title="Model Loss"):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='upper right')
    plt.show()
    
def plot_compare_losses(history1, history2, name1="Red 1", name2="Red 2", title="Graph title"):
    plt.plot(history1.history['loss'], color="green")
    plt.plot(history1.history['val_loss'], 'r--', color="green")
    plt.plot(history2.history['loss'], color="blue")
    plt.plot(history2.history['val_loss'], 'r--', color="blue")
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train ' + name1, 'Val ' + name1, 
                'Train ' + name2, 'Val ' + name2],
               loc='upper right')
    plt.show()
    
def plot_compare_accs(history1, history2, name1="Red 1",
                      name2="Red 2", title="Graph title"):
    """Compara accuracies de dos entrenamientos con nombres name1 y name2"""
    plt.plot(history1.history['acc'], color="green")
    plt.plot(history1.history['val_acc'], 'r--', color="green")
    plt.plot(history2.history['acc'], color="blue")
    plt.plot(history2.history['val_acc'], 'r--', color="blue")
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train ' + name1, 'Val ' + name1, 
                'Train ' + name2, 'Val ' + name2], 
               loc='lower right')
    plt.show()

def plot_compare_multiple_metrics(history_array, names, colors, title="Graph title", metric='acc'):  
    legend = []
    for i in range(0, len(history_array)):
        plt.plot(history_array[i].history[metric], color=colors[i])
        plt.plot(history_array[i].history['val_' + metric], 'r--', color=colors[i])
        legend.append('Train ' + names[i])
        legend.append('Val ' + names[i])
    
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')   
    plt.axis
    plt.legend(legend, 
               loc='lower right')
    plt.show()

In [5]:
def transcripts_to_dataframe(directory):
    rows_list = []
        
    filenames = os.listdir(directory)
    
    if ".DS_Store" in filenames:
        filenames.remove(".DS_Store")
        
    for filename in filenames:
        print(filename)
        transcript_path = os.path.join(directory, filename)
        transcript = pd.read_csv(transcript_path, sep='\t')
        m = re.search("(\d{3})_TRANSCRIPT.csv", filename)
        if m:
            person_id = m.group(1)
            p = {}
            question = ""
            answer = ""
            lines = len(transcript)
            for i in range(0, lines):
                row = transcript.iloc[i]
                if (row["speaker"] == "Ellie") or (i == lines - 1):
                    p["personId"] = person_id
                    if "(" in str(question):
                        question = question[question.index("(") + 1:question.index(")")]
                    p["question"] = question
                    p["answer"] = answer
                    if question != "":
                        rows_list.append(p)
                    p = {}
                    answer = ""
                    question = row["value"]
                else:
                    answer = str(answer) + " " + str(row["value"])

    all_participants = pd.DataFrame(rows_list, columns=['personId', 'question', 'answer'])
    all_participants.to_csv(directory + 'all.csv', sep=',')
    print("File was created")
    return all_participants

In [6]:
data_path = "F:/DIAC-WOZ/transcripts/"
transcripts_to_dataframe(data_path) 
all_participants = pd.read_csv(data_path + 'all.csv', sep=',')

300_TRANSCRIPT.csv
301_TRANSCRIPT.csv
302_TRANSCRIPT.csv
303_TRANSCRIPT.csv
304_TRANSCRIPT.csv
305_TRANSCRIPT.csv
306_TRANSCRIPT.csv
307_TRANSCRIPT.csv
308_TRANSCRIPT.csv
309_TRANSCRIPT.csv
310_TRANSCRIPT.csv
311_TRANSCRIPT.csv
312_TRANSCRIPT.csv
313_TRANSCRIPT.csv
314_TRANSCRIPT.csv
315_TRANSCRIPT.csv
316_TRANSCRIPT.csv
317_TRANSCRIPT.csv
318_TRANSCRIPT.csv
319_TRANSCRIPT.csv
320_TRANSCRIPT.csv
321_TRANSCRIPT.csv
322_TRANSCRIPT.csv
323_TRANSCRIPT.csv
324_TRANSCRIPT.csv
325_TRANSCRIPT.csv
326_TRANSCRIPT.csv
327_TRANSCRIPT.csv
328_TRANSCRIPT.csv
329_TRANSCRIPT.csv
330_TRANSCRIPT.csv
331_TRANSCRIPT.csv
332_TRANSCRIPT.csv
333_TRANSCRIPT.csv
334_TRANSCRIPT.csv
335_TRANSCRIPT.csv
336_TRANSCRIPT.csv
337_TRANSCRIPT.csv
338_TRANSCRIPT.csv
339_TRANSCRIPT.csv
340_TRANSCRIPT.csv
341_TRANSCRIPT.csv
343_TRANSCRIPT.csv
344_TRANSCRIPT.csv
345_TRANSCRIPT.csv
346_TRANSCRIPT.csv
347_TRANSCRIPT.csv
348_TRANSCRIPT.csv
349_TRANSCRIPT.csv
350_TRANSCRIPT.csv
351_TRANSCRIPT.csv
352_TRANSCRIPT.csv
353_TRANSCRI

In [7]:
all_participants.head()

Unnamed: 0.1,Unnamed: 0,personId,question,answer
0,0,300,hi i'm ellie thanks for coming in today,
1,1,300,i was created to talk to people in a safe and ...,
2,2,300,think of me as a friend i don't judge i can't ...,
3,3,300,i'm here to learn about people and would love ...,
4,4,300,i'll ask a few questions to get us started and...,


In [8]:
all_participants.columns =  ['index','personId', 'question', 'answer']
all_participants = all_participants.astype({"index": int, "personId": float, "question": str, "answer": str })

In [9]:
def text_to_wordlist(text, remove_stopwords=True, stem_words=False):    
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [wordnet_lemmatizer.lemmatize(w) for w in text if not w in stops ]
        text = [w for w in text if w != "nan" ]
    else:
        text = [wordnet_lemmatizer.lemmatize(w) for w in text]
        text = [w for w in text if w != "nan" ]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    
    text = re.sub(r"\<", " ", text)
    text = re.sub(r"\>", " ", text)
    
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\welcome\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
all_participants_mix = all_participants.copy()
all_participants_mix['answer'] = all_participants_mix.apply(lambda row: text_to_wordlist(row.answer).split(), axis=1)

In [12]:
all_participants_mix_stopwords = all_participants.copy()
all_participants_mix_stopwords['answer'] = all_participants_mix_stopwords.apply(lambda row: text_to_wordlist(row.answer, remove_stopwords=False).split(), axis=1)

In [13]:
words = [w for w in all_participants_mix['answer'].tolist()]
words = set(itertools.chain(*words))
vocab_size = len(words)

In [14]:
words_stop = [w for w in all_participants_mix_stopwords['answer'].tolist()]
words_stop = set(itertools.chain(*words_stop))
vocab_size_stop = len(words_stop)

In [15]:
windows_size = WINDOWS_SIZE
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(all_participants_mix['answer'])
tokenizer.fit_on_sequences(all_participants_mix['answer'])

all_participants_mix['t_answer'] = tokenizer.texts_to_sequences(all_participants_mix['answer'])
print(all_participants_mix)

       index  personId                                           question  \
0          0     300.0            hi i'm ellie thanks for coming in today   
1          1     300.0  i was created to talk to people in a safe and ...   
2          2     300.0  think of me as a friend i don't judge i can't ...   
3          3     300.0  i'm here to learn about people and would love ...   
4          4     300.0  i'll ask a few questions to get us started and...   
5          5     300.0                            how are you doing today   
6          6     300.0                                        that's good   
7          7     300.0                      where are you from originally   
8          8     300.0                                             really   
9          9     300.0                              why'd you move to l_a   
10        10     300.0                                how do you like l_a   
11        11     300.0     what are some things you really like about l_a   

In [16]:
windows_size = WINDOWS_SIZE
tokenizer = Tokenizer(num_words=vocab_size_stop)
tokenizer.fit_on_texts(all_participants_mix_stopwords['answer'])
tokenizer.fit_on_sequences(all_participants_mix_stopwords['answer'])

all_participants_mix_stopwords['t_answer'] = tokenizer.texts_to_sequences(all_participants_mix_stopwords['answer'])
all_participants_mix_stopwords.head()

Unnamed: 0,index,personId,question,answer,t_answer
0,0,300.0,hi i'm ellie thanks for coming in today,[],[]
1,1,300.0,i was created to talk to people in a safe and ...,[],[]
2,2,300.0,think of me as a friend i don't judge i can't ...,[],[]
3,3,300.0,i'm here to learn about people and would love ...,[],[]
4,4,300.0,i'll ask a few questions to get us started and...,[],[]


In [17]:

word_index = tokenizer.word_index
word_size = len(word_index)
print(word_index["sad"])

652


In [18]:
windows_size = WINDOWS_SIZE
cont = 0
word_index = tokenizer
phrases_lp_stop = pd.DataFrame(columns=['personId','answer', 't_answer'])
answers = all_participants_mix_stopwords.groupby('personId').agg('sum', axis=1)

print(answers["answer"])

personId
300.0    [good, atlanta, georgia, um, my, parent, are, ...
301.0    [thank, you, mmm, k, i, am, doing, good, thank...
302.0    [i, am, fine, how, about, yourself, i, am, fro...
303.0    [okay, how, bout, yourself, here, in, californ...
304.0    [i, am, doing, good, um, from, los, angeles, c...
305.0    [i, am, doing, alright, uh, originally, i, am,...
306.0    [fine, uh, colorado, mhm, uh, career, career, ...
307.0    [laughter, um, moscow, um, my, family, moved, ...
308.0    [los, angeles, california, yes, um, the, south...
309.0    [laughter, laughter, yeah, laughter, laughter,...
310.0    [yes, it, okay, laughter, fine, laughter, i, l...
311.0    [yes, okay, uh, when, would, i, move, to, l, a...
312.0    [yes, fine, how, about, you, here, yes, the, w...
313.0    [sure, uh, okay, i, guess, not, bad, not, good...
314.0    [yes, quite, well, feel, good, los, angeles, c...
315.0    [alright, yes, okay, and, you, inglewood, cali...
316.0    [yes, i, am, fine, new, york, uh, for,

In [19]:
for p in answers.iterrows():      
    words = p[1]["answer"]
    size = len(words)
    word_tokens = p[1]["t_answer"]
 
    for i in range(size):
        sentence = words[i:min(i+windows_size,size)]  
        tokens = word_tokens[i:min(i+windows_size,size)]  
        phrases_lp_stop.loc[cont] = [p[0], sentence, tokens]
        cont = cont + 1

In [21]:
print(phrases_lp_stop)

        personId                                             answer  \
0          300.0  [good, atlanta, georgia, um, my, parent, are, ...   
1          300.0  [atlanta, georgia, um, my, parent, are, from, ...   
2          300.0  [georgia, um, my, parent, are, from, here, um,...   
3          300.0  [um, my, parent, are, from, here, um, i, love,...   
4          300.0  [my, parent, are, from, here, um, i, love, it, i]   
5          300.0  [parent, are, from, here, um, i, love, it, i, ...   
6          300.0   [are, from, here, um, i, love, it, i, like, the]   
7          300.0  [from, here, um, i, love, it, i, like, the, we...   
8          300.0  [here, um, i, love, it, i, like, the, weather, i]   
9          300.0  [um, i, love, it, i, like, the, weather, i, like]   
10         300.0  [i, love, it, i, like, the, weather, i, like, ...   
11         300.0  [love, it, i, like, the, weather, i, like, the...   
12         300.0  [it, i, like, the, weather, i, like, the, oppo...   
13    

In [25]:
windows_size = WINDOWS_SIZE
cont = 0
word_index = tokenizer
phrases_lp = pd.DataFrame(columns=['personId','answer', 't_answer'])
answers = all_participants_mix.groupby('personId').agg('sum', axis=1)

for p in answers.iterrows():      
    print(p)
    words = p[1]["answer"]
    size = len(words)
    word_tokens = p[1]["t_answer"]
 
    for i in range(size):
        sentence = words[i:min(i+windows_size,size)]  
        tokens = word_tokens[i:min(i+windows_size,size)]  
        phrases_lp.loc[cont] = [p[0], sentence, tokens]
        cont = cont + 1

(300.0, index                                                    3741
question    hi i'm ellie thanks for coming in todayi was c...
answer      [good, atlanta, georgia, um, parent, um, love,...
t_answer    [16, 1634, 1997, 1, 131, 1, 63, 5, 142, 5, 334...
Name: 300.0, dtype: object)
(301.0, index                                                    9625
question    hi i'm ellie thanks for coming in today i was ...
answer      [thank, mmm, k, i, am, good, thank, i, am, los...
t_answer    [173, 2246, 1199, 3, 6, 16, 173, 3, 6, 224, 23...
Name: 301.0, dtype: object)
(302.0, index                                                   18512
question    hi i'm ellie thanks for coming in todayi was c...
answer      [i, am, fine, i, am, los, angeles, california,...
t_answer    [3, 6, 283, 3, 6, 224, 238, 234, 153, 64, 1, 6...
Name: 302.0, dtype: object)
(303.0, index                                                   26092
question    hi i'm ellie thanks for coming in today i was ...
answer      [oka

(329.0, index                                                  208516
question    hi i'm ellie thanks for coming in todayi was c...
answer      [yes, pretty, well, new, york, uh, go, college...
t_answer    [39, 23, 15, 76, 420, 2, 21, 180, 15, 466, 155...
Name: 329.0, dtype: object)
(330.0, index                                                  245564
question    hi i'm ellie thanks for coming in todayi was c...
answer      [yes, i, am, okay, good, l, a, sunny, uh, near...
t_answer    [39, 3, 6, 64, 16, 112, 94, 1340, 2, 1165, 641...
Name: 330.0, dtype: object)
(331.0, index                                                  294321
question    hi i'm ellie thanks for coming in todayi was c...
answer      [yes, okay, connecticut, um, actor, laughter, ...
t_answer    [39, 64, 2771, 1, 1124, 8, 15, 245, 466, 898, ...
Name: 331.0, dtype: object)
(332.0, index                                                  286512
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes

(359.0, index                                                  473985
question    hi i'm ellie thanks for coming in todayi was c...
answer      [yes, laughter, uh, get, much, sleep, last, ni...
t_answer    [39, 8, 2, 18, 32, 77, 73, 145, 152, 667, 3, 6...
Name: 359.0, dtype: object)
(360.0, index                                                  218612
question    hi i'm ellie thanks for coming in todayi was c...
answer      [yes, i, am, good, inglewood, scenery, place, ...
t_answer    [39, 3, 6, 16, 2320, 864, 110, 398, 1, 63, 112...
Name: 360.0, dtype: object)
(361.0, index                                                  231082
question    hi i'm ellie thanks for coming in todayi was c...
answer      [yes, i, am, great, i, am, born, american, bor...
t_answer    [39, 3, 6, 84, 3, 6, 336, 544, 336, 2, 2803, 8...
Name: 361.0, dtype: object)
(362.0, index                                                  271025
question    hi i'm ellie thanks for coming in todayi was c...
answer      [yes

(388.0, index                                                  564186
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes, alright, passaic, new, jersey, yes, uh, ...
t_answer    [39, 422, 5628, 76, 2176, 39, 2, 212, 22, 58, ...
Name: 388.0, dtype: object)
(389.0, index                                                  660484
question    hi i'm ellie thanks for coming in today i was ...
answer      [sure, okay, small, town, outside, cinncinatti...
t_answer    [97, 64, 327, 650, 410, 5631, 1697, 138, 22, 5...
Name: 389.0, dtype: object)
(390.0, index                                                  652955
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes, i, am, pretty, well, um, stressful, circ...
t_answer    [39, 3, 6, 23, 15, 1, 570, 865, 40, 1, 1, 47, ...
Name: 390.0, dtype: object)
(391.0, index                                                  575475
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes

(419.0, index                                                  601083
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes, i, am, good, thank, um, born, mexico, ci...
t_answer    [39, 3, 6, 16, 173, 1, 336, 560, 230, 39, 1, 4...
Name: 419.0, dtype: object)
(420.0, index                                                  894567
question    hi i'm ellie thanks for coming in today i was ...
answer      [mm, yes, um, i, am, okay, chicago, um, back, ...
t_answer    [33, 39, 1, 3, 6, 64, 850, 1, 43, 1000, 22, 33...
Name: 420.0, dtype: object)
(421.0, index                                                  815430
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes, that, fine, i, am, okay, laughter, origi...
t_answer    [39, 20, 283, 3, 6, 64, 8, 531, 3, 6, 466, 898...
Name: 421.0, dtype: object)
(422.0, index                                                  684915
question    hi i'm ellie thanks for coming in today i was ...
answer      [hi,

(448.0, index                                                  647130
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes, pretty, good, oh, born, uh, brooklyn, ne...
t_answer    [39, 23, 16, 46, 336, 2, 6611, 76, 420, 231, 3...
Name: 448.0, dtype: object)
(449.0, index                                                  721264
question    hi i'm ellie thanks for coming in today i was ...
answer      [okay, yes, i, am, good, thank, laughter, orig...
t_answer    [64, 39, 3, 6, 16, 173, 8, 531, 224, 238, 131,...
Name: 449.0, dtype: object)
(450.0, index                                                  951560
question    hi i'm ellie thanks for coming in today i was ...
answer      [okay, i, am, okay, sure, i, am, well, reasona...
t_answer    [64, 3, 6, 64, 97, 3, 6, 15, 2112, 15, 3, 6, 1...
Name: 450.0, dtype: object)
(452.0, index                                                  885891
question    hi i'm ellie thanks for coming in today i was ...
answer      [mm,

(481.0, index                                                 1197225
question    hi i'm ellie thanks for coming in today i was ...
answer      [yeah, um, good, moderate, los, angeles, yeah,...
t_answer    [11, 1, 16, 3087, 224, 238, 11, 3, 6, 1, 19, 3...
Name: 481.0, dtype: object)
(482.0, index                                                 1033972
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes, i, am, fine, born, texas, moved, califor...
t_answer    [39, 3, 6, 283, 336, 860, 245, 234, 138, 406, ...
Name: 482.0, dtype: object)
(483.0, index                                                 1239228
question    hi i'm ellie thanks for coming in today i was ...
answer      [sure, laughter, mm, good, guess, new, orleans...
t_answer    [97, 8, 33, 16, 35, 76, 1777, 2105, 2, 75, 418...
Name: 483.0, dtype: object)
(484.0, index                                                 1088738
question    hi i'm ellie thanks for coming in today i was ...
answer      [yes

In [22]:

phrases_lp.head()
phrases_lp.to_csv(data_path + 'phrases_lp.csv', sep='\t')
print("File was created")

NameError: name 'phrases_lp' is not defined

In [58]:
phrases_lp["t_answer"] = pad_sequences(phrases_lp["t_answer"], value=0, padding="post", maxlen=windows_size).tolist()
phrases_lp.head()

Unnamed: 0,personId,answer,t_answer
0,302.0,"[i, am, fine, i, am, los, angeles, california,...","[4, 6, 198, 4, 6, 234, 284, 152, 199, 67]"
1,302.0,"[am, fine, i, am, los, angeles, california, pa...","[6, 198, 4, 6, 234, 284, 152, 199, 67, 2]"
2,302.0,"[fine, i, am, los, angeles, california, part, ...","[198, 4, 6, 234, 284, 152, 199, 67, 2, 58]"
3,302.0,"[i, am, los, angeles, california, part, okay, ...","[4, 6, 234, 284, 152, 199, 67, 2, 58, 33]"
4,302.0,"[am, los, angeles, california, part, okay, um,...","[6, 234, 284, 152, 199, 67, 2, 58, 33, 828]"


In [59]:
phrases_lp_stop.to_csv(data_path + 'phrases_lp_stop.csv', sep='\t')
print("File was created")

File was created


In [60]:
phrases_lp = pd.read_csv(data_path + 'phrases_lp.csv', sep='\t', converters={"t_answer": literal_eval}) 
phrases_lp.head()

Unnamed: 0.1,Unnamed: 0,personId,answer,t_answer
0,0,302.0,"['i', 'am', 'fine', 'i', 'am', 'los', 'angeles...","[4, 6, 198, 4, 6, 234, 284, 152, 199, 67]"
1,1,302.0,"['am', 'fine', 'i', 'am', 'los', 'angeles', 'c...","[6, 198, 4, 6, 234, 284, 152, 199, 67, 2]"
2,2,302.0,"['fine', 'i', 'am', 'los', 'angeles', 'califor...","[198, 4, 6, 234, 284, 152, 199, 67, 2, 58]"
3,3,302.0,"['i', 'am', 'los', 'angeles', 'california', 'p...","[4, 6, 234, 284, 152, 199, 67, 2, 58, 33]"
4,4,302.0,"['am', 'los', 'angeles', 'california', 'part',...","[6, 234, 284, 152, 199, 67, 2, 58, 33, 828]"


In [61]:

def load_avec_dataset_file(path, score_column):
    ds = pd.read_csv(path, sep=',')
    ds['level'] = pd.cut(ds[score_column], bins=[-1,0,5,10,15,25], labels=[0,1,2,3,4])
    ds['PHQ8_Score'] = ds[score_column]
    ds['cat_level'] = keras.utils.to_categorical(ds['level'], num_classes).tolist()
    ds = ds[['Participant_ID', 'level', 'cat_level', 'PHQ8_Score']]
    ds = ds.astype({"Participant_ID": float, "level": int, 'PHQ8_Score': int})
    return ds

In [70]:
train = load_avec_dataset_file('E:/DIAC-WOZ/train_split_Depression_AVEC2017.csv', 'PHQ8_Score')
dev = load_avec_dataset_file('E:/DIAC-WOZ/dev_split_Depression_AVEC2017.csv', 'PHQ8_Score')
test = load_avec_dataset_file('E:/DIAC-WOZ/full_test_split.csv', 'PHQ_Score')
print("Size: train= {}, dev= {}, test= {}".format(len(train), len(dev), len(test)))
train.head()

Size: train= 107, dev= 35, test= 47


Unnamed: 0,Participant_ID,level,cat_level,PHQ8_Score
0,303.0,0,"[1.0, 0.0, 0.0, 0.0, 0.0]",0
1,304.0,2,"[0.0, 0.0, 1.0, 0.0, 0.0]",6
2,305.0,2,"[0.0, 0.0, 1.0, 0.0, 0.0]",7
3,310.0,1,"[0.0, 1.0, 0.0, 0.0, 0.0]",4
4,312.0,1,"[0.0, 1.0, 0.0, 0.0, 0.0]",2


In [71]:
ds_total = pd.concat([train,dev,test])
total_phq8 = len(ds_total)
print("Total size = {}".format(total_phq8))

Total size = 189
