In [54]:
! python3 -m pip install --upgrade pip
! python3 -m pip install --upgrade pylangacq



In [55]:

# Python packages
import io
import os
import pandas as pd
import random
from os import listdir
from os.path import isfile, join
from os import path

# Cloud storage
from google.cloud import storage

# .cha file reader
import pylangacq

In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
data_path = '/content/drive/My Drive/Berkeley/W210/Capstone/ADReSS-IS2020-data/train'
text_path_cc = data_path + '/transcription/cc/'

## Define all the file path variables

In [58]:
# TODO make an input from user
# data_path = '/tf/dementia/0extra/ADReSS-IS2020-train/ADReSS-IS2020-data/train'
data_path = '/content/drive/My Drive/Berkeley/W210/Capstone/ADReSS-IS2020-data/train'

audio_path_cc = data_path + '/Full_wave_enhanced_audio/cc/'
audio_path_cd = data_path + '/Full_wave_enhanced_audio/cd/'


text_path_cc = data_path + '/transcription/cc/'
text_path_cd = data_path + '/transcription/cd/'

lang_ = 'en-US'



## Convert .cha transcriptions to text file by selecting only the participants part of the text

In [59]:
def convertChaToTextFormat(input_file_dir, output_file_dir, combination_count = 1):
  df_id_txt = pd.DataFrame(columns=['ID', 'TEXT'])

  # If the output directory doesn't exists, create one
  if not os.path.exists(output_file_dir):
      os.makedirs(output_file_dir)    

  onlyfiles = [f for f in sorted(listdir(input_file_dir)) if isfile(join(input_file_dir, f))]
  # print(len(onlyfiles))

  for filename in onlyfiles:
    file_path = input_file_dir + "/" + filename
    # print(file_path)
    reader = pylangacq.read_chat(file_path)
    reader.participants()
    sentence_words = reader.words(participants="PAR", by_utterances=True)
    file_name_prefix = filename.partition('.')[0]

    max_combination_count = combination_count
    if len(sentence_words) < max_combination_count:
      max_combination_count = len(sentence_words)

    # create combinations of sentences using random shuffling of the list
    for count in range(0, max_combination_count):
      random.shuffle(sentence_words)
      final_text = ''
      for sentence in sentence_words:
        for word in sentence:
          final_text = final_text + ' ' + word

      final_text_modified = final_text.replace(". and", "and")
      final_text_modified = final_text_modified.replace(" . ", ".")

      output_text_file = file_name_prefix + "_" + str(count) + ".txt"
      output_text_file_path = output_file_dir + output_text_file
      fh = open(output_text_file_path, "w+") 
      fh.write(final_text)
      df_id_txt.loc[len(df_id_txt.index)] = [file_name_prefix, final_text]

  return df_id_txt

df_id_txt_cc = convertChaToTextFormat(text_path_cc, data_path + "/transcription_shuffled_text/", combination_count=20)
df_id_txt_cd = convertChaToTextFormat(text_path_cd, data_path + "/transcription_shuffled_text/", combination_count=20)
df_id_txt = pd.DataFrame()
df_id_txt = df_id_txt.append(df_id_txt_cc)
df_id_txt = df_id_txt.append(df_id_txt_cd)
df_id_txt = df_id_txt[["ID", "TEXT"]]
df_id_txt = df_id_txt.reset_index()

df_id_txt.tail()


Unnamed: 0,index,ID,TEXT
1395,677,S156,and over here must be the mother . I don't kn...
1396,678,S156,this boy tried to get in the cookie jar . wha...
1397,679,S156,and she tried to climb the ... she was doing ...
1398,680,S156,well this one is in the cookie jar . what's g...
1399,681,S156,and over here must be the mother . what's goi...


# Merge meta data to txt data by ID and update the MMSE score

In [60]:
meta_cc = pd.read_csv(data_path + '/cc_meta_data.txt', sep=";", header=0, 
                      names = ['ID', 'Age', 'Gender', 'MMSE'])
meta_cd = pd.read_csv(data_path + '/cd_meta_data.txt', sep=";", header=0, 
                      names = ['ID', 'Age', 'Gender', 'MMSE'])

meta = meta_cc.assign(Group = 'cc').append(meta_cd.assign(Group = 'cd')).reset_index()

meta['ID'] = meta['ID'].str.strip()

meta

Unnamed: 0,index,ID,Age,Gender,MMSE,Group
0,0,S001,74,male,,cc
1,1,S002,62,female,30,cc
2,2,S003,69,female,29,cc
3,3,S004,71,female,30,cc
4,4,S005,74,female,30,cc
...,...,...,...,...,...,...
103,49,S150,58,male,20,cd
104,50,S151,72,male,24,cd
105,51,S153,68,female,12,cd
106,52,S154,65,female,20,cd


In [61]:
sum(meta.MMSE == ' NA')
meta.MMSE = pd.to_numeric(meta.MMSE.replace(' NA', 30))

In [62]:
merged_data = pd.merge(meta, df_id_txt, on = "ID", how = "inner")
merged_data

Unnamed: 0,index_x,ID,Age,Gender,MMSE,Group,index_y,TEXT
0,0,S001,74,male,30,cc,0,she seems to be oblivious to the fact that th...
1,0,S001,74,male,30,cc,1,he's standing up there in the cupboard taking...
2,0,S001,74,male,30,cc,2,and the kid on the stool is gonna fall off th...
3,0,S001,74,male,30,cc,3,the kids are somewhere around seven or eight ...
4,0,S001,74,male,30,cc,4,looks like a garage or something with curtain...
...,...,...,...,...,...,...,...,...
1395,53,S156,71,female,13,cd,677,and over here must be the mother . I don't kn...
1396,53,S156,71,female,13,cd,678,this boy tried to get in the cookie jar . wha...
1397,53,S156,71,female,13,cd,679,and she tried to climb the ... she was doing ...
1398,53,S156,71,female,13,cd,680,well this one is in the cookie jar . what's g...


In [63]:

filtered_merged_data = merged_data[merged_data['TEXT'].apply(lambda x: len(x) > 0)]
filtered_merged_data = filtered_merged_data[["ID", "TEXT", "MMSE"]]
filtered_merged_data


Unnamed: 0,ID,TEXT,MMSE
0,S001,she seems to be oblivious to the fact that th...,30
1,S001,he's standing up there in the cupboard taking...,30
2,S001,and the kid on the stool is gonna fall off th...,30
3,S001,the kids are somewhere around seven or eight ...,30
4,S001,looks like a garage or something with curtain...,30
...,...,...,...
1395,S156,and over here must be the mother . I don't kn...,13
1396,S156,this boy tried to get in the cookie jar . wha...,13
1397,S156,and she tried to climb the ... she was doing ...,13
1398,S156,well this one is in the cookie jar . what's g...,13


In [64]:
transcription_combination_txt_file_path = data_path + "/transcription_shuffled_text/" + "transcription_combination_id_txt_mmse.csv"

In [65]:
filtered_merged_data.to_csv(transcription_combination_txt_file_path)

In [66]:
filtered_merged_data = pd.read_csv(transcription_combination_txt_file_path)
filtered_merged_data.tail()

Unnamed: 0.1,Unnamed: 0,ID,TEXT,MMSE
1395,1395,S156,and over here must be the mother . I don't kn...,13
1396,1396,S156,this boy tried to get in the cookie jar . wha...,13
1397,1397,S156,and she tried to climb the ... she was doing ...,13
1398,1398,S156,well this one is in the cookie jar . what's g...,13
1399,1399,S156,and over here must be the mother . what's goi...,13
