In [0]:
# CONNECT TO GOOGLE DRIVE

# the colab helper that attaches a Google Drive account to this runtime
from google.colab import drive

# folder inside the runtime where the drive contents will show up;
# the file paths used in the later cells all start with this prefix
driveMountPoint = '/content/gdrive'

# mounting prompts for an authorization code the first time it runs
drive.mount(driveMountPoint)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# TRIM THE DATA TO THE RELEVANT COLUMNS

# libraries used by this cell and the cells below
import pandas as pd
import numpy as np
import os

# main output folder; every file produced by this notebook is written under it
annPath = '/content/gdrive/My Drive/Conversation Data Vis/Brown Annotated Convo Data/OLIVIA/formattingdata/ann_convert/'

# load the conversation transcription file
# (this file should already have simplified roles and cleaned text)
df = pd.read_csv('/content/gdrive/My Drive/Conversation Data Vis/Brown Annotated Convo Data/For LightSide/Cardiology1Clean_v2.csv')

# speech act codes on their own
sa = df['Speech act']

# conversation ID + cleaned text only (split into one file per patient later on)
text = df[['ID', 'CleanText']]

# keep only the relevant columns and add an empty 'indexx' column,
# a placeholder for the index that is saved later
df = df[['ID','CleanText','Speech act','Induction','Topic code']].assign(indexx = '')

# save both trimmed tables to the drive for future reference
df.to_csv(annPath + 'trimmedDF.csv', index = False)
text.to_csv(annPath + 'justText.csv', index = False)

In [0]:
# EXTRACT QUESTION SPEECH ACT INFORMATION
#
# walks through every row of df (the trimmed transcript), keeps the rows whose
# speech act code marks a question, labels each one as open or closed, and
# saves the result as qBasic / allQ.csv.
# needs df and annPath from the trimming cell above.

# column layout of the question table
qColumns = ['Index-saved','Text','Q_Code','Q_Type','Patient_ID','indexx']

# speech act codes (after trailing zeros are removed) for each question type
# note: '1.22' was previously written as '1,22', which could never match a
# formatted number, so those questions were silently left out
openCodes = {'1.11', '1.21'}
closedCodes = {'1.12','1.121','1.122','1.22','1.221','1.222','1.3','1.31','1.32'}

# collect the question rows in a plain list and build the dataframe once at the end;
# growing a dataframe row by row with DataFrame.append is quadratic and the method
# was removed in pandas 2.0
qRows = []

# indexx = position of the row inside its own conversation (starts at 0)
indexx = 0

# ID of the previous row, used to notice when a new conversation begins
prevID = None

# iterate through each row in df
for index, row in df.iterrows():

  convoID = row['ID']

  # restart the index every time a new patient convo begins
  if convoID != prevID:
    indexx = 0
  prevID = convoID

  # remove trailing zeros from speech act code
  form = '{0:g}'.format(row['Speech act'])

  # designate labels for open vs. closed questions based on speech act code
  if form in openCodes:
    qType = 'open'
  elif form in closedCodes:
    qType = 'closed'
  else:
    qType = None

  # only question rows are kept
  if qType is not None:
    qRows.append({
      'Index-saved': index,
      'Text': row['CleanText'],
      'Q_Code': form,
      'Q_Type': qType,
      # short patient ID: characters 0-8, 10 and 12 of the conversation ID
      # (presumably the ID has a fixed-width layout -- TODO confirm against the data)
      'Patient_ID': convoID[0:9] + convoID[10] + convoID[12],
      'indexx': indexx,
    })

  # update index
  indexx += 1

# build the question dataframe in one go (columns are kept even when there are no questions)
qBasic = pd.DataFrame(qRows, columns = qColumns)

# save whole dataframe as new file
qBasic.to_csv(annPath + 'allQ.csv', index = False)

In [0]:
# SPLIT THE QUESTION DATA BY PATIENT
#
# writes one csv per patient (splitQ/Q_<Patient_ID>.csv) holding that patient's
# rows of qBasic. needs qBasic and annPath from the cells above.

# make sure the output folder exists before writing into it
splitQDir = annPath + 'splitQ/'
os.makedirs(splitQDir, exist_ok = True)

# give every run of consecutive rows with the same Patient_ID its own label:
# the label goes up by one each time the ID differs from the row before it
patientIDs = qBasic['Patient_ID']
runLabel = (patientIDs != patientIDs.shift()).cumsum()

# slice qBasic once per run instead of rebuilding a buffer dataframe row by row
# (DataFrame.append in a loop is quadratic and was removed in pandas 2.0)
for _, splitQ in qBasic.groupby(runLabel, sort = False):

  # assign filename (every row in the run has the same Patient_ID)
  qfilename = 'Q_' + splitQ['Patient_ID'].iloc[0]

  # export this patient's questions
  # if a Patient_ID shows up in more than one run, the later run overwrites the earlier file
  splitQ.to_csv(splitQDir + qfilename + '.csv', index = False)

In [0]:
# SPLIT THE TEXT BY PATIENT
#
# writes one .txt file per conversation (justText/<short ID>.txt) with one line of
# cleaned text per row, and collects the file names in fileNames for the next cell.
# remember that the df called "text" is the df with just the patient ID and cleaned txt.

# make sure the output folder exists before writing into it
textDir = annPath + 'justText/'
os.makedirs(textDir, exist_ok = True)

# initialize list for storing file names
fileNames = []

# give every run of consecutive rows with the same ID its own label:
# the label goes up by one each time the ID differs from the row before it
convoIDs = text['ID']
runLabel = (convoIDs != convoIDs.shift()).cumsum()

# slice the text once per run instead of rebuilding a buffer dataframe row by row
# (DataFrame.append in a loop is quadratic and was removed in pandas 2.0)
for _, convo in text.groupby(runLabel, sort = False):

  # assign filename: characters 0-8, 10 and 12 of the conversation ID
  # (presumably the ID has a fixed-width layout -- TODO confirm against the data)
  convoID = convo['ID'].iloc[0]
  filename = convoID[0:9] + convoID[10] + convoID[12]

  # export only the text column, without header or index
  # to_csv applies csv quoting here; the .ann cell strips the quotation marks again
  newText = convo[['CleanText']]
  newText.to_csv(textDir + filename + '.txt', index = False, header = False)

  # store filenames
  fileNames.append(filename)

In [0]:
# CREATE .ANN FILES
#
# for every conversation in fileNames this cell
#   - copies justText/<file>.txt with all quotation marks removed to
#     noQuotes/nq<file>.txt and brat_upload/brat<file>.txt
#   - writes one annotation line per question to ann_form/ann<file>.ann and
#     brat_upload/brat<file>.ann in the form  T<n> <TAB> <type> <beg> <end> <TAB> <text>
# needs fileNames, annPath and the splitQ/ files from the cells above.

# make sure every output folder exists before writing into it
for subDir in ['noQuotes/', 'ann_form/', 'brat_upload/']:
  os.makedirs(annPath + subDir, exist_ok = True)

# iterate through each file
for file in fileNames:

  # build a {line index: question type} lookup once per file instead of
  # scanning the dataframe for every line
  qPath = annPath + 'splitQ/Q_' + file + '.csv'
  if os.path.exists(qPath):
    questions = pd.read_csv(qPath)
    # drop_duplicates keeps the first row per indexx, like the former .values[0] lookup
    qTypeByLine = questions.drop_duplicates('indexx').set_index('indexx')['Q_Type'].to_dict()
  else:
    # no question file is written for a conversation without questions;
    # it still gets its text files and an empty .ann file
    qTypeByLine = {}

  # the with-block closes all five files even if something fails halfway through
  with open(annPath + 'justText/' + file + '.txt', 'r') as f, \
       open(annPath + 'noQuotes/nq' + file + '.txt', 'w') as noQuotes, \
       open(annPath + 'ann_form/ann' + file + '.ann', 'w') as annForm, \
       open(annPath + 'brat_upload/brat' + file + '.txt', 'w') as bratTxt, \
       open(annPath + 'brat_upload/brat' + file + '.ann', 'w') as bratForm:

    # beg = character offset where the current line starts in the no-quotes text
    # T_num = running annotation number (T# in .ann file)
    beg = 0
    T_num = 1

    # iterate through each line in the file; lineIndex restarts at 0 for every file
    for lineIndex, line in enumerate(f):

      # get rid of quotation marks (which were automatically inputted due to .csv format) and save new files in the drive
      nqline = line.replace('"','')
      noQuotes.write(nqline)
      bratTxt.write(nqline)

      # the annotated span is the line without its line break
      # (writing the line break into the .ann file used to leave a blank line after every annotation)
      spanText = nqline.rstrip('\n')

      # end offset of the span (exclusive)
      end = beg + len(spanText)

      # write in .ann format
      if lineIndex in qTypeByLine:
        annLine = 'T{}\t{} {} {}\t{}\n'.format(T_num, qTypeByLine[lineIndex], beg, end, spanText)
        annForm.write(annLine)
        bratForm.write(annLine)
        T_num += 1

      # the next line starts after everything written for this one, line break included
      beg += len(nqline)