In [1]:
! python3 -m pip install --upgrade pip
! python3 -m pip install --upgrade SpeechRecognition
! python3 -m pip install --upgrade pydub
! python3 -m pip install --upgrade sklearn
! python3 -m pip install --upgrade pylangacq

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 5.8MB/s 
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.3.1
    Uninstalling pip-19.3.1:
      Successfully uninstalled pip-19.3.1
Successfully installed pip-21.0.1
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 66 kB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting pylangacq
  Downloading pylangacq-0.13.1-py3-none-any.whl (66 kB)
[K     |████████████████████████████████| 66 kB 2.6 MB/s 
Installing collected packages: pyl

In [2]:
import IPython.display as ipd
import librosa
import librosa.display
import kapre
import matplotlib.pyplot as plt
import numpy as np
import pathlib
import io
import os
import pandas as pd
import random
import re
import shutil
import speech_recognition as sr
import tensorflow as tf
from os import listdir
from os.path import isfile, join
from os import path
from plotnine import *
from pydub import AudioSegment
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from pydub.silence import split_on_silence 

# Cloud storage
from google.cloud import storage

# .cha file reader
import pylangacq

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Transcribe audio data to txt data

In [None]:
# !ls '/content/drive/My Drive/Berkeley/W210/Capstone/ADReSS-IS2020-data/train/Full_wave_enhanced_audio/cd/'

In [4]:
# initialize the speech recognizer
r = sr.Recognizer()

# a function that splits the audio file into chunks 
# and applies speech recognition 
def silence_based_conversion(path, wav_file): 
    text_df = pd.DataFrame()
  
    # open the audio file stored in 
    # the local system as a wav file. 
    song = AudioSegment.from_wav(path + wav_file) 
    print(song)
  
    # open a file where we will concatenate   
    # and store the recognized text 
    text_file = wav_file.partition('.')[0] + ".txt"
    text_file_dir = path + "output_text/" 
    if not os.path.exists(text_file_dir):
        os.makedirs(text_file_dir)    
    text_file_path = text_file_dir + text_file
    fh = open(text_file_path, "w+") 
          

    dBFS = song.dBFS
    print('dBFS: ' + str(dBFS))
    # chunks = split_on_silence(song, 
    #     min_silence_len = 500,
    #     silence_thresh = dBFS-16,
    #     keep_silence = 250 
    # )

    # split track where silence is 0.5 seconds  
    # or more and get chunks 
    chunks = split_on_silence(song, 
        # must be silent for at least 0.5 seconds 
        # or 500 ms. adjust this value based on user 
        # requirement. if the speaker stays silent for  
        # longer, increase this value. else, decrease it. 
        min_silence_len = 250, 
  
        # consider it silent if quieter than -16 dBFS 
        # adjust this per requirement 
        silence_thresh = dBFS - 16
        # keep_silence = 250
    ) 
    # setting minimum length of each chunk to 25 seconds
    target_length = 20 * 1000 
    output_chunks = [chunks[0]]
    for chunk in chunks[1:]:
      print('Length of chunk: ' + str(len(output_chunks[-1])) )
      if len(output_chunks[-1]) < target_length:
        output_chunks[-1] += chunk
      else:
        # if the last output chunk is longer than the target length,
        # we can start a new one
        output_chunks.append(chunk)    
    # print(chunks)
  
    # create a directory to store the audio chunks. 
    try: 
        os.mkdir(path + 'audio_chunks') 
    except(FileExistsError): 
        pass
  
    # move into the directory to 
    # store the audio files. 
    os.chdir(path +'audio_chunks') 
  
    i = 0
    # process each chunk 
    for chunk in output_chunks: 
              
        # Create 0.5 seconds silence chunk 
        chunk_silent = AudioSegment.silent(duration = 10) 
  
        # add 0.5 sec silence to beginning and  
        # end of audio chunk. This is done so that 
        # it doesn't seem abruptly sliced. 
        audio_chunk = chunk_silent + chunk + chunk_silent 
  
        # export audio chunk and save it in  
        # the current directory. 
        text_file_id = text_file.partition('.')[0]
        chunk_file_name = text_file_id + "_" + "chunk" + str(i) + ".wav"
        print("saving " + chunk_file_name) 
        # specify the bitrate to be 192 k 
        audio_chunk.export("./" + chunk_file_name, bitrate ='192k', format ="wav") 
  
        # the name of the newly created chunk 
        filename = chunk_file_name
  
        print("Processing chunk file: " + filename) 
  
        # get the name of the newly created chunk 
        # in the AUDIO_FILE variable for later use. 
        file = filename 
  
        # create a speech recognition object 
        r = sr.Recognizer() 
  
        # recognize the chunk 
        with sr.AudioFile(file) as source: 
            # remove this if it is not working 
            # correctly. 
            #r.adjust_for_ambient_noise(source) 
            audio_listened = r.record(source) 
  
        try: 
            # try converting it to text 
            rec = r.recognize_google(audio_listened) 
            # write the output to the file. 
            fh.write(rec+". ") 
            text_df = text_df.append({'ID': text_file_id, 'Text': rec}, ignore_index = True)
  
        # catch any errors. 
        except sr.UnknownValueError: 
            print("Could not understand audio") 
  
        except sr.RequestError as e: 
            print("Could not request results. check your internet connection") 
  
        i += 1
    return text_df

# silence_based_conversion(audio_path_cc, 'S001.wav')        
# text_df = silence_based_conversion(audio_path_cd, 'S079.wav')        
# print(text_df)
# silence_based_conversion(audio_path_cc + 'spkr0.wav')        

## Read csv data file with ID, TEXT and MMSE scores to be used for training and validation

# BERT Transfer Learning

In [5]:
!pip install transformers==3.0.2

Collecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[K     |████████████████████████████████| 769 kB 4.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 12.5 MB/s 
Collecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 18.0 MB/s 
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 37.2 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-py3-none-any.whl size=893258 sha256=19247f52ac3e3880421df9df3eab561381aef116b92c71985dad59e43d1ca505
  Stored in directory: /root/.cache/pip/wheels/69/09/d1/bf058f7d6fa0ecba2ce7c66be3b8d012beb4bf

In [6]:
from transformers import BertTokenizer
bert_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_name,
                                          add_special_tokens=True,
                                          do_lower_case=True,
                                          max_length=256,
                                          pad_to_max_length=True)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
def bert_encoder(input_text):
    # txt = input_text.numpy().decode('utf-8')
    txt = input_text
    encoded = tokenizer.encode_plus(txt, add_special_tokens=True, 
                                    max_length=256, 
                                    pad_to_max_length=True, 
                                    return_attention_mask=True, 
                                    return_token_type_ids=True,
                                    truncation=True)
    return encoded['input_ids'], encoded['token_type_ids'], \
           encoded['attention_mask']

In [8]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
  return {"input_ids": input_ids,
          "attention_mask": attention_masks,
          "token_type_ids": token_type_ids},y


In [11]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

def build_model(transformer, max_len=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='relu')(cls_token)
#     run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)
    model = Model(inputs=input_word_ids, outputs=out)
    for layer in model.layers[:-1]:
      layer.trainable = False
    # model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy']) #, options = run_opts)
    # optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=2e-5)
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    model.compile(optimizer = optimizer, 
                  loss=tf.keras.metrics.mean_squared_error,
                  metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')])
    return model

In [12]:
import transformers
MAX_LEN = 256
# with strategy.scope():
def create_model():
  transformer_layer = (
      transformers.TFAutoModelWithLMHead.from_pretrained(bert_name)
  )
  model = build_model(transformer_layer, max_len=MAX_LEN)
  return model


## Reloaded model for predictions

In [13]:
checkpoint_path = "/content/drive/My Drive/Berkeley/W210/Capstone/BERT_Model_Transcription/training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

os.listdir(checkpoint_dir)

['cp.ckpt.index', 'checkpoint', 'cp.ckpt.data-00000-of-00001']

In [14]:
# Loads the weights
reloaded_model = create_model()
reloaded_model.load_weights(checkpoint_path)
reloaded_model.summary()
# result = reloaded_model.predict(train_ds)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertForMaskedLM: ['nsp___cls']
- This IS expected if you are initializing TFBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 256)]             0         
_________________________________________________________________
tf_bert_for_masked_lm (TFBer ((None, 256, 30522),)     110104890 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 30522)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 30523     
Total params: 110,135,413
Trainable params: 30,523
Non-trainable params: 110,104,890
_________________________________________________________________


In [15]:
data_path = '/content/drive/My Drive/Berkeley/W210/Capstone/ADReSS-IS2020-data/train'

transcription_combination_txt_file_path_orig = data_path + "/transcription_original_text/" + "transcription_combination_id_txt_mmse.csv"

In [16]:
filtered_merged_data_orig = pd.read_csv(transcription_combination_txt_file_path_orig)
filtered_merged_data_orig.tail()

Unnamed: 0.1,Unnamed: 0,ID,Age,Gender,TEXT,MMSE,Group
103,103,S150,58,male,"well the boy on the stool is falling , reachi...",20,cd
104,104,S151,72,male,the boy and the girl are playing and he's gon...,24,cd
105,105,S153,68,female,oh you want me to on that ? oh okay looks lik...,12,cd
106,106,S154,65,female,you want me to tell you ? okay the boy's gett...,20,cd
107,107,S156,71,female,mhm . well this one is in the cookie jar . an...,13,cd


In [17]:
bert_predict_data_orig = [bert_encoder(r) for r in filtered_merged_data_orig["TEXT"]]
bert_predict_lbl_orig = filtered_merged_data_orig["MMSE"]
bert_predict_data_orig = np.array(bert_predict_data_orig)

In [18]:
print(bert_predict_data_orig.shape)
print(bert_predict_lbl_orig.shape)

(108, 3, 256)
(108,)


In [19]:
predict_reviews, predict_segments, predict_masks = np.split(bert_predict_data_orig, 3, axis=1)
predict_reviews = predict_reviews.squeeze()
predict_segments = predict_segments.squeeze()
predict_masks = predict_masks.squeeze()

predict_ds = tf.data.Dataset.from_tensor_slices((predict_reviews, 
predict_masks, predict_segments, bert_predict_lbl_orig)).\
            map(example_to_features).shuffle(100).batch(16)

In [20]:
predict_result = reloaded_model.predict(predict_ds)
predict_result

  [n for n in tensors.keys() if n not in ref_input_names])


array([[17.417488],
       [21.89738 ],
       [26.807043],
       [28.43382 ],
       [32.202515],
       [ 9.362244],
       [26.134857],
       [23.153185],
       [28.120903],
       [27.75021 ],
       [31.104382],
       [25.091267],
       [26.818472],
       [21.490232],
       [30.082336],
       [26.92623 ],
       [19.931509],
       [29.068512],
       [28.223349],
       [27.588337],
       [26.826227],
       [28.70575 ],
       [25.580347],
       [30.214148],
       [32.741512],
       [23.949755],
       [23.338007],
       [14.010484],
       [25.996477],
       [27.045008],
       [27.748589],
       [24.551346],
       [20.31096 ],
       [27.97299 ],
       [25.529749],
       [28.09269 ],
       [29.775274],
       [26.120173],
       [29.612587],
       [26.048782],
       [16.683575],
       [26.564142],
       [29.688723],
       [27.377878],
       [23.187849],
       [11.208713],
       [21.238136],
       [29.878986],
       [27.78851 ],
       [27.068914],


In [None]:
print(type(predict_result))

<class 'numpy.ndarray'>


In [21]:
filtered_merged_data_orig['predict'] = predict_result

In [22]:
filtered_merged_data_orig.head()

Unnamed: 0.1,Unnamed: 0,ID,Age,Gender,TEXT,MMSE,Group,predict
0,0,S001,74,male,well there's a mother standing there washing ...,30,cc,17.417488
1,1,S002,62,female,somebody's getting cookies out_of the cookie ...,30,cc,21.897381
2,2,S003,69,female,okay . there's a little boy and he's standing...,29,cc,26.807043
3,3,S004,71,female,are you ready ? well the sink is overflowing ...,30,cc,28.433821
4,4,S005,74,female,okay . the mother's washing the dishes and th...,30,cc,32.202515


In [24]:
filtered_merged_data_orig.loc[filtered_merged_data_orig['predict'] > 30, 'predict'] = 30
filtered_merged_data_orig.head()

Unnamed: 0.1,Unnamed: 0,ID,Age,Gender,TEXT,MMSE,Group,predict
0,0,S001,74,male,well there's a mother standing there washing ...,30,cc,17.417488
1,1,S002,62,female,somebody's getting cookies out_of the cookie ...,30,cc,21.897381
2,2,S003,69,female,okay . there's a little boy and he's standing...,29,cc,26.807043
3,3,S004,71,female,are you ready ? well the sink is overflowing ...,30,cc,28.433821
4,4,S005,74,female,okay . the mother's washing the dishes and th...,30,cc,30.0


In [25]:
filtered_merged_data_orig.loc[filtered_merged_data_orig['predict'] >= 24, 'predict_group'] = 'cc'
filtered_merged_data_orig.loc[filtered_merged_data_orig['predict'] < 24, 'predict_group'] = 'cd'
filtered_merged_data_orig.head()

Unnamed: 0.1,Unnamed: 0,ID,Age,Gender,TEXT,MMSE,Group,predict,predict_group
0,0,S001,74,male,well there's a mother standing there washing ...,30,cc,17.417488,cd
1,1,S002,62,female,somebody's getting cookies out_of the cookie ...,30,cc,21.897381,cd
2,2,S003,69,female,okay . there's a little boy and he's standing...,29,cc,26.807043,cc
3,3,S004,71,female,are you ready ? well the sink is overflowing ...,30,cc,28.433821,cc
4,4,S005,74,female,okay . the mother's washing the dishes and th...,30,cc,30.0,cc


In [29]:
predict_result_df = filtered_merged_data_orig[['ID', 'Age', 'Gender', 'MMSE', 'Group', 'predict', 'predict_group']]
print(predict_result_df.head())
predict_result_df.to_csv(data_path + "/prediction_results.csv")

     ID  Age    Gender  MMSE Group    predict predict_group
0  S001   74     male     30    cc  17.417488            cd
1  S002   62   female     30    cc  21.897381            cd
2  S003   69   female     29    cc  26.807043            cc
3  S004   71   female     30    cc  28.433821            cc
4  S005   74   female     30    cc  30.000000            cc
