### Import Necessary Libraries

In [22]:
# data handling
import pandas as pd

# pre-processing pipeline
import nltk

# Chunker
from flair.data import Sentence
from flair.models import SequenceTagger

# topic modelling
from bertopic import BERTopic

### Load Data

In [57]:
data = pd.read_csv('../data/Defined_AI/DAI-healthcare-retail.tsv', sep='\t')

In [58]:
data

Unnamed: 0,TranscriptionId,Channel,StartTime,EndTime,Transcription,SegmentDuration,RecordingId,Domain,SampleRate,BitDepth,...,LeftChannelSpeakerAge,LeftChannelSpeakerLivingCountry,LeftChannelSpeakerAccent,LeftChannelNative,RightChannelSpeakerId,RightChannelSpeakerGender,RightChannelSpeakerAge,RightChannelSpeakerLivingCountry,RightChannelSpeakerAccent,RightChannelNative
0,596305,1,0:00:00.000,0:00:08.739,"[n_s/] Hello there My name is ""Anne"" and thank...",0:00:08.739,1ff026c2-9fd0-4a7e-a4cd-8ea4268f3d23,healthcare-retail,8000,16,...,45,United States,California,True,35170AE5-660E-43B2-84C9-046EA9E88E74,Female,39,United States,Arizona,True
1,596306,2,0:00:09.600,0:00:15.823,Hi I was wondering to see if you could give me...,0:00:06.223,1ff026c2-9fd0-4a7e-a4cd-8ea4268f3d23,healthcare-retail,8000,16,...,45,United States,California,True,35170AE5-660E-43B2-84C9-046EA9E88E74,Female,39,United States,Arizona,True
2,596307,1,0:00:16.890,0:00:23.146,Oh absolutely I'd be happy to help [filler/] W...,0:00:06.256,1ff026c2-9fd0-4a7e-a4cd-8ea4268f3d23,healthcare-retail,8000,16,...,45,United States,California,True,35170AE5-660E-43B2-84C9-046EA9E88E74,Female,39,United States,Arizona,True
3,596308,2,0:00:24.176,0:00:29.207,I'm having cold and tingling in my [filler/] u...,0:00:05.031,1ff026c2-9fd0-4a7e-a4cd-8ea4268f3d23,healthcare-retail,8000,16,...,45,United States,California,True,35170AE5-660E-43B2-84C9-046EA9E88E74,Female,39,United States,Arizona,True
4,596309,1,0:00:30.734,0:00:45.385,oh dear I'm so sorry to hear that so you're sa...,0:00:14.651,1ff026c2-9fd0-4a7e-a4cd-8ea4268f3d23,healthcare-retail,8000,16,...,45,United States,California,True,35170AE5-660E-43B2-84C9-046EA9E88E74,Female,39,United States,Arizona,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6404,596126,2,0:04:02.056,0:04:06.663,[filler/] that's [n_s/] that's great so [n_s/]...,0:00:04.607,feddaf24-2b8d-4474-a089-5ce192e74f18,healthcare-retail,8000,16,...,38,United States,Texas,True,4FD155E2-679D-45D2-98C5-1A713AD828F4,Female,68,United States,Pennsylvania,True
6405,596127,1,0:04:07.314,0:04:14.296,[filler/] once a verified [filler/] you will g...,0:00:06.982,feddaf24-2b8d-4474-a089-5ce192e74f18,healthcare-retail,8000,16,...,38,United States,Texas,True,4FD155E2-679D-45D2-98C5-1A713AD828F4,Female,68,United States,Pennsylvania,True
6406,596128,2,0:04:15.692,0:04:21.184,okay well thank you so much [filler/] this pha...,0:00:05.492,feddaf24-2b8d-4474-a089-5ce192e74f18,healthcare-retail,8000,16,...,38,United States,Texas,True,4FD155E2-679D-45D2-98C5-1A713AD828F4,Female,68,United States,Pennsylvania,True
6407,596129,1,0:04:21.901,0:04:25.484,You're very welcome and thank you for calling ...,0:00:03.583,feddaf24-2b8d-4474-a089-5ce192e74f18,healthcare-retail,8000,16,...,38,United States,Texas,True,4FD155E2-679D-45D2-98C5-1A713AD828F4,Female,68,United States,Pennsylvania,True


### Data Cleaning

Before, attempting to clean the data we must take a look at the data.

In [59]:
data.Transcription.to_list()[:100]

['[n_s/] Hello there My name is "Anne" and thank you so much for calling us here at "Bartell Drugs" How may I help you today',
 "Hi I was wondering to see if you could give me help finding an over the counter medication for some symptoms that I'm having",
 "Oh absolutely I'd be happy to help [filler/] What particular symptoms are you experiencing",
 "I'm having cold and tingling in my [filler/] upper leg area",
 "oh dear I'm so sorry to hear that so you're saying you're having cold and tingling sensation and your upper leg area [n_s/] okay so [filler/] have you discussed this issue yet with your doctor",
 "No I haven't yet kind of hard to get a hold of them",
 "Oh yeah I understand Yes assume and how often you're having the symptoms in your leg",
 "[filler/] I'd say like every other day usually in the morning [n_s/]",
 '[n_s/] oh Okay I understand [filler/] may I also ask do you have a history of common diseases such as a heart disease diabetes or cancer',
 '[filler/] Yeah I actually D

The data is already in lower case. We will proceed to some preprocessing steps to help represent the text appropriately.

### Data Pre-Processing

#### Chunking

We will use a chunker to chunk up the texts into smaller sections.

In [6]:
# load tagger
tagger = SequenceTagger.load("flair/chunk-english")

2023-06-21 18:42:41,919 SequenceTagger predicts: Dictionary with 47 tags: O, S-NP, B-NP, E-NP, I-NP, S-VP, B-VP, E-VP, I-VP, S-PP, B-PP, E-PP, I-PP, S-ADVP, B-ADVP, E-ADVP, I-ADVP, S-SBAR, B-SBAR, E-SBAR, I-SBAR, S-ADJP, B-ADJP, E-ADJP, I-ADJP, S-PRT, B-PRT, E-PRT, I-PRT, S-CONJP, B-CONJP, E-CONJP, I-CONJP, S-INTJ, B-INTJ, E-INTJ, I-INTJ, S-LST, B-LST, E-LST, I-LST, S-UCP, B-UCP, E-UCP, I-UCP, <START>, <STOP>


In [7]:
tagger

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4096, out_features=4096, bias=True)
  (rnn): LSTM(4096, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=47, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)

A quick overview of how it works on our text corpus sentences.

In [9]:
# make example sentence
sentences = [Sentence(sent) for sent in data.text.to_list()[:10]]

# predict NER tags
tagger.predict(sentences)

# print predicted NER spans
for sentence in sentences:
  print()
  print('The following chunks are found in:', sentence.text)
  print()
  # iterate over entities and print
  for entity in sentence.get_spans('np'):
      print(entity)


The following chunks are found in: hi how are you thank you for calling chase bank how can i help you

Span[1:2]: "how" → ADVP (0.9997)
Span[3:4]: "you" → NP (0.9999)
Span[4:5]: "thank" → VP (0.9998)
Span[5:6]: "you" → NP (0.9999)
Span[6:7]: "for" → PP (0.9999)
Span[7:8]: "calling" → VP (1.0)
Span[8:10]: "chase bank" → NP (0.8101)
Span[10:11]: "how" → ADVP (0.9999)
Span[12:13]: "i" → NP (0.9999)
Span[13:14]: "help" → VP (0.9996)
Span[14:15]: "you" → NP (0.9999)

The following chunks are found in: that's really nice i can definitely help you with that can i have your full name

Span[0:1]: "that" → NP (1.0)
Span[1:2]: "'s" → VP (1.0)
Span[2:4]: "really nice" → ADJP (0.8709)
Span[4:5]: "i" → NP (0.9994)
Span[5:8]: "can definitely help" → VP (0.998)
Span[8:9]: "you" → NP (1.0)
Span[9:10]: "with" → PP (1.0)
Span[10:11]: "that" → NP (0.9846)
Span[12:13]: "i" → NP (1.0)
Span[13:14]: "have" → VP (1.0)
Span[14:17]: "your full name" → NP (0.9976)

The following chunks are found in: missus miss 

Nah, the chunk qualities aren't good.

#### Topic Modelling

We will use words instead of entire sentences.

In [60]:
# we don't want our topic model be affected by stop words
stopwords = set(nltk.corpus.stopwords.words('english'))

In [61]:
docs = list(set(
    [word for text in data.Transcription for word in str(text).split() if word not in stopwords]
))

In [62]:
len(docs)

4167

In [63]:
topic_model = BERTopic(verbose=True, nr_topics='auto')
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

2023-06-21 19:32:49,528 - BERTopic - Transformed documents to Embeddings
2023-06-21 19:32:51,660 - BERTopic - Reduced dimensionality
2023-06-21 19:32:51,736 - BERTopic - Clustered reduced embeddings
2023-06-21 19:32:51,805 - BERTopic - Reduced number of topics from 102 to 28


In [64]:
topic_model.visualize_barchart(top_n_topics=26)

In [33]:
topic_model.visualize_documents(docs)

Let's try converting a text from our corpus to representative topics.

In [65]:
text = data.Transcription.to_list()[-100]
text

'[filler/] we have I have high cholesterol and [filler/] one of my family members has had diebetes'

In [66]:
# this is the view the topic model will be provided
topic_model_view = [word for word in text.split() if word not in stopwords]
topic_model_view

['[filler/]',
 'I',
 'high',
 'cholesterol',
 '[filler/]',
 'one',
 'family',
 'members',
 'diebetes']

In [67]:
topic_view = [topic_model.find_topics(word, top_n=5) for word in topic_model_view]

In [68]:
text_topic_rep = [[topic_model.get_topic(topic_id) for topic_id in topics[0]] for topics in topic_view]

In [70]:
print('Text from corpus:', text)
print('Stopwords removed:', topic_model_view)
print('Topics Cluster for this text:')
print()
for word, topics in zip(topic_model_view, text_topic_rep):
    print(word, ':')
    for topic in topics:
        print('-', [topic_word[0] for topic_word in topic])
    print()


Text from corpus: [filler/] we have I have high cholesterol and [filler/] one of my family members has had diebetes
Stopwords removed: ['[filler/]', 'I', 'high', 'cholesterol', '[filler/]', 'one', 'family', 'members', 'diebetes']
Topics Cluster for this text:

[filler/] :
- ['riteaid', 'allegra', 'clarinet', 'aid', 'walgreens', 'george', 'smith', 'god', 'daniels', 'am']
- ['jones', 'com', 'connecticut', 'gmail', 'and', 'west', 'nevada', 'my', 'made', 'candy']
- ['whole', 'together', 'separately', 'portion', 'mixup', 'mixing', 'mixes', 'mixed', 'mix', 'married']
- ['preparation', 'solves', 'math', 'formulation', 'formulate', 'formulas', 'formula', 'factors', 'factor', 'equation']
- ['hospital', 'pharmacy', 'medication', 'medic', 'advil', 'amoxicillin', 'med', 'lithium', 'drugs', 'drug']

I :
- ['jones', 'com', 'connecticut', 'gmail', 'and', 'west', 'nevada', 'my', 'made', 'candy']
- ['riteaid', 'allegra', 'clarinet', 'aid', 'walgreens', 'george', 'smith', 'god', 'daniels', 'am']
- ['id'