# Runtime ≈	1 minute




# This notebook completes the process of wrangling the text for EDA and other future analyses.

# The processing is the following order:
*   Scispacy - Acronyms
*   General Cleaning
* Spacy - Lemmatization

In [1]:
try:
    from google.colab import drive

    drive.mount('./drive/')

    %cd drive/My \ Drive/Text_Summarization

except:
  
    print("No Colab Environment")

No Colab Environment


In [2]:
import json
import pandas as pd
import numpy as np
import re
import seaborn as sns

import matplotlib

matplotlib.rcParams["figure.figsize"] = (20, 7)

# Load Data

In [3]:
with open("../Data/raw/Telehealth_article_texts.txt") as f:
    #Skip header
    for i in range(4):
        next(f) 
    corpus = f.read()

In [4]:
dict_articles = json.loads(corpus.replace("\n",""))
df_articles = pd.DataFrame.from_dict(dict_articles,orient="index",columns=["Content"]).reset_index().drop(columns=["index"])

df_articles.head()

Unnamed: 0,Content
0,Veterans face a variety of stressors related t...
1,The impact of the COVID-19 pandemic on the chi...
2,"In November 2019, COVID-19—the infectious, hig..."
3,Anxiety and depressive disorders are among the...
4,"In mid-March, 2020, the authors—as well as the..."


In [5]:
df_metadata = pd.read_excel("../Data/raw/Metadata_telehealth_article_key_2.25.xlsx",sheet_name="Tied_to_Notebook",index_col="Index")

df_metadata.head()

Unnamed: 0_level_0,Journal Title,Article Title,Date Published,Authors,Abstract,Keywords,Citation
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Psychological Servies,The Effectiveness of Telepsychology With Veter...,2021,Michael J. McClellan; Richard Osbaldiston; Ron...,Veterans face a variety of stressors due to th...,"KEYWORDS:\n\ntelepsychology, meta-analysis, ve...","McClellan, M. J., Osbaldiston, R., Wu, R., Yea..."
1,"Psychology, Public Policy, and Law",Making the Case for Videoconferencing and Remo...,2021,Milfred D. Dale; Desiree Smith,The COVID-19 pandemic and its requirements for...,"KEYWORDS:\n\nremote child custody evaluations,...","Dale, M. D., & Smith, D. (2021). Making the ca..."
2,"Psychology, Public Policy, and Law","Forensic E-Mental Health: Review, Research Pri...",2021,Lauren E. Kois; Jennifer Cox; Ashley T. Peck,Forensic e-mental health is an area of psychol...,"KEYWORDS:\n\nforensic e-mental health, telehea...","Kois, L. E., Cox, J., & Peck, A. T. (2021). Fo..."
3,Training and Education in Professional Psychology,Moving Toward a New Era of Telepsychology in U...,2021,Allison L. Baier; Sarah Danzo,Many university training clinics are facing nu...,"KEYWORDS:\n\neducation and training, telepsych...","Baier, A. L., & Danzo, S. (2021). Moving towar..."
4,Practice Innovations,Navigating Changes in the Physical and Psychol...,2021,Liat Shklarski; Allison Abrams; Elana Bakst,The emergence of the Covid-19 pandemic at the ...,"KEYWORDS:\n\nremote psychotherapy, Covid-19, p...","Shklarski, L., Abrams, A., & Bakst, E. (2021)...."


In [6]:
#Ensure both Indexes are of same type before merge
assert df_metadata.index.dtype == df_articles.index.dtype

#Merge dataframes
df_metadata = df_metadata.merge(df_articles,left_index=True,right_index=True,how="left")

df_metadata.head()

Unnamed: 0_level_0,Journal Title,Article Title,Date Published,Authors,Abstract,Keywords,Citation,Content
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Psychological Servies,The Effectiveness of Telepsychology With Veter...,2021,Michael J. McClellan; Richard Osbaldiston; Ron...,Veterans face a variety of stressors due to th...,"KEYWORDS:\n\ntelepsychology, meta-analysis, ve...","McClellan, M. J., Osbaldiston, R., Wu, R., Yea...",Veterans face a variety of stressors related t...
1,"Psychology, Public Policy, and Law",Making the Case for Videoconferencing and Remo...,2021,Milfred D. Dale; Desiree Smith,The COVID-19 pandemic and its requirements for...,"KEYWORDS:\n\nremote child custody evaluations,...","Dale, M. D., & Smith, D. (2021). Making the ca...",The impact of the COVID-19 pandemic on the chi...
2,"Psychology, Public Policy, and Law","Forensic E-Mental Health: Review, Research Pri...",2021,Lauren E. Kois; Jennifer Cox; Ashley T. Peck,Forensic e-mental health is an area of psychol...,"KEYWORDS:\n\nforensic e-mental health, telehea...","Kois, L. E., Cox, J., & Peck, A. T. (2021). Fo...","In November 2019, COVID-19—the infectious, hig..."
3,Training and Education in Professional Psychology,Moving Toward a New Era of Telepsychology in U...,2021,Allison L. Baier; Sarah Danzo,Many university training clinics are facing nu...,"KEYWORDS:\n\neducation and training, telepsych...","Baier, A. L., & Danzo, S. (2021). Moving towar...",Anxiety and depressive disorders are among the...
4,Practice Innovations,Navigating Changes in the Physical and Psychol...,2021,Liat Shklarski; Allison Abrams; Elana Bakst,The emergence of the Covid-19 pandemic at the ...,"KEYWORDS:\n\nremote psychotherapy, Covid-19, p...","Shklarski, L., Abrams, A., & Bakst, E. (2021)....","In mid-March, 2020, the authors—as well as the..."


In [7]:
JournalCrosswalk = pd.read_excel('../Data/raw/JournalTitles.xlsx')
JournalCrosswalk.set_index('Journal', inplace=True)

In [8]:
JournalCrosswalk.head()

Unnamed: 0_level_0,Journal Code,Peer Review System,Journal Subfield
Journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Asian American Journal of Psychology,AAP,EM (AJP),Social Psychology
Journal of Abnormal Psychology,ABN,EM,Clinical & Counseling Psychology
"Psychology of Aesthetics, Creativity, and the Arts",ACA,EM,Social Psychology
Psychology of Addictive Behaviors,ADB,EM,Health Psychology & Medicine
American Psychologist,AMP,EM,General Psychology


In [9]:
df_metadata.dtypes

Journal Title     object
Article Title     object
Date Published     int64
Authors           object
Abstract          object
Keywords          object
Citation          object
Content           object
dtype: object

In [10]:
df_metadata["Content_Length"] = df_metadata["Content"].apply(lambda text: len(text))
df_metadata["Abstract_Length"] = df_metadata["Abstract"].apply(lambda text: len(str(text)))
df_metadata["Parsed_Keywords"] = df_metadata["Keywords"].apply(lambda keywords: str(keywords).replace("\n\n"," ").split()[1:])
df_metadata["Parsed_Keywords_Length"] = df_metadata["Parsed_Keywords"].apply(lambda text: len(text))
df_metadata["Journal Title"] = df_metadata["Journal Title"].replace('Psychological Servies', 'Psychological Services')
subfield = []
for i in df_metadata['Journal Title']:
  #print(i)
  try:
    if i == 'Clinical Psychology: Science and Practice': # Journal is missing from Crosswalk
      subfield.append('Clinical & Counseling Psychology')
    else:
      subfield.append(JournalCrosswalk['Journal Subfield'][i])
  except:
    subfield.append('No Match')
    
df_metadata["Subfield"] = subfield

#Have an idea of reference amount per document
df_metadata["et_al_Count"] = df_metadata["Content"].apply(lambda text: len(list(re.finditer("et al",text))))


In [11]:
df_metadata.head()

Unnamed: 0_level_0,Journal Title,Article Title,Date Published,Authors,Abstract,Keywords,Citation,Content,Content_Length,Abstract_Length,Parsed_Keywords,Parsed_Keywords_Length,Subfield,et_al_Count
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Psychological Services,The Effectiveness of Telepsychology With Veter...,2021,Michael J. McClellan; Richard Osbaldiston; Ron...,Veterans face a variety of stressors due to th...,"KEYWORDS:\n\ntelepsychology, meta-analysis, ve...","McClellan, M. J., Osbaldiston, R., Wu, R., Yea...",Veterans face a variety of stressors related t...,37477,2411,"[telepsychology,, meta-analysis,, veteran,, vi...",5,Clinical & Counseling Psychology,50
1,"Psychology, Public Policy, and Law",Making the Case for Videoconferencing and Remo...,2021,Milfred D. Dale; Desiree Smith,The COVID-19 pandemic and its requirements for...,"KEYWORDS:\n\nremote child custody evaluations,...","Dale, M. D., & Smith, D. (2021). Making the ca...",The impact of the COVID-19 pandemic on the chi...,74025,1577,"[remote, child, custody, evaluations,, videoco...",10,Forensic Psychology,48
2,"Psychology, Public Policy, and Law","Forensic E-Mental Health: Review, Research Pri...",2021,Lauren E. Kois; Jennifer Cox; Ashley T. Peck,Forensic e-mental health is an area of psychol...,"KEYWORDS:\n\nforensic e-mental health, telehea...","Kois, L. E., Cox, J., & Peck, A. T. (2021). Fo...","In November 2019, COVID-19—the infectious, hig...",63568,1842,"[forensic, e-mental, health,, telehealth,, tel...",8,Forensic Psychology,128
3,Training and Education in Professional Psychology,Moving Toward a New Era of Telepsychology in U...,2021,Allison L. Baier; Sarah Danzo,Many university training clinics are facing nu...,"KEYWORDS:\n\neducation and training, telepsych...","Baier, A. L., & Danzo, S. (2021). Moving towar...",Anxiety and depressive disorders are among the...,30147,1650,"[education, and, training,, telepsychology,, C...",6,Clinical & Counseling Psychology,21
4,Practice Innovations,Navigating Changes in the Physical and Psychol...,2021,Liat Shklarski; Allison Abrams; Elana Bakst,The emergence of the Covid-19 pandemic at the ...,"KEYWORDS:\n\nremote psychotherapy, Covid-19, p...","Shklarski, L., Abrams, A., & Bakst, E. (2021)....","In mid-March, 2020, the authors—as well as the...",42367,1871,"[remote, psychotherapy,, Covid-19,, pandemic,,...",6,Clinical & Counseling Psychology,14


In [12]:
#troubleshooting
#emental health

#df_metadata["Content"].iloc[2][4633:5000]
#df_metadata["Clean_Content"].iloc[2][4633:5000]

## Acronyms - Include as Vocabulary for Paper

In [13]:
#Sci Spacy
#!pip install scispacy
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

#Spacy org
#!pip install spacy
#!python3 -m spacy download en_core_web_sm
#!python3 -m spacy download en_core_web_md


## Source: https://youtu.be/2_HSKDALwuw?t=708
## Abbreviation Detector Works by:
## 1. Finding Parentheses
## 2. Look up to 10 words behind the bracket
## 3. Greedily choose definition: Look for words next to each other, that in the right order start with the letters in the acronym

In [14]:
#ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing: https://www.semanticscholar.org/paper/ScispaCy%3A-Fast-and-Robust-Models-for-Biomedical-Neumann-King/de28ec1d7bd38c8fc4e8ac59b6133800818b4e29
#https://github.com/allenai/SciSpaCy
import spacy
from scispacy.abbreviation import AbbreviationDetector


nlp = spacy.load("en_core_web_md")
nlp.add_pipe("abbreviation_detector")

nlp.Defaults.stop_words |= {"PRON","ll","ve","eg"}

corpus = df_metadata["Content"]

docs = list(nlp.pipe(corpus,disable=["ner","parser","textcat"]))




In [15]:
abrv_dict = dict()

for index,doc in enumerate(docs):

    for abrv in doc._.abbreviations:
        
        if str(abrv) not in abrv_dict.keys():

            abrv_dict[str(abrv)] = {"LongForm":str(abrv._.long_form),"Document":{index}}
            
        else:
            abrv_dict[str(abrv)]["Document"].add(index)

        #print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form} \t Document: {index}")

In [16]:
#Source https://stackoverflow.com/questions/22281059/set-object-is-not-json-serializable

def set_default(obj):
    if isinstance(obj, set):
        return list(obj)
    raise TypeError

In [17]:
#Write original abbriviation dictionary
with open("../references/abbreviation_table.json","w") as f:
    
    json.dump(abrv_dict, f, indent = 4,default=set_default)
    

#Remove misidentified abbreviations
with open("../references/Incorrect_abbrev.json", "r") as f:
    
    misidentified_abrv = json.load(f)
    
for key in misidentified_abrv.keys():
    
    abrv_dict.pop(key)
    
    
#Correct LongForm of abbreviations    
with open("../references/Abbreviation_corrections.json", "r") as f:
    
    correction_abrv = json.load(f)
    

for key in abrv_dict.keys():
    
    if key in correction_abrv.keys():
        
        abrv_dict[key]["LongForm"] = correction_abrv[key]
        

#Add abbreviations
with open("../references/Add_to_abbreviation_table.json", "r") as f:
    
    add_abrv = json.load(f)
    

for key in add_abrv.keys():
    
    abrv_dict[key] = add_abrv[key]
    
    
        
with open("../references/abbreviation_table_processed.json","w") as f:
    
    json.dump(abrv_dict, f, indent = 4, default=set_default)

In [18]:
abrv_dict["PTSD"]["LongForm"]

'posttraumatic stress disorder'

In [19]:
df_abrv = (pd.read_json("../references/abbreviation_table_processed.json")
           .T
           .reset_index()
           .rename(columns={"index":"Term"}))

df_abrv.head()

Unnamed: 0,Term,LongForm,Document
0,PTSD,posttraumatic stress disorder,"[0, 34, 35, 36, 6, 38, 42, 12, 15, 21, 27]"
1,FTF,face-to-face,"[0, 16]"
2,TAU,treatments as usual,"[0, 40, 21]"
3,PCL,PTSD Checklist,[0]
4,BDI,Beck Depression Inventory,[0]


In [20]:
df_abrv[df_abrv["Term"] == "PTSD"].head()

Unnamed: 0,Term,LongForm,Document
0,PTSD,posttraumatic stress disorder,"[0, 34, 35, 36, 6, 38, 42, 12, 15, 21, 27]"


In [21]:
#Validation
df_abrv[df_abrv["Term"] == "NYH"]

Unnamed: 0,Term,LongForm,Document
154,NYH,New York Harbor Healthcare System,


## Clean Data
1.   Lowercase
2.   Remove Punctuation
3.   White Spaces



In [22]:
documents_tokens = []


for index,doc in enumerate(docs):
    
    document_tokens = []
    
    for token in doc:
        
        #removes stopwords and punct
        if not token.is_stop and not token.is_punct:
            
            if str(token) in abrv_dict.keys():
                
                document_tokens.append(abrv_dict[str(token)]["LongForm"])#Replace short-form with long-form
                
            else:
                
                document_tokens.append(token.lemma_)
            
            
    documents_tokens.append(" ".join(document_tokens))

In [23]:
df_metadata["Stopwords_Lemma_Longform_Clean_Content"] = documents_tokens

In [24]:
df_metadata.head()

Unnamed: 0_level_0,Journal Title,Article Title,Date Published,Authors,Abstract,Keywords,Citation,Content,Content_Length,Abstract_Length,Parsed_Keywords,Parsed_Keywords_Length,Subfield,et_al_Count,Stopwords_Lemma_Longform_Clean_Content
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,Psychological Services,The Effectiveness of Telepsychology With Veter...,2021,Michael J. McClellan; Richard Osbaldiston; Ron...,Veterans face a variety of stressors due to th...,"KEYWORDS:\n\ntelepsychology, meta-analysis, ve...","McClellan, M. J., Osbaldiston, R., Wu, R., Yea...",Veterans face a variety of stressors related t...,37477,2411,"[telepsychology,, meta-analysis,, veteran,, vi...",5,Clinical & Counseling Psychology,50,veteran face variety stressor relate military ...
1,"Psychology, Public Policy, and Law",Making the Case for Videoconferencing and Remo...,2021,Milfred D. Dale; Desiree Smith,The COVID-19 pandemic and its requirements for...,"KEYWORDS:\n\nremote child custody evaluations,...","Dale, M. D., & Smith, D. (2021). Making the ca...",The impact of the COVID-19 pandemic on the chi...,74025,1577,"[remote, child, custody, evaluations,, videoco...",10,Forensic Psychology,48,impact coronavirus disease 2019 pandemic child...
2,"Psychology, Public Policy, and Law","Forensic E-Mental Health: Review, Research Pri...",2021,Lauren E. Kois; Jennifer Cox; Ashley T. Peck,Forensic e-mental health is an area of psychol...,"KEYWORDS:\n\nforensic e-mental health, telehea...","Kois, L. E., Cox, J., & Peck, A. T. (2021). Fo...","In November 2019, COVID-19—the infectious, hig...",63568,1842,"[forensic, e-mental, health,, telehealth,, tel...",8,Forensic Psychology,128,November 2019 coronavirus disease 2019 infecti...
3,Training and Education in Professional Psychology,Moving Toward a New Era of Telepsychology in U...,2021,Allison L. Baier; Sarah Danzo,Many university training clinics are facing nu...,"KEYWORDS:\n\neducation and training, telepsych...","Baier, A. L., & Danzo, S. (2021). Moving towar...",Anxiety and depressive disorders are among the...,30147,1650,"[education, and, training,, telepsychology,, C...",6,Clinical & Counseling Psychology,21,anxiety depressive disorder common impair ment...
4,Practice Innovations,Navigating Changes in the Physical and Psychol...,2021,Liat Shklarski; Allison Abrams; Elana Bakst,The emergence of the Covid-19 pandemic at the ...,"KEYWORDS:\n\nremote psychotherapy, Covid-19, p...","Shklarski, L., Abrams, A., & Bakst, E. (2021)....","In mid-March, 2020, the authors—as well as the...",42367,1871,"[remote, psychotherapy,, Covid-19,, pandemic,,...",6,Clinical & Counseling Psychology,14,mid March 2020 author majority therapist advis...


In [25]:
def unwanted_tokens(text):
    
    docuemnt = text
    
    remove_ngrams = ["large image page new","image page new window", "page new window Download","image page new",
                 "page new window","new window Download","image page","large image","1TABLES figurestablefigure thumbnailtable",
                "FIGUREStable","DOWNLOAD","Download","et al"]
    
    for ngram in remove_ngrams:
        
        docuemnt = docuemnt.replace(ngram,"")
        
    return docuemnt
        
    
    

In [26]:
df_metadata["Clean_Content"] = (df_metadata["Stopwords_Lemma_Longform_Clean_Content"].apply(lambda text: unwanted_tokens(text)))



In [27]:
df_metadata.head()

Unnamed: 0_level_0,Journal Title,Article Title,Date Published,Authors,Abstract,Keywords,Citation,Content,Content_Length,Abstract_Length,Parsed_Keywords,Parsed_Keywords_Length,Subfield,et_al_Count,Stopwords_Lemma_Longform_Clean_Content,Clean_Content
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Psychological Services,The Effectiveness of Telepsychology With Veter...,2021,Michael J. McClellan; Richard Osbaldiston; Ron...,Veterans face a variety of stressors due to th...,"KEYWORDS:\n\ntelepsychology, meta-analysis, ve...","McClellan, M. J., Osbaldiston, R., Wu, R., Yea...",Veterans face a variety of stressors related t...,37477,2411,"[telepsychology,, meta-analysis,, veteran,, vi...",5,Clinical & Counseling Psychology,50,veteran face variety stressor relate military ...,veteran face variety stressor relate military ...
1,"Psychology, Public Policy, and Law",Making the Case for Videoconferencing and Remo...,2021,Milfred D. Dale; Desiree Smith,The COVID-19 pandemic and its requirements for...,"KEYWORDS:\n\nremote child custody evaluations,...","Dale, M. D., & Smith, D. (2021). Making the ca...",The impact of the COVID-19 pandemic on the chi...,74025,1577,"[remote, child, custody, evaluations,, videoco...",10,Forensic Psychology,48,impact coronavirus disease 2019 pandemic child...,impact coronavirus disease 2019 pandemic child...
2,"Psychology, Public Policy, and Law","Forensic E-Mental Health: Review, Research Pri...",2021,Lauren E. Kois; Jennifer Cox; Ashley T. Peck,Forensic e-mental health is an area of psychol...,"KEYWORDS:\n\nforensic e-mental health, telehea...","Kois, L. E., Cox, J., & Peck, A. T. (2021). Fo...","In November 2019, COVID-19—the infectious, hig...",63568,1842,"[forensic, e-mental, health,, telehealth,, tel...",8,Forensic Psychology,128,November 2019 coronavirus disease 2019 infecti...,November 2019 coronavirus disease 2019 infecti...
3,Training and Education in Professional Psychology,Moving Toward a New Era of Telepsychology in U...,2021,Allison L. Baier; Sarah Danzo,Many university training clinics are facing nu...,"KEYWORDS:\n\neducation and training, telepsych...","Baier, A. L., & Danzo, S. (2021). Moving towar...",Anxiety and depressive disorders are among the...,30147,1650,"[education, and, training,, telepsychology,, C...",6,Clinical & Counseling Psychology,21,anxiety depressive disorder common impair ment...,anxiety depressive disorder common impair ment...
4,Practice Innovations,Navigating Changes in the Physical and Psychol...,2021,Liat Shklarski; Allison Abrams; Elana Bakst,The emergence of the Covid-19 pandemic at the ...,"KEYWORDS:\n\nremote psychotherapy, Covid-19, p...","Shklarski, L., Abrams, A., & Bakst, E. (2021)....","In mid-March, 2020, the authors—as well as the...",42367,1871,"[remote, psychotherapy,, Covid-19,, pandemic,,...",6,Clinical & Counseling Psychology,14,mid March 2020 author majority therapist advis...,mid March 2020 author majority therapist advis...


In [28]:
#Add to clean function: Different than a dash ord(8207) compared to 45 for normal dash
print(ord("-"),ord("—"))

45 8212


In [29]:
from yellowbrick.text import DispersionPlot
import sklearn.metrics

try:
    #Troubleshooting tokens to remove
    dispersion_text = [doc.split() for doc in df_metadata["Clean_Content"]]

    other_words = [token.split() for token in ['et al']]

    other_words_1D = np.unique(np.concatenate(other_words).reshape(-1))

    target_words = other_words_1D

    #Create the visualizer and draw the plot
    visualizer = DispersionPlot(target_words,ignore_case=False)
    _ = visualizer.fit(dispersion_text)

except:
    
    print("No words found")

  exec(code_obj, self.user_global_ns, self.user_ns)


No words found


In [30]:
df_metadata["Classification"] = (df_metadata["Date Published"].apply(lambda pub_date: "Covid" 
                                                                     if pub_date >= 2020 else "Pre-Covid" ))

In [None]:
df_metadata.to_csv("../Data/processed/Telehealth.csv",index=False)

# Trouble Shooting

## Dispersion Plot

In [32]:
from yellowbrick.text import DispersionPlot
import sklearn.metrics

try:

    #Troubleshooting tokens to remove
    dispersion_text = [doc.split() for doc in df_metadata["Clean_Content"]]

    other_words = [token.split() for token in ['kbinformation','binformation']]

    other_words_1D = np.unique(np.concatenate(other_words).reshape(-1))

    target_words = other_words_1D

    #Create the visualizer and draw the plot
    visualizer = DispersionPlot(target_words,ignore_case=False)
    _ = visualizer.fit(dispersion_text)
    
except:
    print("No Words to be found")

No Words to be found
