![](https://i.imgur.com/o0xPvMY.png)

In [None]:
!pip install textstat

import ast
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import textstat
import wandb
nlp = spacy.load('en_core_web_sm')

from termcolor import colored
from wordcloud import WordCloud,STOPWORDS
from spacy import displacy
from nltk.tokenize import sent_tokenize, word_tokenize 

import warnings
warnings.filterwarnings("ignore")

<img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67">

I will be integrating W&B for visualizations and logging artifacts!

> [NBME - Score Clinical Patient Notes](https://wandb.ai/ruchi798/nbme?workspace=user-ruchi798)🏋️‍♀️
> 
> - To get an API key, first create an account on the [website](https://wandb.ai/home).
> - Next, store the key as a Kaggle secret so that it is not exposed in the notebook 🤫

In [None]:
# Fetch the W&B API key from Kaggle's secret store instead of hard-coding it in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("api_key")

# Run metadata. Not referenced by any other cell visible in this notebook.
# NOTE(review): presumably meant to be passed as `config=` to wandb.init — confirm or remove.
CONFIG = {'competition': 'nbme', '_wandb_kernel': 'ruch'}

# NOTE(review): expected to silence wandb's console logging — confirm against the W&B env-var docs.
os.environ["WANDB_SILENT"] = "true"

In [None]:
# Authenticate through the Python API rather than `! wandb login $api_key`, so the
# secret is never interpolated into a shell command line.
# The trailing semicolon suppresses the boolean result in the cell output.
wandb.login(key=api_key);

In [None]:
# Global seaborn style: "notebook" context with fonts scaled 1.5x and 2.5pt lines.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

In [None]:
# Directory holding the competition CSV files.
BASE_PATH = "../input/nbme-score-clinical-patient-notes/"

# Read the five CSV files in a fixed order and bind each to its own DataFrame.
features_df, patient_notes_df, train_df, test_df, submission_df = (
    pd.read_csv(BASE_PATH + file_name)
    for file_name in (
        "features.csv",
        "patient_notes.csv",
        "train.csv",
        "test.csv",
        "sample_submission.csv",
    )
)

```features.csv```

A rubric describes the key concepts relevant to each case.

* feature_num - A unique identifier for each feature.
* case_num - A unique identifier for each case.
* feature_text - A description of the feature.

In [None]:
# Preview the first rows of the feature rubric.
features_df.head()

In [None]:
# Number of distinct values in each column of the feature rubric.
features_df.nunique()

```patient_notes.csv``` 

Text detailing important information related by the patient during the encounter (physical exam and interview)

* pn_num - A unique identifier for each patient note.
* case_num - A unique identifier for the clinical case a patient note represents.
* pn_history - The text of the encounter as recorded by the test taker.

In [None]:
# Preview the first rows of the patient notes.
patient_notes_df.head()

In [None]:
# Number of distinct values in each column of the patient notes.
patient_notes_df.nunique()

```train.csv```
* id - Unique identifier for each patient note / feature pair.
* pn_num - The patient note annotated in this row.
* feature_num - The feature annotated in this row.
* case_num - The case to which this patient note belongs.
* annotation - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
* location - Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon ;.

In [None]:
# Preview the first rows of the training annotations.
train_df.head()

In [None]:
# Number of distinct values in each column of the training annotations.
train_df.nunique()

<center><img src="https://raw.githubusercontent.com/github/explore/8cf1837393d83900e767cc895dcc814d053e2ffe/topics/spacy/spacy.png"></center>

#### 📝 Using spaCy for visualizing annotations, NER and POS tagging! 

# 👀 Annotations, NER & POS

In [None]:
def _parse_location_spans(location_lists):
    """Flatten parsed ``location`` cells into unique, sorted ``(start, end)`` spans.

    Each cell is a list of strings. A string holds either one span
    ("696 724") or several spans delimited by semicolons ("404 413;421 434").
    """
    spans = set()
    for location in location_lists:
        for entry in location:
            for segment in entry.split(';'):
                if not segment.strip():
                    continue
                start, end = segment.split()
                spans.add((int(start), int(end)))
    return sorted(spans)


def patient_data(pn_num):
    """Render the annotations, NER, a dependency/POS view and the features of one note.

    Parameters
    ----------
    pn_num : int
        Patient-note identifier, matched against ``train_df['pn_num']`` and
        ``patient_notes_df['pn_num']``.

    Raises
    ------
    ValueError
        If ``patient_notes_df`` does not hold exactly one note for ``pn_num``
        (raised by ``Series.item()``), or if a location string is malformed.

    Output is printed and rendered with displaCy; nothing is returned.
    """
    subset = train_df[train_df['pn_num'] == pn_num]
    features_lst = subset['feature_num'].tolist()

    # The column stores Python list literals as text. literal_eval parses them
    # without executing arbitrary code the way eval would.
    locations = subset['location'].apply(ast.literal_eval)

    print("*"*80)
    print(colored("Patient Number: " + str(pn_num), 'green'))
    patient_history = patient_notes_df[patient_notes_df['pn_num']==pn_num]['pn_history'].item()

    print(colored("\nAnnotated Patient History", 'green'))

    # Each span is emitted once: duplicated entities make displaCy's manual
    # renderer repeat the highlighted text.
    ents = [
        {'start': start, 'end': end, 'label': "Annotation"}
        for start, end in _parse_location_spans(locations)
    ]

    doc = {
        'text' : patient_history,
        'ents' : ents
    }
    colors = {"Annotation" :"linear-gradient(to right, #2980b9, #6dd5fa, #ffffff);" }
    options = {"colors": colors}
    displacy.render(doc, style='ent', options=options, manual=True, jupyter=True)

    print(colored("\nVisualizing NER", 'green'))
    doc = nlp(patient_history)
    displacy.render(doc, style='ent', jupyter = True)

    print(colored("\nVisualizing POS tagging", 'green'))
    # Only the sentence with the most word tokens is drawn.
    sentences = sent_tokenize(patient_history)
    word_count = lambda sentence: len(word_tokenize(sentence))
    pos_text = max(sentences, key=word_count)
    doc = nlp(pos_text)
    displacy.render(doc, style="dep")

    print(colored("\nFeatures", 'green'))
    for feature_num in features_lst:
        # Select by feature_num and take the first hit positionally. Indexing
        # the result with [feature_num] would look up a row label instead.
        matches = features_df.loc[features_df['feature_num'] == feature_num, 'feature_text']
        print(colored(matches.iloc[0], 'blue'))
        
# Render three example patient notes.
patient_data(16)
patient_data(46)
patient_data(100)

# ☁️ WordClouds

In [None]:
# color function for the wordcloud
def color_wc(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
    """Return an HSL colour string with fixed hue and saturation and a random lightness.

    Only ``random_state`` is used; it must provide ``randint(low, high)``.
    The other parameters are accepted but ignored.
    """
    hue = int(360.0 * 150.0 / 255.0)
    saturation = int(100.0 * 255.0 / 255.0)
    # Draw a shade in the 80-120 range and rescale it from 0-255 to a percentage.
    shade = random_state.randint(80, 120)
    lightness = int(100.0 * float(shade) / 255.0)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)

def create_wordcloud(df, col):
    """Draw a word cloud of a text column and log the image to W&B.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    col : str
        Column whose values are joined with spaces to form the cloud's text.
        Missing values are skipped and other values are converted to ``str``.

    The image is logged under the key ``"wordcloud_" + col`` in a run named
    ``wordCloud`` of the ``nbme`` project.
    """
    print(colored(col, 'green'))

    # Using the run as a context manager finishes it even when cloud
    # generation or logging raises, so no run is left open.
    with wandb.init(project='nbme', job_type='image-visualization', name='wordCloud'):
        fig, ax = plt.subplots(figsize=(16, 8))
        wc = WordCloud(
            stopwords=STOPWORDS,
            background_color="white",
            contour_width=2,
            contour_color='blue',
            width=1500,
            height=750,
            color_func=color_wc,
            max_words=150,
            max_font_size=256,
            random_state=42,
        )
        # str.join needs strings only, so drop NaN and coerce the rest.
        wc.generate(' '.join(df[col].dropna().astype(str)))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis('off')

        wc_name = "wordcloud_" + col
        wandb.log({wc_name: [wandb.Image(fig, caption="Wordcloud")]})

# Word cloud of the training annotations.
create_wordcloud(train_df, 'annotation')

In [None]:
# Word cloud of the feature descriptions.
create_wordcloud(features_df, 'feature_text')

In [None]:
# Word cloud of the patient note text.
create_wordcloud(patient_notes_df, 'pn_history')

# 📊 Text properties 

In [None]:
#====== Function to plot wandb histogram ======
def plot_wb_hist(df, name, title):
    """Log a W&B histogram of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to plot.
    name : str
        Column to plot. Also used as the W&B run name and the table column name.
    title : str
        Chart title. Also used as the key the chart is logged under.
    """
    # Using the run as a context manager finishes it even when building the
    # table or logging raises, so no run is left open.
    with wandb.init(project='nbme', job_type='image-visualization', name=name):
        rows = [[value] for value in df[name]]
        table = wandb.Table(data=rows, columns=[name])
        wandb.log({title: wandb.plot.histogram(table, name, title=title)})
    
def avg_word_len(df):
    """Return the mean length of the whitespace-separated tokens in each string.

    ``df`` is a pandas Series of strings, despite the parameter name. The
    result is a Series with the same index.
    """
    tokens = df.str.split()
    return tokens.map(lambda words: np.mean([len(word) for word in words]))

def plot_distribution(text_props, num_sub):
    """Draw KDE plots of the text-property columns side by side.

    Parameters
    ----------
    text_props : pandas.DataFrame
        Must contain ``text_len`` and ``avg_text``. ``lexicon_count`` is also
        needed unless ``num_sub`` is 2.
    num_sub : int
        Number of subplot columns. With 2, only the character-count and
        average-word-length panels are drawn; otherwise the word-count panel
        is drawn as a third.
    """
    fig, axes = plt.subplots(1, num_sub, figsize=(20, 10))

    # (column, line colour, panel title), in left-to-right order.
    panels = [
        ("text_len", "#7209B7", "Character count distribution"),
        ("avg_text", "#FFBA08", "Average word length distribution"),
    ]
    if num_sub != 2:
        panels.append(("lexicon_count", "#F72585", "Word count distribution"))

    for position, (column, line_color, panel_title) in enumerate(panels):
        sns.kdeplot(data=text_props, x=column, color=line_color, ax=axes[position])
        axes[position].set_title(panel_title, font="Serif")

    plt.tight_layout()
    fig.subplots_adjust(wspace=0.2, hspace=0.2, top=0.93)
    plt.show()
    
def text_properties(df, col, num_sub):
    """Compute per-row text statistics for a column and plot their distributions.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is copied, not modified.
    col : str
        Name of the text column to profile.
    num_sub : int
        Number of panels, forwarded to ``plot_distribution``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with three added columns: ``text_len`` (character
        count), ``lexicon_count`` (``textstat.lexicon_count`` of the text) and
        ``avg_text`` (mean word length).

    NOTE(review): if ``col`` holds stringified lists (as the annotation column
    of train.csv appears to), the counts include the brackets and quotes.
    Confirm that is intended.
    """
    text_props = df.copy()

    text_props['text_len'] = df[col].str.len()
    # Iterate over the values themselves. The previous df[col][i] was a label
    # lookup and failed on any frame without a default 0..n-1 index.
    text_props['lexicon_count'] = [textstat.lexicon_count(text) for text in df[col]]
    text_props['avg_text'] = avg_word_len(df[col])

    print(colored(col, 'green'))
    plot_distribution(text_props, num_sub)
    return text_props
    
# Profile each text column, then log one W&B histogram per computed metric.
# Each entry: (frame, text column, number of KDE panels, ((metric column, chart title), ...)).
# Order is preserved, so `text_props` ends up holding the pn_history profile.
for frame, text_col, n_panels, charts in (
    (train_df, 'annotation', 3, (
        ("text_len", "annotation: Character Count Distribution"),
        ("lexicon_count", "annotation : Word Count Distribution"),
        ("avg_text", "annotation : Average Word Length Distribution"),
    )),
    (features_df, 'feature_text', 2, (
        ("text_len", "feature_text : Character Count Distribution"),
        ("avg_text", "feature_text : Average Word Length Distribution"),
    )),
    (patient_notes_df, 'pn_history', 3, (
        ("text_len", "pn_history : Character Count Distribution"),
        ("lexicon_count", "pn_history : Word Count Distribution"),
        ("avg_text", "pn_history : Average Word Length Distribution"),
    )),
):
    text_props = text_properties(frame, text_col, n_panels)
    for metric, chart_title in charts:
        plot_wb_hist(text_props, metric, chart_title)

# 🔤 Abbreviations used 

In [None]:
# Collect the short prefixes that precede the first colon on the first line of
# each patient note (for example section labels or abbreviations).
pl = patient_notes_df['pn_history'].tolist()
pl1 = [note.split('\n', 1)[0] for note in pl]

# Raw string: '\:' in a plain literal is an invalid escape sequence and
# triggers a warning on recent Python versions. A colon needs no escaping.
pattern = re.compile(r'(.*?):')
vals = []
for first_line in pl1:
    match = pattern.search(first_line)
    # Keep only prefixes shorter than 10 characters.
    if match and len(match.group(1)) < 10:
        vals.append(match.group(1))

# Sorted so the output is the same on every run.
print(sorted(set(vals)))

Here's a snapshot of my [project](https://wandb.ai/ruchi798/nbme?workspace=user-ruchi798) ⬇️

![](https://i.imgur.com/5we1jz6.png)
