In [None]:
!pip install stylecloud

# <p style="background-color:#73d2de;font-family:newtimeroman;color:#00509d;font-size:150%;text-align:center;border-radius:40px 40px;">NBME SCORE CLINICAL PATIENT NOTES</p>

<h1 align='center'>Introduction 📝</h1>
The goal of this competition is to identify the relevant features within each patient note, with a special focus on the patient history portions of the notes. In this notebook I take a deep dive into the dataset to understand its structure and extract insights from it.

<h1 align='center'>Dataset Info 📈</h1>
<h2>Training Data</h2>
<b>patient_notes.csv - A collection of about 40,000 Patient Note history portions.</b><br>

* ```pn_num``` - A unique identifier for each patient note.
* ```case_num``` - A unique identifier for the clinical case a patient note represents.
* ```pn_history``` - The text of the encounter as recorded by the test taker.

<b>features.csv - The rubric of features (or key concepts) for each clinical case.</b><br>
* ```feature_num``` - A unique identifier for each feature.
* ```case_num``` - A unique identifier for each case.
* ```feature_text``` - A description of the feature.

<b>train.csv - Feature annotations for 1000 of the patient notes, 100 for each of ten cases.</b><br>
* ```id``` - Unique identifier for each patient note / feature pair.
* ```pn_num``` - The patient note annotated in this row.
* ```feature_num``` - The feature annotated in this row.
* ```case_num``` - The case to which this patient note belongs.
* ```annotation``` - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
* ```location``` - Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon ;.

<h1 align='center'>Evaluation Metric 📐</h1>
The competition is evaluated by a micro-averaged F1 score.

<img src="https://user-images.githubusercontent.com/55939250/153265944-04388967-90b3-4fb8-84ce-04c538bcd550.png" width=700px height=400px>

<div class="alert alert-block alert-warning">
    <h2 align='center'>Please upvote if you found this kernel useful.</h2>
</div>

# <p style="background-color:#73d2de;font-family:newtimeroman;color:#00509d;font-size:150%;text-align:center;border-radius:40px 40px;">TABLE OF CONTENTS</p>
<ul style="list-style-type:square">
    <li><a href="#1">Importing Libraries</a></li>
    <li><a href="#2">Reading the data</a></li>
    <li><a href="#3">Explore</a></li>
    <ul style="list-style-type:disc">
        <li><a href="#3.1">Train Data</a></li>
        <li><a href="#3.2">Features Data</a></li>
        <li><a href="#3.3">Patient Notes Data</a></li>
    </ul>
</ul>



<a id='1'></a>
# <p style="background-color:#73d2de;font-family:newtimeroman;color:#00509d;font-size:150%;text-align:center;border-radius:40px 40px;">IMPORTING LIBRARIES</p>

In [None]:
import gc
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
import plotly.express as px
import plotly.figure_factory as ff
import spacy
from spacy import displacy
import stylecloud
from IPython.display import Image
from nltk.corpus import stopwords
from collections import Counter, defaultdict

import warnings
warnings.simplefilter('ignore')


<a id='2'></a>
# <p style="background-color:#73d2de;font-family:newtimeroman;color:#00509d;font-size:150%;text-align:center;border-radius:40px 40px;">READING THE DATA</p>

In [None]:
# Load train.csv into a DataFrame and preview its first rows.
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
train.head()

In [None]:
# Summary of the training frame: columns, dtypes and non-null counts.
train.info()

In [None]:
# Load features.csv into a DataFrame and preview its first rows.
feature = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
feature.head()

In [None]:
# Summary of the features frame: columns, dtypes and non-null counts.
feature.info()

In [None]:
# Load patient_notes.csv into a DataFrame and preview its first rows.
patient_note = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')
patient_note.head()

In [None]:
# Summary of the patient-notes frame: columns, dtypes and non-null counts.
patient_note.info()

<a id='3'></a>
# <p style="background-color:#73d2de;font-family:newtimeroman;color:#00509d;font-size:150%;text-align:center;border-radius:40px 40px;">EXPLORE</p>

<a id='3.1'></a>
# Train Data

### We will start by looking at the distribution of case_num.

In [None]:
# Bar chart of how many training rows belong to each case_num.
fig, ax = plt.subplots(figsize=(15, 9))
sns.countplot(data=train, x='case_num', palette='flare', ax=ax)
ax.set_title('Distribution of Case_Num in Training Data', fontsize=15)
plt.show()

### Then let's look at the distribution of pn_num.

In [None]:
# Histogram of pn_num in the training rows, coloured by case_num.
fig, ax = plt.subplots(figsize=(20, 6))
sns.histplot(data=train, x='pn_num', hue='case_num', bins=50, palette='rainbow', ax=ax)
ax.set_title('Distribution of Pn_Num in Training Data', fontsize=15)
plt.show()

### Next, let's analyse the annotations. First we will look at the number of annotations in each row, and then at the most common words in the annotations.

In [None]:
# `annotation` holds a stringified Python list; its length is the number of
# annotation strings recorded for that (patient note, feature) row.
train['annot_features'] = train['annotation'].apply(lambda x: len(ast.literal_eval(x)))

fig, ax = plt.subplots(1, 2, figsize=(20, 9))
fig.suptitle('Distribution of Number of Annotations', size=15)

# Left panel: how many rows have 0, 1, 2, ... annotations.
sns.countplot(x=train['annot_features'], palette='crest', ax=ax[0])

no_annotations = int((train['annot_features'] == 0).sum())
annotated = len(train) - no_annotations

print('Number of Rows with no Annotations -', no_annotations)
print('Number of Rows with Annotations -', annotated)

# Right panel: share of rows with / without annotations.
# `labels` must be in the same order as `sizes`; the original listed them the
# other way round, so the two pie slices carried each other's label.
sizes = [no_annotations, annotated]
labels = ['No Annotation', 'Annotation']
colors = ['#72CC50', '#54C2CC']
ax[1].pie(sizes, colors=colors, startangle=90, labels=labels,
          autopct='%1.0f%%', pctdistance=0.7, textprops={'fontsize': 12}, counterclock=False)

plt.show()

In [None]:
def join_fea(annotation):
    """Parse a stringified list of annotation strings and return all of their
    whitespace-separated words as one flat list."""
    words = []
    for phrase in ast.literal_eval(annotation):
        words.extend(phrase.split())
    return words

train['text'] = train['annotation'].apply(join_fea)

# Count every word over all rows and keep the 25 most frequent ones.
top = Counter()
for row_words in train['text']:
    top.update(row_words)
df_temp = pd.DataFrame(top.most_common(25))
df_temp.columns = ['Common_words', 'count']

fig = px.bar(
    df_temp,
    x='count',
    y='Common_words',
    title='Most Common Words(including stopwords) in Annotations',
    orientation='h',
    width=900,
    height=700,
    color='Common_words',
)
fig.show()

# Build the stop-word set once. The original rebuilt it inside the
# comprehension, i.e. once for every single word of every annotation.
stop_words = set(stopwords.words('english'))

def join_fea(annotation):
    """Parse a stringified list of annotation strings and return their
    whitespace-separated words, dropping those found in `stop_words`.

    NOTE(review): the membership test is case-sensitive, so a capitalised
    word only gets removed if the stop-word list contains it in that exact
    casing -- confirm whether lower-casing before the check is wanted.
    """
    return [
        word
        for words in ast.literal_eval(annotation)
        for word in words.split()
        if word not in stop_words
    ]

train['text'] = train['annotation'].apply(lambda x : join_fea(x))

# Count every remaining word over all rows and keep the 25 most frequent ones.
top = Counter([word for words in train['text'] for word in words])
df_temp = pd.DataFrame(top.most_common(25))
df_temp.columns = ['Common_words','count']

fig = px.bar(df_temp, x='count', y='Common_words', title='Most Common Words(excluding stopwords) in Annotations', orientation='h', width=900,height=700, color='Common_words')
fig.show()

<a id='3.2'></a>
# Features Data

### Now we will analyse the features data. We will start by looking at the distribution of case_num.

In [None]:
# Bar chart of how many features are defined for each case_num.
fig, ax = plt.subplots(figsize=(15, 9))
sns.countplot(data=feature, x='case_num', palette='Purples_r', ax=ax)
ax.set_title('Distribution of Case_Num in Features Data', fontsize=15)
plt.show()

### Then we will inspect some feature_text properties: the distribution of the number of words and of the average word length.

In [None]:
# feature_text is split on '-' here, i.e. '-' is treated as the word separator.
feature_words = feature['feature_text'].str.split('-')

# Number of '-'-separated words per feature_text.
text_len = feature_words.map(len)
fig = ff.create_distplot([text_len], ['feature'], colors=['#2ca02c'])
fig.update_layout(title_text='Word Count Distribution')
fig.show()

# Mean length (in characters) of the words of each feature_text.
avg_word_len = feature_words.map(lambda words: np.mean([len(word) for word in words]))
fig = ff.create_distplot([avg_word_len], ['feature'], colors=['#ffa408'])
fig.update_layout(title_text='Average Word Length Distribution')
fig.show()

### Now let's perform N-gram analysis on feature_text.

In [None]:
def generate_n_grams(text, ngram=1):
    """Return the n-grams of `text` as a list of space-joined strings.

    `text` is tokenised on '-'; every run of `ngram` consecutive tokens
    becomes one entry. If there are fewer than `ngram` tokens the result
    is an empty list.
    """
    tokens = text.split('-')
    # Zipping the token list against copies of itself shifted by 0..ngram-1
    # yields one tuple per sliding window.
    shifted = (tokens[offset:] for offset in range(ngram))
    return [' '.join(window) for window in zip(*shifted)]

# Plot the 50 most frequent uni-, bi- and tri-grams of feature_text.
# One loop replaces three copy-pasted sections that differed only in the
# n-gram size, the bar colour and the word used in the title.
for n, name, bar_color in [(1, 'UNIGRAM', '#FF4040'),
                           (2, 'BIGRAM', '#00BFFF'),
                           (3, 'TRIGRAM', '#BF3EFF')]:
    counts = Counter(
        gram
        for text in feature['feature_text']
        for gram in generate_n_grams(text, ngram=n)
    )
    # most_common(50) gives the same rows, in the same order, as sorting all
    # items by descending count and slicing the first 50.
    top_ngrams = pd.DataFrame(counts.most_common(50), columns=['ngram', 'count'])

    fig, ax = plt.subplots(figsize=(30, 30))
    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally (the original `sns.barplot(pd2, pd1, ...)` form).
    sns.barplot(x='count', y='ngram', data=top_ngrams, color=bar_color, ax=ax)
    ax.set_xlabel("Count", fontsize=20)
    ax.set_ylabel("Words in dataframe", fontsize=20)
    ax.set_title(f"Top 50 words in {name} ANALYSIS", fontsize=30)
    ax.tick_params(axis='both', labelsize=20)
    plt.show()

## Wordcloud of feature_text

In [None]:
# Reference - https://www.kaggle.com/kapakudaibergenov/stylecloud/notebook
# Split every feature_text on '-' and glue all the pieces into one
# space-separated string for the word cloud.
feature_tokens = (
    token
    for parts in feature['feature_text'].str.split('-')
    for token in parts
)
concat_data = ' '.join(feature_tokens)

stylecloud.gen_stylecloud(
    text=concat_data,
    icon_name='fas fa-eye',
    palette='cmocean.sequential.Matter_10',
    background_color='black',
    gradient='horizontal',
    size=1024,
)

# Display the image file read from ./stylecloud.png.
Image(filename="./stylecloud.png", width=1024, height=768)

<a id='3.3'></a>
# Patient Notes Data
### Lastly, we will analyse the patient_notes data. We will start by looking at the distribution of case_num.

In [None]:
# Bar chart of how many patient notes belong to each case_num.
fig, ax = plt.subplots(figsize=(15, 9))
sns.countplot(data=patient_note, x='case_num', palette='winter', ax=ax)
ax.set_title('Distribution of Case_Num in Patient Notes Data', fontsize=15)
plt.show()

### Then, similarly, we will inspect some properties of the patient history notes: the distribution of the number of words and of the average word length.

In [None]:
fig, (ax_count, ax_length) = plt.subplots(2, 1, figsize=(20, 12))

# Whitespace-tokenise every note once and reuse the tokens for both panels.
note_words = patient_note['pn_history'].str.split()

# Top panel: number of words per note.
text_len = note_words.map(len)
sns.histplot(text_len, element="step", kde=True, color='#2ca02c', ax=ax_count)
ax_count.set_title('Word Count Distribution', size=20)

# Bottom panel: mean word length (in characters) per note.
avg_word_len = note_words.map(lambda words: np.mean([len(word) for word in words]))
sns.histplot(avg_word_len, element="step", kde=True, color='#ffa408', ax=ax_length)
ax_length.set_title('Average Word Length Distribution', size=20)

plt.tight_layout()
plt.show()

### After that let's perform N-gram analysis on patient history notes.

In [None]:
def generate_n_grams(text, ngram=1):
    """Return the n-grams of `text` as a list of space-joined strings.

    `text` is tokenised on whitespace; every run of `ngram` consecutive
    tokens becomes one entry. If there are fewer than `ngram` tokens the
    result is an empty list.
    """
    tokens = text.split()
    # Zipping the token list against copies of itself shifted by 0..ngram-1
    # yields one tuple per sliding window.
    shifted = (tokens[offset:] for offset in range(ngram))
    return [' '.join(window) for window in zip(*shifted)]

# Plot the 50 most frequent uni-, bi- and tri-grams of the patient history notes.
# One loop replaces three copy-pasted sections that differed only in the
# n-gram size, the bar colour and the word used in the title.
for n, name, bar_color in [(1, 'UNIGRAM', '#FF4040'),
                           (2, 'BIGRAM', '#00BFFF'),
                           (3, 'TRIGRAM', '#BF3EFF')]:
    counts = Counter(
        gram
        for text in patient_note['pn_history']
        for gram in generate_n_grams(text, ngram=n)
    )
    # most_common(50) gives the same rows, in the same order, as sorting all
    # items by descending count and slicing the first 50.
    top_ngrams = pd.DataFrame(counts.most_common(50), columns=['ngram', 'count'])

    fig, ax = plt.subplots(figsize=(30, 30))
    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally (the original `sns.barplot(pd2, pd1, ...)` form).
    sns.barplot(x='count', y='ngram', data=top_ngrams, color=bar_color, ax=ax)
    ax.set_xlabel("Count", fontsize=20)
    ax.set_ylabel("Words in dataframe", fontsize=20)
    ax.set_title(f"Top 50 words in {name} ANALYSIS", fontsize=30)
    ax.tick_params(axis='both', labelsize=20)
    plt.show()

## Annotations Visualization

In [None]:
# Reference - https://www.kaggle.com/vanguarde/nbme-eda
# Highlight every annotated span of patient note 224 inside the note text.
nlp = spacy.blank('en')
text = patient_note[patient_note.pn_num==224].pn_history.values[0]
doc = nlp.make_doc(text)

# Non-empty location cells of that note. Each cell is a stringified list such
# as "['696 724']"; it may hold several entries, and one entry may itself
# contain several "start end" spans separated by ';'.
loc = train.loc[(train.pn_num==224) & (train.location!='[]'), 'location']

ents = []
for raw_location in loc:
    # Parse the list instead of stripping "['" / "']" by hand, which only
    # worked for cells holding exactly one single-span entry.
    for entry in ast.literal_eval(raw_location):
        for span_text in entry.split(';'):
            start, end = span_text.split()
            ent = doc.char_span(int(start), int(end), label='annotation')
            # char_span returns None when the offsets do not line up with
            # token boundaries; such spans cannot be stored in doc.ents.
            if ent is not None:
                ents.append(ent)

# doc.ents rejects overlapping spans, so reduce to a non-overlapping set
# (filter_spans prefers the longest span) before assigning.
doc.ents = spacy.util.filter_spans(ents)
color = {"Annotation": '#A32EFF'}
displacy.render(doc, style="ent", jupyter=True, options={'colors': color})

## Wordcloud of patient history notes

In [None]:
# Reference - https://www.kaggle.com/kapakudaibergenov/stylecloud/notebook
# Glue every patient history note (cast to str) into one space-separated
# string for the word cloud.
concat_data = ' '.join(patient_note['pn_history'].astype(str))

stylecloud.gen_stylecloud(
    text=concat_data,
    icon_name='fas fa-tree',
    palette='cartocolors.qualitative.Bold_6',
    background_color='black',
    gradient='horizontal',
    size=1024,
)

# Display the image file read from ./stylecloud.png.
Image(filename="./stylecloud.png", width=1024, height=1024)

<div class="alert alert-block alert-info">
If you are a beginner in NLP, I would recommend my other notebook; it will definitely help you get started with NLP:
</div>
<div class="row" align="center">
    <div class = "card">
      <div class = "card-body" style = "width: 20rem; ">
        <h5 class = "card-title" style = "font-size: 1.2em;" align="center">Natural Language Processing</h5>
          <img src="https://www.asksid.ai/wp-content/uploads/2021/02/an-introduction-to-natural-language-processing-with-python-for-seos-5f3519eeb8368.png" class = "card_img-top" style = "padding: 2% 0;width:19rem;height:10rem;border-radius:30%">
        <p class="card-text" style = "font-size: 1.0em;text-align: center "><b>(Most) NLP Techniques📚</b></p>
        <a href = "https://www.kaggle.com/utcarshagrawal/commonlit-eda-most-nlp-techniques" class = "btn btn-info btn-lg active"  role = "button" style = "color: white; margin: 0 15% 0 25%" data-toggle = "popover" title = "Click">Click here</a>
      </div>
    </div>
  </div>

<div class="alert alert-block alert-info">
    <h2 align='center'>🔎THANK YOU🔎</h2>
    <h2 align='center'>Please consider upvoting the kernel if you found it useful.</h2>
</div>