# NBME EDA
When performing EDA on NLP data, we should consider three things:
1. **Top Words** :: Find the most common words used in each feature
2. **Vocabulary** :: Take a look at the number of unique words used
3. **Amount of profanity** :: Note the number of swear words used

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the features table from the competition input directory.
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
# Bare expression so the notebook renders the frame as the cell output.
features

In [None]:
# Load the patient notes table from the competition input directory.
p_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')
# Bare expression so the notebook renders the frame as the cell output.
p_notes
# pn_history - The text of the encounter as recorded by the test taker

In [None]:
# Load the training table from the competition input directory.
df = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
# Bare expression so the notebook renders the frame as the cell output.
df

# pn_num - A unique identifier for each patient note.
# case_num - A unique identifier for the clinical case a patient note represents.
# pn_history - The text of the encounter as recorded by the test taker
# NOTE(review): this notebook only reads pn_history from p_notes — confirm whether train.csv has that column.

## Analysis of Training file

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df.info()

* There are no missing values in the dataset
* case_num, pn_num and feature_num can be stored as object, as we don't need to perform any mathematical operations on them
* location - Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon
* annotation - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.

In [None]:
# Number of distinct values in the `id` column of the training frame.
df['id'].nunique()

In [None]:
# pn_num is described as the unique identifier of a patient note, so this
# counts the distinct notes referenced by the training frame.
print('Total Number of patients')
df['pn_num'].nunique()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.describe(include='object')

In [None]:
# Display the raw annotation column.
df['annotation']

In [None]:
### Non-null value count of every p_notes column, grouped by case_num
p_notes.groupby(by="case_num").count()

In [None]:
### Non-null value count of every features column, grouped by case_num
features.groupby(by="case_num").count()

# <p style="background-color:#73d1ff;font-family:newtimeroman;color:#000000;font-size:120%;text-align:center;border-radius:20px 80px;">💭 WordCloud</p>
### Since most of the data is of object type, let's plot word clouds to get a quick idea of the frequent words

In [None]:
from wordcloud import WordCloud, STOPWORDS

stopwords = STOPWORDS
wc = WordCloud(background_color='Black', stopwords=stopwords, height=1080, width=1920)

# One entry per panel: (text column, label shown in the title, output image file).
cloud_specs = [
    (df.annotation, "Annotation", 'body.png'),
    (p_notes.pn_history, "pn_history", 'pn_history.png'),
    (features.feature_text, "feature_text", 'feature_text.png'),
]

plt.figure(figsize=(22, 30))
for panel, (column, label, out_file) in enumerate(cloud_specs, start=1):
    plt.subplot(len(cloud_specs), 1, panel)
    # Join the raw values rather than calling Series.to_string(): to_string()
    # produces a display rendering (padded, and subject to pandas display
    # options such as display.max_colwidth, which can shorten long strings),
    # so word frequencies would not reflect the full text.
    body = ' '.join(column.dropna().astype(str))
    ### Generate word cloud
    wc.generate(body)
    ## Visualize
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Looking Broadly at words of '{label}'", fontsize=15)
    # Also save each cloud as a standalone image next to the notebook.
    wc.to_file(out_file)

# <p style="background-color:#73d1ff;font-family:newtimeroman;color:#000000;font-size:120%;text-align:center;border-radius:20px 80px;">✨ Data Visualization</p>

In [None]:
# One histogram per identifier column of the training frame, stacked vertically.
hist_specs = (
    ('pn_num', 'tab:orange'),
    ('case_num', 'tab:purple'),
    ('feature_num', 'tab:green'),
)

plt.figure(figsize=(14, 16))
for row, (col_name, bar_color) in enumerate(hist_specs, start=1):
    plt.subplot(3, 1, row)
    df[col_name].hist(edgecolor='black', linewidth=1.3, bins=20, color=bar_color)
    plt.grid(False)
    plt.title(f'Hist plot of {col_name}', fontsize=20)

# Trailing semicolon suppresses the cell's text output.
plt.tight_layout();

In [None]:
### Features and patient notes counted per case number

# Non-null counts of every column, grouped by case_num.
pc = p_notes.groupby("case_num").count()
fc = features.groupby("case_num").count()

# (grouped counts, column plotted on y, panel title), left panel first.
bar_panels = (
    (fc, 'feature_num', 'Distribution of Features for each case'),
    (pc, 'pn_num', 'Distribution of patient notes for each case'),
)

plt.figure(figsize=(18, 6))
for slot, (counts, y_col, heading) in enumerate(bar_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.barplot(x=counts.index, y=y_col, data=counts, palette='hls')
    plt.title(heading, fontsize=15)
# Trailing semicolon suppresses the cell's text output.
plt.tight_layout();


<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp;<b><u>Observations:</u></b>
 
* <i> There are total of ```3``` columns and `143` rows in ```Features``` data.</i>
* <i> Features Data contains ```429``` observations with ```0```  missing values.</i>
* <i> Number of features per case is unequally distributed with `Case 5` and `Case 8` having maximum and `Case 7` having minimum.</i>
* <i> Average length of `feature_text` column is `23.20`. </i>  
**Patient Notes:**
* <i> There are total of ```3``` columns and `42146` rows in ```Patient Notes``` data.</i>
* <i> Patient Notes Data contain ```126438``` observation with ```0```  missing values.</i>
* <i> Number of patients per case are unequally distributed with `Case 3` having maximum and `Case 1` having minimum.</i>
* <i> Average length of `pn_history` column is `818.17`. </i>

# <p style="background-color:#73d1ff;font-family:newtimeroman;color:#000000;font-size:120%;text-align:center;border-radius:20px 80px;">🧹 Data Cleaning</p>

In [None]:
### The annotation values look oddly formatted, so inspect the first one as-is
df.loc[0, 'annotation']

In [None]:
# First 50 annotation values.
df['annotation'].head(n=50)

In [None]:
### Strip '[', ']' and single quotes from the annotation strings
# The pattern is a regex character class, so regex=True is passed explicitly:
# depending on the pandas version, str.replace may otherwise treat the pattern
# as a literal string. The previous pattern '[[,]]' matched neither brackets
# nor quotes individually, leaving the values effectively unchanged.
# NOTE(review): this also removes apostrophes inside the annotated text and
# leaves double quotes untouched — confirm that is acceptable downstream.
df['annotation_new'] = df['annotation'].str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Copy of the training frame without the raw annotation column.
df_senti = df.drop(columns='annotation')
df_senti

### How to remove "[", "]" and "'" from the dataframe?

In [None]:
### Random sample of 60 notes; the notes are said to contain the patient's age and gender
p_notes['pn_history'].sample(n=60)

In [None]:
# Full text of the note at index label 10249.
p_notes.loc[10249, 'pn_history']

In [None]:
# Full text of the note at index label 624.
p_notes.loc[624, 'pn_history']

In [None]:
# First 60 feature descriptions.
features['feature_text'].head(n=60)

In [None]:
# Feature description at index label 43.
features.loc[43, 'feature_text']

In [None]:
# Feature description at index label 23.
features.loc[23, 'feature_text']

# <p style="background-color:#73d1ff;font-family:newtimeroman;color:#000000;font-size:120%;text-align:center;border-radius:20px 80px;">😊😐☹️ Sentiment Analysis</p>