# Libraries

In [1]:
import time
import pandas as pd
import numpy as np
import re

import json
import os

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px
import plotly.graph_objs as go

# suppress library version/deprecation warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# # Select Palette
# fig = px.colors.sequential.swatches_continuous()
# fig.show()

# Import CSV

## Raw data

In [None]:
# Raw dataset.
path = 'repository/data_prep/Appendicitis.csv'
df = pd.read_csv(path)

# Variant read from the "knn_" CSV.
knn_path = 'repository/data_prep/knn_Appendicitis.csv'
knn_df_imputed = pd.read_csv(knn_path)

# Variant read from the "iter_" CSV.
iter_path = 'repository/data_prep/iter_Appendicitis.csv'
interative_df_imputed = pd.read_csv(iter_path)

In [None]:
# Display label -> DataFrame lookup; insertion order is relied on by the
# cells that plot the three variants side by side.
df_dict = dict(
    Original=df,
    KNN=knn_df_imputed,
    IterativeImputer=interative_df_imputed,
)

In [None]:
# Preview the raw DataFrame (the notebook renders the cell's last expression).
df

# Feature Portability
**Obj**: Group some features into bins so that they are easier to explore in the EDA.

In [None]:
# Summary statistics of the "age" column, used to choose the age bins below.
df["age"].describe()

In [None]:
# Bucket "age" into four bins. pd.cut uses right-closed intervals by default,
# i.e. (0, 10], (10, 20], (20, 30], (30, 40]; ages outside (0, 40] become NaN.
age_edges = [0, 10, 20, 30, 40]
age_labels = ["< 10", "11 - 20", "21 - 30", "31 - 40"]
df["AgeGroup"] = pd.cut(df["age"], bins=age_edges, labels=age_labels)
df['AgeGroup']

In [None]:
# Replace each age-bin label with its ordinal position (0..3).
age_group = {
    label: code
    for code, label in enumerate(['< 10', '11 - 20', '21 - 30', '31 - 40'])
}
df['AgeGroup'] = df['AgeGroup'].map(age_group)
df['AgeGroup']

# Exploratory Data Analysis
Obj: 
- identifying and dealing with missing values, outliers, and inconsistencies in the dataset.
- identifying patterns and relationships, e.g. whether a higher WBC count can distinguish between disease and no disease
- identifying the most informative variables for modeling

## 1. Pairgrid
**Obj** : 
- **Relationships**: the relationship between each pair of variables.
- **Correlations**: the linearity between each pair of variables.
- **Outliers** and etc.

### Original

In [None]:
# Columns drawn in the pair grid of the original data; 'Target' drives the hue.
pair_cols = [
    'WBC', 'RBC', 'Neutrophil', 'Lymphocytes',
    'Urine WBC', 'Urine RBC', 'Leukocytes', 'Urine Ketone',
    'Urine Color', 'Urine Sugar',
    'Target',
]
g = sns.PairGrid(df[pair_cols], hue='Target')
# Upper triangle: KDE contours; lower triangle: filled KDE;
# diagonal: histogram with a KDE overlay.
g.map_upper(sns.kdeplot)
g.map_lower(sns.kdeplot, fill=True)
g.map_diag(sns.histplot, kde=True)

### KNNImputer

In [None]:
# Columns drawn in the pair grid of the KNN data; 'Target' drives the hue.
pair_cols = [
    'WBC', 'RBC', 'Neutrophil', 'Lymphocytes',
    'Urine WBC', 'Urine RBC', 'Leukocytes', 'Urine Ketone',
    'Urine Color', 'Urine Sugar',
    'Target',
]
g = sns.PairGrid(knn_df_imputed[pair_cols], hue='Target')
# Upper triangle: KDE contours; lower triangle: filled KDE;
# diagonal: histogram with a KDE overlay.
g.map_upper(sns.kdeplot)
g.map_lower(sns.kdeplot, fill=True)
g.map_diag(sns.histplot, kde=True)


### IterativeImputer

In [None]:
# Columns drawn in the pair grid of the IterativeImputer data;
# 'Target' drives the hue.
pair_cols = [
    'WBC', 'RBC', 'Neutrophil', 'Lymphocytes',
    'Urine WBC', 'Urine RBC', 'Leukocytes', 'Urine Ketone',
    'Urine Color', 'Urine Sugar',
    'Target',
]
g = sns.PairGrid(interative_df_imputed[pair_cols], hue='Target')
# Upper triangle: KDE contours; lower triangle: filled KDE;
# diagonal: histogram with a KDE overlay.
g.map_upper(sns.kdeplot)
g.map_lower(sns.kdeplot, fill=True)
g.map_diag(sns.histplot, kde=True)

## 2. Age
Appendicitis is among the most common **childhood** diseases, occurring most often between 10 and 19 years of age.

In [None]:
# Per-class density of "age"; common_norm=False normalises each Target
# class on its own.
plt.figure(figsize=(15, 9))
kde_style = dict(fill=True, common_norm=False, alpha=.5, linewidth=0)
ax = sns.kdeplot(data=df, x='age', hue="Target", **kde_style)

ax.set_title(r'$\bf{AgeGroup}$ for Appendicitis Population')
# NOTE(review): labels are assigned by artist order, not by Target value —
# confirm they line up with the plotted classes.
ax.legend(['Disease', 'No-disease'])
ax.set_xlabel('Age Range')

plt.show()


In [None]:
# Scatter of raw age against the encoded age group, coloured by Target.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x="AgeGroup", y="age", hue="Target", alpha=0.2)
# The title lists the bin labels in the order of their numeric codes.
plt.title('Range of Age\n' + '\n'.join(age_group.keys()))
# A single legend call: a second plt.legend() replaces the first one, which
# previously discarded the title and location settings.
# NOTE(review): labels are assigned by artist order, not by Target value —
# confirm they line up with the plotted classes.
plt.legend(['Disease', 'No Disease'], title="Target", loc="upper right")
plt.xlabel("Group")
plt.ylabel("Age")


## 3. WBC Differential
| Variable | Full form | TH | Normal range | Indicates the condition / บ่งบอกสภาวะ | Differential |
| --- | --- | --- | --- | --- | --- |
| WBC | White blood cell | ค่าปริมาณเซลล์เม็ดเลือดขาว  | 4,500 – 11,000 cells/mm³ | High WBC Count (Leukocytosis)/ Low WBC Count (Leukopenia) / การติดเชื้อแบคทีเรียม (Bacteria) / การต่อต้านเชื้อไวรัส (Virus) / ภาวะอักเสบเรื้อรัง (Chronic Inflammation) / ภาวะภูมิแพ้ (Allergy) | Neutrophil 50%-70% / Lymphocyte 20%-40% / Monocyte 0%-7% / Eosinophil 0%-5% / Basophil 0%-1% |

| WBC | Neutrophil | Lymphocytes | Leukocytes |
| --- | --- | --- | --- |

In [None]:
# One scatter panel per feature, each plotted against WBC on the x-axis.
features = ['WBC', 'Neutrophil', 'Lymphocytes', 'Leukocytes']
n_rows = 1
n_cols = 4
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 4*n_rows))

for panel, feature in zip(axes, features):
    sns.scatterplot(
        data=df, x='WBC', y=feature, hue='Target',
        palette='viridis', ax=panel,
    )
    panel.set_xlabel('WBC')
    panel.set_ylabel(feature)
    panel.legend(['Disease', 'No Disease'])
    panel.set_title(f'WBC Differential\nWBC / {feature}')

plt.tight_layout()
plt.show()


## 4. Alvarado Score (AS) and Pediatric appendicitis score (PAS) on Disease / No Disease

In [None]:
# Variables plotted against the AS / PAS scores in the cells below.
related_cols = [
    'Migration of pain',
    'Anorexia',
    'Urine Ketone',
    'Nausea/vomiting',
    'Tenderness in right lower quadrant',
    'Rebound tenderness',
    'WBC',
    'Neutrophil',
]

### 4.1 Relation between(AS, PAS) and (Related variables)


In [None]:
# Score drawn on the x-axis of each panel, in df_dict order
# (Original, KNN, IterativeImputer).
# NOTE(review): the middle panel plots PAS while the outer two plot AS —
# confirm this mix is intended and not a copy-paste slip.
score_by_panel = [
    'Alvarado Score (AS)',
    'Pediatric appendicitis score (PAS)',
    'Alvarado Score (AS)',
]

for col in related_cols:
    # One figure per related variable, one panel per dataset variant.
    fig, axes = plt.subplots(1, 3, figsize=(15, 8))
    for panel, (label, frame), score in zip(axes, df_dict.items(), score_by_panel):
        panel.set_title(r'$\bf{' + label + '}$')
        sns.kdeplot(ax=panel, data=frame, x=score,
                    y=col, fill=True, hue="Target")
        panel.legend(['Disease', 'No Disease'])
    plt.show()


### 4.2 AS vs PAS

In [None]:
# Bivariate density of PAS (x) against AS (y) on the original data,
# split by Target.
as_pas_ax = sns.kdeplot(
    data=df,
    x='Pediatric appendicitis score (PAS)',
    y='Alvarado Score (AS)',
    fill=True,
    hue='Target',
)
as_pas_ax.set(title=r'$\bf{Original}$')

# 5. Correlation

## 5.1 Pearson Correlation 
Full features 

In [None]:
# Pearson correlation of all columns, computed once and reused for both the
# mask shape and the heatmap.
full_corr = df.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones(full_corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(30, 18))
plt.title('Pearson Correlation Matrix', fontsize=20)
# Purples_r
sns.heatmap(full_corr, linewidths=0, vmax=0.7, square=True, cmap='BuGn',
            linecolor='w', annot=True, annot_kws={"size": 10}, mask=mask,
            cbar_kws={"shrink": .9})

## 5.2 Pearson Correlation
on Target 

In [None]:
# Single-column heatmap: correlation of every column with 'Target',
# ordered by absolute strength (strongest first).
fig, axes = plt.subplots(figsize=(15, 8))
fig.suptitle('Pearson Correlation Matrix', fontsize=20, fontweight='bold')

# --- Data ---
corr_matrix = df.corr()
target_corr = corr_matrix['Target']
# Same column without the Target row itself (not used further in this cell).
other_corr = corr_matrix.drop('Target')['Target']
strongest_first = target_corr.abs().sort_values(ascending=False).index
corr_df = target_corr.to_frame().reindex(strongest_first)

# --- Plot ---
axes.set_title(r'$\bf{Original}$' + '\nAppendicitis')
sns.heatmap(corr_df, cmap="YlGnBu", annot=True, ax=axes)
# Red vertical line at x=1, spanning all rows.
plt.vlines(x=1, ymin=0, ymax=len(corr_df), colors='r', linewidth=2)
plt.show()


## 6. Parallel Plots
**Obj**:  to visualize and compare relationship between features and classes, identify discriminative features.

#### Create a subset of related variables on **Target**

In [None]:
# Score-related variables, with the class label appended last.
as_cols = (
    ['Migration of pain', 'Anorexia', 'Urine Ketone', 'Nausea/vomiting']
    + ['Tenderness in right lower quadrant', 'Rebound tenderness']
    + ['WBC', 'Neutrophil']
    + ['Target']  # class label
)


In [None]:
# Clinical-examination variables, with the class label appended last.
# A comma was missing after 'Stool', so Python's implicit string
# concatenation merged it with 'Target' into the single, non-existent column
# name 'StoolTarget'. A trailing comma now follows every entry.
clinical_cols = [
    'Peritonitis/abdominal guarding',
    'Migration of pain',
    'Tenderness in right lower quadrant',
    'Rebound tenderness',
    'Cough tenderness',
    'Nausea/vomiting',
    'Anorexia',
    'Dysuria',
    'Stool',
    # Target
    'Target',
]

In [None]:
# Laboratory variables used as parallel-coordinates dimensions,
# with the class label appended last.
lab_cols = [
    'WBC',
    'RBC',
    'Neutrophil',
    'Lymphocytes',
    'Urine WBC',
    'Urine RBC',
    'Leukocytes',
    'Urine Ketone',
    'Urine Color',
    'Urine Sugar',
    'Urine Leukocytes',
    'Urine Specific gravity',
    'Target',
]


In [None]:
# # Select colors for parallel_coordinates
# fig = px.colors.sequential.swatches_continuous()
# fig.show()

### 0 = No disease, 1 = Disease

In [None]:
# Parallel-coordinates view of the lab variables; line colour follows
# 'Target' on a blue -> red scale.
parallel_kwargs = dict(
    color='Target',
    dimensions=lab_cols,
    color_continuous_scale=['blue', 'red'],
    title="Appendicitis Coordinates Plot",
)
fig = px.parallel_coordinates(df, **parallel_kwargs)
fig.show()