# This script performs the data processing for the Diabetes Data Set.

1. Libraries and functions used
2. Load data
3. Explore data completeness and accuracy
4. Clean data for exploration
5. Data exploration
6. Conclusions

## Libraries and functions used

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## Load Data

In [2]:
# Read the raw encounter records into a DataFrame.
raw_data_path = '../dataset_diabetes/diabetic_data.csv'
diabetes = pd.read_csv(raw_data_path)

## Cleaning Nulls

In [3]:
# The file encodes missing data with several placeholder strings;
# turn all of them into real NaN values in a single pass.
null_markers = ['?', 'Unknown/Invalid', 'None']
diabetes = diabetes.replace(null_markers, np.nan)

In [4]:
# Columns that are dropped outright because of their nulls.
delete_column_nulls = [
    'weight',
    'payer_code',
]

# Columns where a null is kept as its own category.
used_with_nulls = [
    'medical_specialty',
    'max_glu_serum',
    'A1Cresult',
]

# Columns whose nulls are filled with the most frequent value.
simple_input_nulls = [
    'race',
    'diag_3',
    'diag_2',
    'diag_1',
    'gender',
]

In [5]:
## These columns cannot be imputed sensibly: they have too many nulls or
## carry no intrinsic value, so they are removed.
diabetes = diabetes.drop(columns=delete_column_nulls)

## These columns have few nulls and are plain categoricals, so a simple
## imputation with the most frequent value (the mode) is enough.
for column_name in simple_input_nulls:
    most_frequent = diabetes[column_name].mode()[0]
    diabetes[column_name] = diabetes[column_name].fillna(most_frequent)

## For these columns the null itself may be informative for prediction,
## so it is kept as an explicit category.
diabetes[used_with_nulls] = diabetes[used_with_nulls].fillna('missing_values')

## Target column 

In [6]:
def target_process(data):
    """Encode a ``readmitted`` label as an ordinal integer.

    Returns 0 for ``'NO'``, 1 for ``'>30'`` and 2 for any other value.
    """
    if data == 'NO':
        return 0
    if data == '>30':
        return 1
    # Every remaining label falls into the last class.
    return 2
# Replace the readmission labels with their ordinal codes.
readmitted_codes = diabetes['readmitted'].map(target_process)
diabetes['readmitted'] = readmitted_codes

## Column transformations

### Age

In [7]:
## Age is ordinal, so a numeric encoding is preferable to the raw labels.
def age_range_to_number(age):
    """Map a ten-year age bracket label to the midpoint of that bracket.

    ``'[0-10)'`` becomes 5, ``'[10-20)'`` becomes 15, and so on up to
    ``'[90-100)'`` which becomes 95. A label outside those ten brackets
    raises ``KeyError``.
    """
    midpoint_by_bracket = {
        f'[{lower}-{lower + 10})': lower + 5
        for lower in range(0, 100, 10)
    }
    return midpoint_by_bracket[age]

# Replace the age bracket labels with their numeric midpoints.
age_midpoints = diabetes['age'].map(age_range_to_number)
diabetes['age'] = age_midpoints

## General clean outliers

Deletes approximately 5400 rows that have an outlier value in any numeric column.

In [8]:
# Z-score outlier removal.
#
# The scores are kept in a standalone mask instead of a temporary 'z_score'
# column: writing that column into an already-filtered frame triggers
# chained-assignment warnings, would overwrite a real column of the same
# name, and made the final drop fail when no numeric column was present.
#
# NOTE(review): every int/float column is screened, including identifier
# columns and the encoded target - confirm that this is intended.
numeric_columns = [
    column
    for column in diabetes.columns
    ## Select only float or integer columns
    if diabetes[column].dtype in [float, int, 'float64', 'int64']
]
for column in numeric_columns:
    ## Z-scores are recomputed on the rows that survived the previous columns
    z_scores = np.abs(stats.zscore(diabetes[column]))
    ## Outlier threshold; a NaN score compares False, so that row is dropped
    diabetes = diabetes[z_scores < 3.5]

In [9]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94668 entries, 1 to 101765
Data columns (total 48 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id              94668 non-null  int64 
 1   patient_nbr               94668 non-null  int64 
 2   race                      94668 non-null  object
 3   gender                    94668 non-null  object
 4   age                       94668 non-null  int64 
 5   admission_type_id         94668 non-null  int64 
 6   discharge_disposition_id  94668 non-null  int64 
 7   admission_source_id       94668 non-null  int64 
 8   time_in_hospital          94668 non-null  int64 
 9   medical_specialty         94668 non-null  object
 10  num_lab_procedures        94668 non-null  int64 
 11  num_procedures            94668 non-null  int64 
 12  num_medications           94668 non-null  int64 
 13  number_outpatient         94668 non-null  int64 
 14  number_emergency     

## High-cardinality categorical columns

diagnostics:

Every diagnosis column suffers from excessive dimensionality (too many distinct codes).

The diagnosis codes are grouped following https://www.hindawi.com/journals/bmri/2014/781670/tab2/

In [10]:
# ICD-9 diagnosis groups keyed by category name; each value lists the integer
# parts of the codes belonging to that group (grouping taken from the table
# cited above).
#
# The upper bounds are inclusive in the reference table, so every range()
# stops one past the last code of its group. Codes that appear in no list
# (the table's "Other" rows, e.g. 780-782, 240-279 without 250, 001-139) are
# deliberately absent: diag_category labels any unmatched code 'Others'.
# Keeping them under 'Neoplasms' mislabelled them and, because 250 lies in
# 240-279, also hid every Diabetes diagnosis.
diag_factor = {
    'Circulatory': list(range(390, 460)) + [785],
    'Respiratory': list(range(460, 520)) + [786],
    'Digestive': list(range(520, 580)) + [787],
    'Diabetes': [250],
    'Injury': list(range(800, 1000)),
    'Musculoskeletal': list(range(710, 740)),
    'Genitourinary': list(range(580, 630)) + [788],
    'Neoplasms': list(range(140, 240)),
}

In [11]:
def diag_category(data, factors=None):
    """Collapse a raw diagnosis code into a named diagnosis group.

    Parameters
    ----------
    data : str or number
        Raw diagnosis code, e.g. ``'250.83'`` or ``428``. Only the integer
        part of the code is used for the lookup.
    factors : dict, optional
        Mapping of group name to a container of integer codes. Defaults to
        the module-level ``diag_factor``.

    Returns
    -------
    str
        The name of the first group (in mapping order) that contains the
        code, or ``'Others'`` when the code is not numeric (missing values,
        ``V``/``E`` codes) or belongs to no group.
    """
    if factors is None:
        factors = diag_factor
    try:
        code = int(float(data))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric codes, None and NaN/inf cannot be looked up.
        return 'Others'
    for category, codes in factors.items():
        if code in codes:
            # Stop at the first match so that an earlier, more specific group
            # is not overwritten by a later overlapping one.
            return category
    return 'Others'

In [12]:
## Collapse each of the three diagnosis columns into its diagnosis group
for diag_column in ('diag_1', 'diag_2', 'diag_3'):
    diabetes[diag_column] = diabetes[diag_column].map(diag_category)


medical_specialty

Keep the 15 most common specialties and group the remaining ones as "Others".

In [13]:
# Count how often each original category occurs (most frequent first)
conteo_categorias = diabetes['medical_specialty'].value_counts()

# Number of most frequent categories that are kept as they are;
# everything ranked below them is replaced by "Others"
num_categorias_otros = 15

# Labels of the less common categories
categorias_otros = conteo_categorias.index[num_categorias_otros:]

# Replace the less common categories by "Others", show the resulting
# distribution and store the grouped column
grouped_specialty = diabetes['medical_specialty'].replace(categorias_otros, 'Others')
print(grouped_specialty.value_counts())
diabetes['medical_specialty'] = grouped_specialty

missing_values                46517
InternalMedicine              13709
Emergency/Trauma               7175
Family/GeneralPractice         6948
Cardiology                     4944
Others                         4100
Surgery-General                2832
Nephrology                     1433
Orthopedics                    1385
Orthopedics-Reconstructive     1122
Radiologist                    1086
Pulmonology                     825
Psychiatry                      791
Urology                         649
ObstetricsandGynecology         616
Gastroenterology                536
Name: medical_specialty, dtype: int64


## Export processing data

In [14]:
## Write the processed frame (index included) to the training CSV file
training_data_path = '../dataset_diabetes/training_diabetic_data.csv'
diabetes.to_csv(training_data_path)