# Projet ML: 
#####                            Ovia Chanemouganandam et Sandrine Daniel - DIA 4

In [328]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import os

#### Step 1:
1. Import dataset

In [329]:
# Load the raw diabetes encounter data; the path is relative to the notebook's working directory.
# NOTE(review): the recorded missing-value counts show NaN in `max_glu_serum` and `A1Cresult`
# immediately after loading -- presumably the "not measured" marker in those columns is being
# parsed as NaN by read_csv's default NA handling. Confirm that this is intended.
df = pd.read_csv(filepath_or_buffer='data/diabetic_data.csv')
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


2. Number of columns and rows

In [330]:
# DataFrame.shape is a (rows, columns) tuple; unpack it once and report both values.
row_count, column_count = df.shape
print("Number of columns:", column_count)
print("Number of rows:", row_count)

Number of columns: 50
Number of rows: 101766


3.  Variable Definition


The dataset contains information about patients and their medical history. The target variable is **`readmitted`**, which indicates whether a patient was readmitted to the hospital. The dataset has the following columns:

| Feature | Type | Description |
|---------|------|-------------|
| `encounter_id` | Numeric / ID | Unique identifier for an encounter |
| `patient_nbr` | Numeric / ID | Unique identifier for each patient |
| `race` | Categorical | 	Values: Caucasian, Asian, African American, Hispanic, and other |
| `admission_type_id` | Categorical | Integer identifier for the type of hospital admission (e.g., Emergency, Elective) |
| `admission_source_id` | Categorical | Integer identifier corresponding to 21 distinct values, for example, physician referral, emergency room, and transfer from a hospital|
| `discharge_disposition_id` | Categorical | Integer identifier for how the patient was discharged (e.g., home, transferred) |
| `weight` | Categorical | Weight in pounds |
| `age` | Categorical | Age of the patient, grouped in 10-year intervals (e.g., `[0-10)`, `[10-20)`) |
| `gender` | Categorical | Gender of the patient (Male/Female) |
| `num_lab_procedures` | Numeric | Number of lab tests performed during the stay |
| `num_medications` | Numeric | Number of medications prescribed during the stay |
| `time_in_hospital` | Numeric | Number of days spent in the hospital |
| `payer_code` |  Categorical | Integer identifier corresponding to 23 distinct values, for example, Blue Cross/Blue Shield, Medicare, and self-pay|
| `medical_specialty` | Categorical | Integer identifier of a specialty of the admitting physician, corresponding to 84 distinct values |
| `diag_1` | Categorical | Primary diagnosis code |
| `diag_2` | Categorical | Secondary diagnosis code |
| `diag_3` | Categorical | Tertiary diagnosis code |
| `num_procedures` | Numeric | Number of procedures performed during the stay |
| `number_outpatient` | Numeric | Number of outpatient visits in the past year |
| `number_emergency` | Numeric | Number of emergency visits in the past year |
| `number_diagnoses` | Numeric | Number of diagnoses entered to the system|
| `max_glu_serum`| Categorical | Indicates the range of the result or if the test was not taken. Values: >200, >300, normal, and none if not measured |
| `A1Cresult` | Categorical | Indicates the range of the result or if the test was not taken. Values: >8 if the result was greater than 8%, >7 if the result was greater than 7% but less than 8%, normal if the result was less than 7%, and none if not measured|
| The 23 medication columns (`metformin` through `metformin-pioglitazone`) | Categorical | Each feature indicates whether the drug was prescribed or there was a change in the dosage (values seen in the data include `No`, `Steady`, `Up`) |
| `number_inpatient` | Numeric | Number of prior inpatient visits in the past year |
| `change`| Categorical | Indicates if there was a change in diabetic medications (either dosage or generic name). Values: change and no change |
| `diabetesMed` | Categorical | Indicates if there was any diabetic medication prescribed. Values: yes and no |
| `readmitted` | Categorical | Target variable (raw label in the data shown in parentheses):<br>`0` = Not readmitted (`NO`)<br>`1` = Readmitted in less than 30 days (`<30`)<br>`2` = Readmitted after 30 days or more (`>30`) |





In [331]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

4. Handling missing values

In [332]:
# Count the NaN entries column by column, before any cleaning is applied.
missing_per_column = df.isna().sum()
print("Number of missing values per column:")
print(missing_per_column)

Number of missing values per column:
encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride

While inspecting the dataset, we noticed that some columns use '?' instead of NaN to mark missing values. We now replace these placeholders with proper missing values.

In [333]:
# '?' is used as a placeholder for unknown values in this dataset; convert it to NaN
# so that isna() counts it. Rebinding `df` instead of using inplace=True keeps the
# step chainable and avoids mutating the frame object in place.
print("Number of missing values after replacing '?' with nAn:")
df = df.replace('?', np.nan)
print(df.isna().sum())

Number of missing values after replacing '?' with nAn:
encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide             

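The columns 'weight' (~97% missing), 'payer_code' (~40% missing) and 'medical_specialty' (~49% missing) have a large share of missing values, so we drop these columns. We also drop the rows where 'race' is missing (2,273 rows, about 2% of the data).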

In [334]:
# Drop the sparsely populated columns, then discard the rows whose race is unknown.
# errors='ignore' makes the cell safe to re-run: without it, a second execution raises
# KeyError because the columns have already been removed. Chaining without inplace=True
# builds a new frame instead of mutating the existing one.
df = (
    df
    .drop(columns=['weight', 'payer_code', 'medical_specialty'], errors='ignore')
    .dropna(subset=['race'])
)
print(df.isna().sum())

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         19
diag_2                        336
diag_3                       1349
number_diagnoses                0
max_glu_serum               94203
A1Cresult                   82897
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide                       0
tolbutamide   

In [335]:
# Report the frame's dimensions after the cleaning steps; shape is (rows, columns).
rows_after_cleaning, columns_after_cleaning = df.shape
print("Number of columns after cleaning:", columns_after_cleaning)
print("Number of rows after cleaning:", rows_after_cleaning)

Number of columns after cleaning: 47
Number of rows after cleaning: 99493


5. Handling duplicates

In [336]:
# duplicated() flags rows that repeat an earlier row across every column;
# summing the boolean mask gives the number of such rows.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)
print(df.shape)

0
(99493, 47)


There are no duplicate rows!