<h1>1-Packages</h1>

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from IPython.display import display
from collections import Counter,deque



<h1>2-Load dataset and gain info</h1>

In [2]:
# Load the raw encounter records from the CSV file next to the notebook.
df = pd.read_csv('diabetic_data.csv')

# Name of the label column; later cells refer to it through this variable.
target_column = 'readmitted'

# First look at the data: a few sample rows, the overall shape, and the
# per-column dtypes / non-null counts.
display(df.head(5))
print(f"Number of rows: {len(df)},Number of columns: {len(df.columns)}")
print("--------------------")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


Number of rows: 101766,Number of columns: 50
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64

None

<h1>3-Handle missing values</h1>

In [3]:
# A column is dropped entirely when more than this share of its cells is missing.
MISSING_COLUMN_THRESHOLD = 0.39

# Columns that hold one single distinct value carry no information: report and drop them.
nunique_columns = df.loc[:, df.nunique() == 1].columns
print(f"All values of these columns are the same {np.array(nunique_columns)}")
df_dropped = df.drop(columns=nunique_columns)

# Identifier columns are not features, so they are removed as well.
df_dropped = df_dropped.drop(columns=['encounter_id', 'patient_nbr'])
print(f"After dropping (id columns)\nNumber of rows: {len(df_dropped)},Number of columns: {len(df_dropped.columns)}")

# '?' and 'None' are the placeholders used for undefined values; turn them into NaN.
df_replaced = df_dropped.replace(['?', 'None'], np.nan)

# Drop the columns whose number of NaN cells exceeds the threshold share of the rows.
too_sparse = df_replaced.isnull().sum() > MISSING_COLUMN_THRESHOLD * len(df_replaced)
missing_columns = df_replaced.columns[too_sparse].values
df_dropped = df_replaced.drop(columns=missing_columns)
print(f"\nAfter dropping (miss value column)\nNumber of rows: {len(df_dropped)},Number of columns: {len(df_dropped.columns)}")

# Number of missing cells left in each remaining column, and their total.
nan_counts = df_dropped.isnull().sum()
print(f"\nTotal number of missing value in dataset {nan_counts.sum()}")

# The remaining missing cells are rare compared to the size of the dataset,
# so the rows containing them are removed instead of being imputed.
df_dropped = df_dropped.dropna()
print(f"\nAfter dropping (nan rows)\nNumber of rows: {len(df_dropped)},Number of columns: {len(df_dropped.columns)}")

# Label encoding: every column is mapped to integer codes, one fit per column.
# NOTE(review): this also re-codes columns that are already numeric, and the
# single encoder instance is refitted for each column, so the per-column
# mappings are not kept - confirm that this is intended.
pl = preprocessing.LabelEncoder()
df_encoder = df_dropped.apply(pl.fit_transform)
display(df_encoder.head())

# TODO: correlation-based pruning of useless columns is not applied yet.

# Later cells read the encoded data through `df`.
# NOTE: this rebinds the raw frame loaded above, so this cell cannot be re-run
# without first re-running the loading cell.
df = df_encoder.copy()

All values of these columns are the same ['examide' 'citoglipton']
After dropping (id columns)
Number of rows: 101766,Number of columns: 46

After dropping (miss value column)
Number of rows: 101766,Number of columns: 41

Total number of missing value in dataset 4075

After dropping (nan rows)
Number of rows: 98053,Number of columns: 41


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,2,0,1,0,0,6,2,58,0,17,...,0,3,1,0,0,0,0,0,1,1
2,0,0,2,0,0,6,1,10,5,12,...,0,1,1,0,0,0,0,1,1,2
3,2,1,3,0,0,6,1,43,1,15,...,0,3,1,0,0,0,0,0,1,2
4,2,1,4,0,0,6,0,50,0,7,...,0,2,1,0,0,0,0,0,1,2
5,2,1,5,1,0,1,2,30,6,15,...,0,2,1,0,0,0,0,1,1,1


<h1>4- Split dataset</h1>

In [4]:
# Hold-out configuration for the train/test split.
TEST_SIZE = 0.1
SPLIT_SEED = 50

# Split the rows once. The feature/target views below are derived from this
# single split, so df_train and (X_train, y_train) hold the same rows by
# construction instead of relying on two separate calls sharing a seed.
df_train, df_test = train_test_split(df, test_size=TEST_SIZE, random_state=SPLIT_SEED)
assert len(df_train) + len(df_test) == len(df)

# Features are every column except the target; the target is selected by name.
X = df.drop(columns=target_column)
y = df[target_column]

X_train, y_train = df_train.drop(columns=target_column), df_train[target_column]
X_test, y_test = df_test.drop(columns=target_column), df_test[target_column]


<h1>5- Decision tree models</h1>

In [7]:
class Node:
    """One node of the decision tree.

    Attributes:
        children: mapping from a value of this node's feature to the child
            node reached through that value (empty for a leaf).
        is_leaf: True when the node is a leaf.
        node_value: what the node stands for - presumably the feature name
            for an internal node and the predicted class for a leaf.
        node_amounts: the values of the node's feature that have a branch.
    """

    def __init__(self, children=None, is_leaf=False, node_value=None, node_amounts=None):
        # Store the arguments on the instance. The containers are created per
        # instance so that two nodes never share the same dict / list.
        self.children = {} if children is None else children
        self.is_leaf = is_leaf
        self.node_value = node_value
        self.node_amounts = [] if node_amounts is None else node_amounts
        
def calculate_info(target_values=None):
    """Return the entropy Info(S) = -sum(p_c * log2(p_c)) of a label column.

    Args:
        target_values: pandas Series holding the class labels. When omitted,
            the module-level training labels ``y_train`` are used.

    Returns:
        The entropy in bits (0 for an empty or single-class column).
    """
    if target_values is None:
        target_values = y_train
    if len(target_values) == 0:
        return 0.0

    # relative frequency of each class that actually occurs
    class_probs = target_values.value_counts().values / len(target_values)
    # a zero probability would give 0 * log2(0) = nan, so it is left out
    class_probs = class_probs[class_probs > 0]
    return -np.sum(class_probs * np.log2(class_probs))


def calculate_info_feature(feature, target, dataframe=None):
    """Return Info_A(S): the entropy of ``target`` left after splitting on ``feature``.

    Args:
        feature: name of the column to split on.
        target: name of the label column.
        dataframe: rows to evaluate. When omitted, the module-level training
            frame ``df_train`` is used.

    Returns:
        The sum over the feature values v of |S_v| / |S| * Info(S_v).
    """
    if dataframe is None:
        dataframe = df_train

    # 1- entropy of the target inside each branch of the feature
    # number of rows for every (feature value, target value) pair
    joint_counts = dataframe[[feature, target]].value_counts()
    # number of rows for every feature value, in sorted feature order
    feature_counts = joint_counts.groupby(level=feature).sum()
    # P(target value | feature value): each pair count divided by the size of its branch
    cond_probs = joint_counts / joint_counts.groupby(level=feature).transform("sum")
    branch_entropy = (-(cond_probs * np.log2(cond_probs))).groupby(level=feature).sum()

    # 2- weight each branch entropy by the share of rows that fall into the branch
    return (feature_counts / len(dataframe) * branch_entropy).sum()


def calculate_gain(feature, target: str = 'readmitted', dataframe=None):
    """Return the information gain Info(S) - Info_A(S) of splitting on ``feature``.

    Args:
        feature: name of the candidate column.
        target: name of the label column.
        dataframe: rows to evaluate. When omitted, the module-level training
            frame ``df_train`` is used.
    """
    if dataframe is None:
        dataframe = df_train
    return calculate_info(dataframe[target]) - calculate_info_feature(feature, target, dataframe)


def next_node(dataframe: pd.DataFrame, target: str = 'readmitted') -> 'str | None':
    """Return the name of the best feature to split ``dataframe`` on.

    The gain of every column except ``target`` is computed on the rows of
    ``dataframe`` itself, and the column with the largest gain wins.

    Returns:
        The column name, or None when no column has a gain greater than 0.
    """
    max_gain = 0
    best_column = None
    for column in dataframe.columns.values:
        # the label column is what is being predicted, never a split candidate
        if column == target:
            continue
        gain = calculate_gain(column, target, dataframe)
        if gain > max_gain:
            max_gain = gain
            best_column = column
    return best_column

# Checks whether a node is a leaf or not
def is_leaf(dataframe : pd.DataFrame, target :str = 'readmitted') -> bool:
    """Tell whether the rows in ``dataframe`` end a path of the tree.

    A path ends when every remaining row has the same target value, or when
    the target is the only column left (no feature remains to split on).
    """
    # all remaining target values are identical
    single_class = dataframe[target].nunique() == 1
    if single_class:
        return True
    # only one column left; that one column is the target itself
    return len(dataframe.columns) == 1

# return appropriate value for a leaf
def leaf_value(target_column : pd.Series)->int:
    """Return the value a leaf predicts: the most frequent entry of ``target_column``."""
    occurrences = target_column.value_counts()
    most_common = occurrences.idxmax()
    return most_common
    

def generate_tree():
    """Build the decision tree from the training split ``df_train``.

    Returns:
        The root ``Node`` of the tree. When no feature is worth splitting on,
        the root is a single leaf holding the most frequent target value.
    """
    training_frame = df_train.copy()

    root_feature = next_node(training_frame)
    if root_feature is None:
        return Node(children={}, is_leaf=True,
                    node_value=leaf_value(training_frame[target_column]),
                    node_amounts=[])

    root = Node(children={}, is_leaf=False, node_value=root_feature,
                node_amounts=list(training_frame[root_feature].unique()))
    generate_tree1(root, training_frame)
    return root


def generate_tree1(node, dataframe):
    """Grow the subtree below ``node`` breadth-first.

    Args:
        node: internal node whose ``node_value`` is a column of ``dataframe``
            and whose ``node_amounts`` are the values of that column to branch on.
        dataframe: the rows that reach ``node``.

    Returns:
        ``node``, with ``children`` filled in for it and all its descendants.
    """
    # Each queue entry pairs a node with the rows that reach it, because every
    # node has to be expanded on its own subset of the data.
    not_visited_nodes = deque([(node, dataframe)])

    while not_visited_nodes:
        current, frame = not_visited_nodes.popleft()

        for node_amount in current.node_amounts:
            # rows that follow this branch; the feature that was just used is
            # removed so it cannot be selected again further down the path
            branch_frame = frame.loc[frame[current.node_value] == node_amount]
            branch_frame = branch_frame.drop(columns=current.node_value)

            # Checks whether this path ends in a leaf or not. A branch with no
            # informative feature left is also closed with a leaf.
            split_feature = None
            if not is_leaf(branch_frame, target_column):
                split_feature = next_node(branch_frame)

            if split_feature is None:
                current.children[node_amount] = Node(
                    children={}, is_leaf=True,
                    node_value=leaf_value(branch_frame[target_column]),
                    node_amounts=[])
                continue

            new_node = Node(children={}, is_leaf=False, node_value=split_feature,
                            node_amounts=list(branch_frame[split_feature].unique()))
            current.children[node_amount] = new_node
            not_visited_nodes.append((new_node, branch_frame))

    return node
    
    
# Debug check: filtering the training frame with a boolean mask (encoded age
# equal to 2) yields a DataFrame.
print(type(df_train.loc[df_train['age'].eq(2)]))



<class 'pandas.core.frame.DataFrame'>
1.3697322140094272


In [None]:
# Entropy of the training labels, kept in `info_s` for inspection.
info_s = calculate_info()
# Scratch notes for the conditional-probability computation, left disabled:
# z = X['race'].value_counts()
# print(z)
# temp = ['race','gender']
# cn = X[temp].value_counts()
# # print(cn.groupby('race').sum())
# print(cn/cn.index.get_level_values('race').map(z))
