# Tema ID3-Bodescu Stefan-Rares
Link Github(pentru a descarca setul de date): https://github.com/Stefan1811/ML_ID3_DATASET

# Preprocessing
1. My dataset contains attributes with data from certain people, data that can determine if that person is prone to a heart attack.
   
   Dataset attributes:
   * age
   * sex
   * chest pain type(cp)
   * resting blood pressure(trestbps)
   * serum cholestoral in mg/dl(chol)
   * fasting blood sugar > 120 mg/dl(fbs)
   * resting electrocardiographic results(restecg)
   * maximum heart rate achieved(thalach)
   * exercise induced angina(exang)
   * oldpeak = ST depression induced by exercise relative to rest(oldpeak)
   * the slope of the peak exercise ST segment(slope)
   * number of major vessels (0-3) colored by flourosopy(ca)
   * thal: 0 = normal; 1 = fixed defect; 2 = reversable defect(thal)

   The target attribute of the data set is "target".
   The purpose of the data set is to determine if, depending on the above attributes, a person is prone to a heart attack.
   
   The discrete attributes of the dataset:
   * sex(0,1)
   * cp((0,1,2,3)
   * fbs(0,1)
   * restecg(0,1,2)
   * exang(0,1)
   * slope(0,1,2)
   * ca(0,1,2,3,4)
   * thal(0,1,2,3)
   * target(0,1)

   The continuous attributes of the data set:
   * age(29-77)
   * trestbps(94-220)
   * chol(126-564)
   * thalach(71-202)
   * oldpeak(0,6.2)


2. My dataset doesn't contain NaN values.

3. 

   

In [1]:
import pandas as pd
import numpy as np
import math as mt
import json
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()

mean_values = train_data.mean()
variance_values = train_data.var()

print("Mean values:\n", mean_values)
print("\nVariance values:\n", variance_values)

Mean values:
 age          54.366337
sex           0.683168
cp            0.966997
trestbps    131.623762
chol        246.264026
fbs           0.148515
restecg       0.528053
thalach     149.646865
exang         0.326733
oldpeak       1.039604
slope         1.399340
ca            0.729373
thal          2.313531
target        0.544554
dtype: float64

Variance values:
 age           82.484558
sex            0.217166
cp             1.065132
trestbps     307.586453
chol        2686.426748
fbs            0.126877
restecg        0.276528
thalach      524.646406
exang          0.220707
oldpeak        1.348095
slope          0.379735
ca             1.045724
thal           0.374883
target         0.248836
dtype: float64



# Probabilities, Information Theory

1. 

In [2]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()
def compute_probabilities(train_data, attribute):
    prob = train_data[attribute].value_counts(normalize=True)
    return prob

print(compute_probabilities(train_data,"target"))

target
1    0.544554
0    0.455446
Name: proportion, dtype: float64



2.

In [3]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()
def calculate_entropy(train_data,attribute):
    entropy_result=0
    attribute_pmf=compute_probabilities(train_data,attribute)
    probs=attribute_pmf.values
    probs_list=probs.tolist()
    for prob in probs_list:
        entropy_el = - prob*np.log2(prob)
        entropy_result+=entropy_el
    return entropy_result

print(calculate_entropy(train_data,"target"))

0.994264609261905



3. 

In [4]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()
#H(TARGET_ATTRIBUTE|ATTRIBUTE)
def calculate_conditional_entropy(train_data,target_attribute,attribute):
    attribute_pmf = compute_probabilities(train_data,attribute)
    conditional_entropy = 0
    for value in attribute_pmf.index:
        subset = train_data[train_data[attribute] == value]
        subset_probability = attribute_pmf[value]
        subset_entropy = calculate_entropy(subset,target_attribute)
        conditional_entropy += subset_probability * subset_entropy

    return conditional_entropy

print(calculate_conditional_entropy(train_data,"target","cp"))

0.7896657536419621



4.

In [5]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()
def calculate_information_gain(train_data,target_attribute,attribute):
    return calculate_entropy(train_data,target_attribute)-calculate_conditional_entropy(train_data,target_attribute,attribute)
print(calculate_information_gain(train_data,"target","cp"))

0.20459885561994284



# ID3

1.

In [6]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()
target_attribute = 'target'
discrete_attributes = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
def find_root_node(train_data,discrete_list,target):
    max_information_gain=0
    maxim_attribute='target'
    for attribute in discrete_list:
        information_gain=calculate_information_gain(train_data,target,attribute)
        if max_information_gain<=information_gain:
            max_information_gain=information_gain
            maxim_attribute=attribute
    return (maxim_attribute,max_information_gain)

print(find_root_node(train_data,discrete_attributes,'target'))

('thal', 0.21325070203885554)



2,3. 

In [7]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()
target_attribute = 'target'
discrete_attributes = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
def id3_discrete(train_data, attributes, target_attribute, parent_node_class=None):
    if len(train_data[target_attribute].unique()) == 1:
        return train_data[target_attribute].iloc[0]
    if len(attributes) == 0:
        return parent_node_class
    best_feature, info_gain = find_root_node(train_data, attributes, target_attribute)
    tree = {"node_attribute": best_feature, "n_observations": dict(train_data[target_attribute].value_counts()), "information_gain": info_gain}
    for value in train_data[best_feature].unique():
        subset = train_data[train_data[best_feature] == value]
        remaining_features = attributes.copy()
        remaining_features.remove(best_feature)
        subtree = id3_discrete(subset, remaining_features, target_attribute, train_data[target_attribute].mode().iloc[0])
        tree["values"] = tree.get("values", {})
        tree["values"][value] = subtree
    return tree
    
def convert_np_int64(obj):
    if isinstance(obj, np.int64):
        return int(obj)
    elif isinstance(obj, dict):
        return {convert_np_int64(key): convert_np_int64(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_np_int64(element) for element in obj]
    return obj

def write_tree_to_file(tree, file_path):
    tree_converted = convert_np_int64(tree)
    with open(file_path, 'w') as file:
        json.dump(tree_converted, file, indent=4)
decision_tree = id3_discrete(train_data, discrete_attributes, target_attribute)
write_tree_to_file(decision_tree,"id3_discrete.json")

with open('id3_discrete.json', 'r') as fisier:
    data = json.load(fisier)
print(json.dumps(data, indent=2))

{
  "node_attribute": "thal",
  "n_observations": {
    "1": 165,
    "0": 138
  },
  "information_gain": 0.21325070203885554,
  "values": {
    "2": {
      "node_attribute": "ca",
      "n_observations": {
        "1": 130,
        "0": 36
      },
      "information_gain": 0.142251434061765,
      "values": {
        "2": {
          "node_attribute": "slope",
          "n_observations": {
            "0": 7,
            "1": 7
          },
          "information_gain": 0.27858103784929467,
          "values": {
            "1": {
              "node_attribute": "exang",
              "n_observations": {
                "0": 4,
                "1": 1
              },
              "information_gain": 0.3219280948873623,
              "values": {
                "1": 0,
                "0": {
                  "node_attribute": "cp",
                  "n_observations": {
                    "1": 1,
                    "0": 1
                  },
                  "information_gain": 


4.

In [8]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()
def get_splits(continuous_attribute,labels):
    train_data = pd.DataFrame({"Attribute": continuous_attribute, "Label": labels})
    train_data = train_data.sort_values(by="Attribute")
    splits=[]
    for value in range(1,len(train_data)):
        if train_data["Label"].iloc[value] != train_data["Label"].iloc[value - 1]:
            split_point = (train_data["Attribute"].iloc[value] + train_data["Attribute"].iloc[value - 1]) / 2.0
            splits.append(split_point)
    return splits

print(get_splits(train_data["age"],train_data["target"]))

[35.0, 36.0, 37.5, 38.0, 38.5, 39.0, 39.5, 40.0, 41.0, 41.0, 42.0, 42.0, 43.0, 43.0, 43.0, 43.0, 43.0, 43.0, 43.5, 44.0, 44.0, 44.0, 44.0, 44.0, 44.5, 45.0, 45.0, 46.0, 46.0, 46.0, 46.0, 47.0, 47.0, 47.0, 47.5, 48.0, 48.0, 48.0, 48.5, 49.0, 49.0, 50.0, 50.0, 50.0, 51.0, 51.0, 51.0, 51.0, 51.0, 51.0, 52.0, 52.0, 52.0, 52.0, 52.0, 52.5, 53.0, 53.5, 54.0, 54.0, 54.0, 54.0, 54.0, 55.0, 55.0, 55.0, 55.0, 55.0, 56.0, 56.0, 56.0, 56.0, 56.0, 57.0, 57.0, 57.0, 57.0, 57.0, 57.0, 57.0, 57.0, 57.0, 57.0, 57.0, 57.5, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 59.0, 59.0, 59.0, 59.0, 60.0, 60.0, 60.0, 60.0, 61.0, 61.0, 62.0, 62.0, 62.0, 62.0, 62.0, 62.0, 62.0, 62.5, 63.0, 63.0, 63.0, 63.0, 63.0, 64.0, 64.0, 64.0, 64.0, 64.5, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 66.0, 66.0, 66.0, 66.0, 66.5, 67.0, 67.0, 67.0, 67.0, 67.0, 68.0, 68.0, 69.0, 69.5, 70.0, 76.5]



5.

In [9]:
import pandas as pd
import numpy as np
import math as mt
import json

train_data = pd.read_csv("Heart_attack.csv")
train_data = train_data.dropna()

target_attribute = 'target'
discrete_attributes = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
continuous_attributes=['age','trestbps','chol','thalach','oldpeak']

def calculate_information_gain_discrete_continous(train_data, target_attribute, attribute, subset1, subset2):
    total_entropy = calculate_entropy(train_data, target_attribute)

    prob1 = len(subset1) / len(train_data)
    prob2 = len(subset2) / len(train_data)

    entropy_subset1 = calculate_entropy(subset1, target_attribute)
    entropy_subset2 = calculate_entropy(subset2, target_attribute)
    return total_entropy - (prob1 * entropy_subset1 + prob2 * entropy_subset2)

def get_splits(continuous_attribute,labels):
    train_data = pd.DataFrame({"Attribute": continuous_attribute, "Label": labels})
    train_data = train_data.sort_values(by="Attribute")
    splits=[]
    for value in range(1,len(train_data)):
        if train_data["Label"].iloc[value] != train_data["Label"].iloc[value - 1]:
            split_point = (train_data["Attribute"].iloc[value] + train_data["Attribute"].iloc[value - 1]) / 2.0
            splits.append(split_point)
    return splits

def find_best_split_point(train_data, target_attribute, attribute, splits):

    max_info_gain = -1
    best_split_point = None

    for split_point in splits:
        subset1 = train_data[train_data[attribute] <= split_point]
        subset2 = train_data[train_data[attribute] > split_point]

        info_gain = calculate_information_gain_discrete_continous(train_data, target_attribute, attribute, subset1, subset2)

        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_split_point = split_point

    return best_split_point, max_info_gain

def find_root_node_dc(train_data, discrete_attributes, continuous_attributes, target_attribute):
    best_feature = None
    split_point = None
    max_info_gain = -1

    for attribute in discrete_attributes:
        info_gain = calculate_information_gain(train_data, target_attribute, attribute)

        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = attribute
            split_point = None

    for attribute in continuous_attributes:
        splits = get_splits(train_data[attribute], train_data[target_attribute])
        current_split_point, info_gain = find_best_split_point(train_data, target_attribute, attribute, splits)

        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = attribute
            split_point = current_split_point

    return best_feature, split_point, max_info_gain
    
def id3(train_data,discrete_attributes,continuous_attributes,target_attribute,parent_node_class=None):
    if len(train_data[target_attribute].unique()) == 1:
        return train_data[target_attribute].iloc[0]
    if len(discrete_attributes) == 0 and len(continuous_attributes) == 0:
        return parent_node_class
    best_feature, split_point, info_gain = find_root_node_dc(train_data, discrete_attributes,continuous_attributes,target_attribute)
    tree = {"node_attribute": best_feature, "n_observations": dict(train_data[target_attribute].value_counts()), "information_gain": info_gain}
    if split_point is not None:
        subset1 = train_data[train_data[best_feature] <= split_point]
        subset2 = train_data[train_data[best_feature] > split_point]

        remaining_discrete_attributes = discrete_attributes.copy()
        remaining_continuous_attributes = continuous_attributes.copy()

        if best_feature in remaining_discrete_attributes:
            remaining_discrete_attributes.remove(best_feature)
        elif best_feature in remaining_continuous_attributes:
            remaining_continuous_attributes.remove(best_feature)

        tree["values"] = {
            f"less_than_or_equal_to {split_point}": id3(subset1, remaining_discrete_attributes, remaining_continuous_attributes, target_attribute, train_data[target_attribute].mode().iloc[0]),
            f"greater_than {split_point}": id3(subset2, remaining_discrete_attributes, remaining_continuous_attributes, target_attribute, train_data[target_attribute].mode().iloc[0])
        }
    else:
        for value in train_data[best_feature].unique():
            subset = train_data[train_data[best_feature] == value]
            remaining_discrete_attributes = discrete_attributes.copy()
            remaining_continuous_attributes = continuous_attributes.copy()

            if best_feature in remaining_discrete_attributes:
                remaining_discrete_attributes.remove(best_feature)
            elif best_feature in remaining_continuous_attributes:
                remaining_continuous_attributes.remove(best_feature)

            subtree = id3(subset, remaining_discrete_attributes, remaining_continuous_attributes, target_attribute, train_data[target_attribute].mode().iloc[0])
            tree["values"] = tree.get("values", {})
            tree["values"][value] = subtree

    return tree
    

decision_tree = id3(train_data, discrete_attributes, continuous_attributes, target_attribute)
def convert_np_int64(obj):
    if isinstance(obj, np.int64):
        return int(obj)
    elif isinstance(obj, dict):
        return {convert_np_int64(key): convert_np_int64(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_np_int64(element) for element in obj]
    return obj

def write_tree_to_file(tree, file_path):
    tree_converted = convert_np_int64(tree)
    with open(file_path, 'w') as file:
        json.dump(tree_converted, file, indent=4)
        

write_tree_to_file(decision_tree,"id3.json")

with open('id3.json', 'r') as fisier:
    data = json.load(fisier)
print(json.dumps(data, indent=2))


{
  "node_attribute": "thal",
  "n_observations": {
    "1": 165,
    "0": 138
  },
  "information_gain": 0.21325070203885554,
  "values": {
    "2": {
      "node_attribute": "ca",
      "n_observations": {
        "1": 130,
        "0": 36
      },
      "information_gain": 0.142251434061765,
      "values": {
        "2": {
          "node_attribute": "slope",
          "n_observations": {
            "0": 7,
            "1": 7
          },
          "information_gain": 0.27858103784929467,
          "values": {
            "1": {
              "node_attribute": "chol",
              "n_observations": {
                "0": 4,
                "1": 1
              },
              "information_gain": 0.7219280948873623,
              "values": {
                "less_than_or_equal_to 301.0": 0,
                "greater_than 301.0": 1
              }
            },
            "2": {
              "node_attribute": "age",
              "n_observations": {
                "1": 6,
     