## 1- Training Step 

### Task 1 : `Read csv file`

In [81]:
import pandas as pd 
import numpy as np
import json

In [42]:
# Load the soybean training CSV; later cells read the resulting `data` frame.
file = "data/soybean-app.csv"
data = pd.read_csv(filepath_or_buffer=file)

In [43]:
# Show every column name of the loaded frame, one per line.
for column_name in data.columns:
    print(column_name)

date
plant-stand
precip
temp
hail
crop-hist
area-damaged
severity
seed-tmt
germination
plant-growth
leaves
leafspots-halo
leafspots-marg
leafspot-size
leaf-shread
leaf-malf
leaf-mild
stem
lodging
stem-cankers
canker-lesion
fruiting-bodies
external-decay
mycelium
int-discolor
sclerotia
fruit-pods
fruit-spots
seed
mold-growth
seed-discolor
seed-size
shriveling
roots
class 		


In [44]:
# Print the loaded DataFrame (the captured output below is pandas' truncated text view).
print(data)

          date plant-stand   precip  temp hail         crop-hist area-damaged  \
0       august      normal  gt-norm  norm  yes  same-lst-two-yrs    scattered   
1      october      normal  gt-norm  norm  yes  same-lst-two-yrs    scattered   
2       august      normal  gt-norm  norm   no       same-lst-yr    scattered   
3      october      normal  gt-norm  norm  yes       same-lst-yr    scattered   
4    september      normal  gt-norm  norm  yes       same-lst-yr    scattered   
..         ...         ...      ...   ...  ...               ...          ...   
450       july           ?        ?     ?    ?                 ?  whole-field   
451      april           ?        ?     ?    ?                 ?  whole-field   
452       july           ?        ?     ?    ?                 ?  upper-areas   
453    october           ?        ?     ?    ?                 ?    low-areas   
454      april           ?        ?     ?    ?                 ?    scattered   

       severity   seed-tmt 

### Task 2 : `Construction of the ID3 Tree`

In [45]:
def calc_entropy(my_target):
    """Return the Shannon entropy (in bits) of a column of class labels.

    Parameters
    ----------
    my_target : pandas.Series
        The label column; each distinct value is treated as one class.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value.
    """
    n_rows = len(my_target)

    # Relative frequency of every distinct label.
    class_probs = my_target.value_counts() / n_rows

    # One p * log2(p) term per label; the entropy is their negated sum.
    terms = class_probs * np.log2(class_probs)
    return -sum(terms)

In [46]:
def calc_information_gain(data, split_attribute_name, target_name):
    """Return the information gain obtained by splitting ``data`` on one attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate.
    split_attribute_name : str
        Column whose distinct values define the candidate split.
    target_name : str
        Column holding the class label.

    Returns
    -------
    float
        Entropy of the target over all rows minus the size-weighted entropy
        of the target inside each branch of the split.
    """
    # Entropy before splitting.
    baseline = calc_entropy(data[target_name])

    n_rows = len(data)
    column = data[split_attribute_name]

    # Entropy left after the split: each branch counts in proportion to its row share.
    remainder = sum(
        (len(branch) / n_rows) * calc_entropy(branch[target_name])
        for branch in (data[column == level] for level in np.unique(column))
    )

    return baseline - remainder

In [47]:
def best_split(data, features, target_name):
    """Return the feature in ``features`` with the highest information gain.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate.
    features : list
        Candidate column names.
    target_name : str
        Column holding the class label.

    Returns
    -------
    The element of ``features`` whose split scores highest.
    """
    gains = []
    for candidate in features:
        gains.append(calc_information_gain(data, candidate, target_name))

    # np.argmax returns the first maximum, so ties go to the earlier feature.
    return features[np.argmax(gains)]

In [48]:
def split_data(data, split_attribute_name):
    """Partition ``data`` by the distinct values of one column.

    Returns a dict mapping each distinct value of ``split_attribute_name``
    (in order of first appearance) to the rows carrying that value. Rows
    containing a missing value in any column are dropped from each partition.
    """
    column = data[split_attribute_name]
    partitions = {}
    for level in column.unique():
        partitions[level] = data.loc[column == level].dropna()
    return partitions

In [49]:
def build_tree(data, features, target_name):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows for this node.
    features : list
        Column names still available for splitting.
    target_name : str
        Column holding the class label.

    Returns
    -------
    A class label for a leaf, otherwise ``{feature: {value: subtree, ...}}``.
    """
    labels, label_counts = np.unique(data[target_name], return_counts=True)

    # Pure node: every row carries the same class, so emit a leaf.
    if len(labels) == 1:
        return labels[0]

    # Nothing left to split on: fall back to the most frequent class.
    if len(features) == 0:
        return labels[np.argmax(label_counts)]

    split_feature = best_split(data, features, target_name)

    # A feature is used at most once along any root-to-leaf path.
    candidates_left = [name for name in features if name != split_feature]

    # One subtree per distinct value of the chosen feature.
    branches = {
        level: build_tree(partition, candidates_left, target_name)
        for level, partition in split_data(data, split_feature).items()
    }
    return {split_feature: branches}

In [50]:
def predict(query, tree, default=None):
    """Classify one sample by walking a nested-dict decision tree.

    Parameters
    ----------
    query : mapping
        Attribute name -> attribute value for the sample (a dict or a pandas
        Series row). Only ``keys()`` and item access are used.
    tree : dict or leaf label
        Either a leaf (any non-dict value) or a node of the form
        ``{attribute: {value: subtree, ...}}``.
    default : object, optional
        Value returned when the sample cannot be routed to a leaf, i.e. the
        node tests no attribute present in ``query`` or the sample's value has
        no branch in the tree. Defaults to ``None``.

    Returns
    -------
    The leaf label reached by the sample, or ``default``.
    """
    node = tree

    # Descend until a leaf (anything that is not a dict) is reached. A tree
    # that is itself a bare label is returned as-is.
    while isinstance(node, dict):
        # Pick the first attribute of the query that this node tests.
        for attribute in query.keys():
            if attribute in node:
                break
        else:
            return default

        branches = node[attribute]
        value = query[attribute]

        # Attribute value never seen while building the tree: no branch to follow.
        if value not in branches:
            return default

        node = branches[value]

    return node

### Task 3 : `Displaying the ID3 Tree`

In [71]:
def convert_keys_to_strings(dictionary):
    """Return a copy of a nested dict that is safe to serialise as JSON.

    Every key, at every nesting level, is converted to ``str``. NumPy scalar
    keys are first unwrapped to the matching Python scalar so the string form
    is that of the plain Python value. NumPy scalar leaf values are unwrapped
    as well, because ``json`` cannot serialise NumPy numeric or boolean scalars.

    Parameters
    ----------
    dictionary : dict
        Possibly nested dictionary, e.g. a decision tree.

    Returns
    -------
    dict
        New dictionary; the input is not modified.
    """
    converted = {}

    for key, value in dictionary.items():
        # np.generic is the base class of every NumPy scalar (np.bool_ included).
        # .item() replaces np.asscalar, which was removed from NumPy.
        if isinstance(key, np.generic):
            key = key.item()

        if isinstance(value, dict):
            # Recurse into nested dictionaries.
            value = convert_keys_to_strings(value)
        elif isinstance(value, np.generic):
            value = value.item()

        converted[str(key)] = value

    return converted


In [72]:
# Every column except the last is a feature; the last column is assumed to be the target.
column_names = data.columns.tolist()
features, target_name = column_names[:-1], column_names[-1]

# Fit the ID3 tree on the training frame.
tree = build_tree(data, features, target_name)

# Make the tree JSON-friendly once, then reuse it for the file and the printout.
tree_as_json = convert_keys_to_strings(tree)

with open("soybean_id3_tree.json", "w") as outfile:
    json.dump(tree_as_json, outfile)

# Pretty-print the ID3 tree.
print(json.dumps(tree_as_json, indent=4))

{
    "fruit-spots": {
        "dna": {
            "canker-lesion": {
                "brown": {
                    "temp": {
                        "norm": "diaporthe-stem-canker",
                        "lt-norm": "rhizoctonia-root-rot"
                    }
                },
                "dna": "diaporthe-stem-canker",
                "dk-brown-blk": "phytophthora-rot",
                "tan": {
                    "int-discolor": {
                        "black": "charcoal-rot",
                        "brown": "brown-stem-rot"
                    }
                }
            }
        },
        "?": {
            "temp": {
                "norm": "phytophthora-rot",
                "gt-norm": "phytophthora-rot",
                "lt-norm": "herbicide-injury",
                "?": {
                    "crop-hist": {
                        "same-lst-sev-yrs": "cyst-nematode",
                        "same-lst-yr": "cyst-nematode",
                        "same-lst-two-y

### Task 4 : `Define the Confusion Matrix denoted as m_app`

In [77]:
def compute_confusion_matrix(data, tree, target_name, labels=None):
    """Build the confusion matrix of ``tree`` evaluated on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples to classify; must contain the ``target_name`` column. Any row
        index is accepted because rows are processed by position.
    tree : dict
        Decision tree passed to ``predict``.
    target_name : str
        Column holding the true class.
    labels : sequence, optional
        Classes, in the order used for the rows and columns of the matrix.
        Defaults to the distinct values of ``data[target_name]`` in order of
        first appearance. Pass the same sequence for several datasets to get
        matrices that can be compared cell by cell.

    Returns
    -------
    numpy.ndarray
        Integer matrix of shape ``(len(labels), len(labels))`` where cell
        ``[i, j]`` counts the samples of true class ``labels[i]`` that were
        predicted as ``labels[j]``.

    Raises
    ------
    IndexError
        If a true or predicted class is not among ``labels``.
    """
    if labels is None:
        labels = data[target_name].unique()
    labels = list(labels)

    # Class -> row/column position, replacing a per-row np.where scan.
    position = {label: idx for idx, label in enumerate(labels)}
    confusion_matrix = np.zeros((len(labels), len(labels)), dtype=int)

    # Iterate rows and true classes together by position, so the result does
    # not depend on how the DataFrame index is labelled.
    for (row_label, row), true_class in zip(data.iterrows(), data[target_name]):
        predicted_class = predict(row, tree)

        for role, found in (("true", true_class), ("predicted", predicted_class)):
            if found not in position:
                raise IndexError(
                    f"row {row_label!r}: {role} class {found!r} is not among the known classes"
                )

        confusion_matrix[position[true_class], position[predicted_class]] += 1

    return confusion_matrix

# Confusion matrix of the ID3 tree on the training frame `data`
# (rows index the true class, columns the predicted class).
m_app = compute_confusion_matrix(data, tree, target_name)

print("Confusion Matrix for Training (m_app):\n", m_app)


Confusion Matrix for Training (m_app):
 [[13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 59  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 61  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 61  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 30  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 29  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0 59  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10  0  0  0]


## 2- Prediction Step 

### Task 1 : `Read soybean-pre csv file`

In [74]:
# NOTE(review): the section heading refers to a "soybean-pre" file, but this path is the
# same training CSV loaded at the top of the notebook — confirm the intended prediction file.
file_2 = "data/soybean-app.csv"
s_pred = pd.read_csv(file_2)
# Rebinds the notebook-level `features` / `target_name` from the prediction frame's columns
# (all columns but the last are features; the last one is the target).
features = s_pred.columns.tolist()[:-1] 
target_name = s_pred.columns.tolist()[-1]

### Task 2 : `Predict the target attribute of s_pre`

In [78]:
# Confusion matrix of the ID3 tree (built in the training step) on the prediction frame `s_pred`.
m_pre = compute_confusion_matrix(s_pred, tree, target_name)

print("Confusion Matrix for predicting (m_pre):\n", m_pre)

Confusion Matrix for predicting (m_pre):
 [[13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 59  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 61  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 61  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 30  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 29  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0 59  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10  0  0  0

## 3- `Upgrading the ID3 tree algorithm using the C4.5 approach` 

In [89]:
def info_gain_ratio(data, split_attribute_name, target_name):
    """Return the information gain ratio of splitting ``data`` on one attribute.

    The gain ratio is the information gain of the split divided by the split
    information (the entropy of the attribute's own value distribution).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate.
    split_attribute_name : str
        Column whose distinct values define the candidate split.
    target_name : str
        Column holding the class label.

    Returns
    -------
    float or int
        ``gain / split_info``, or ``0`` when the split information is zero.
    """
    baseline_entropy = calc_entropy(data[target_name])

    # Distinct values of the attribute and the share of rows each one covers.
    column = data[split_attribute_name]
    levels, level_counts = np.unique(column, return_counts=True)
    shares = level_counts / len(data)

    # Target entropy remaining after the split, weighted by branch size.
    remainder = 0
    for level, share in zip(levels, shares):
        remainder += share * calc_entropy(data[column == level][target_name])

    gain = baseline_entropy - remainder

    # Split information: entropy of the attribute's value distribution.
    split_info = -np.sum(shares * np.log2(shares))

    # Zero split information would mean dividing by zero below.
    if split_info == 0:
        return 0

    return gain / split_info

In [90]:
def best_split_c45(data, features, target_name):
    """Return the feature in ``features`` with the highest information gain ratio.

    This is the C4.5 counterpart of ``best_split``: candidates are scored with
    ``info_gain_ratio`` instead of plain information gain.
    """
    ratios = []
    for candidate in features:
        ratios.append(info_gain_ratio(data, candidate, target_name))

    # np.argmax returns the first maximum, so ties go to the earlier feature.
    return features[np.argmax(ratios)]

In [91]:
def build_tree_c45(data, features, target_name, tree=None):
    """Recursively build a decision tree using the C4.5 gain-ratio criterion.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows for this node.
    features : list
        Column names still available for splitting.
    target_name : str
        Column holding the class label.
    tree : dict, optional
        Dictionary to add this node to; a new one is created when omitted.
        Recursive calls always start from a fresh dictionary.

    Returns
    -------
    A class label for a leaf, otherwise ``tree`` with the entry
    ``{best_feature: {value: subtree, ...}}`` added.
    """
    unique_classes, class_counts = np.unique(data[target_name], return_counts=True)

    # Pure node: every row carries the same class.
    if len(unique_classes) == 1:
        return unique_classes[0]

    # No feature left to split on: answer with the majority class, matching
    # build_tree, instead of whichever class happens to sort first.
    if len(features) == 0:
        return unique_classes[np.argmax(class_counts)]

    if tree is None:
        tree = {}

    # Split on the feature with the highest gain ratio.
    best_feature = best_split_c45(data, features, target_name)

    # A feature is used at most once along any root-to-leaf path.
    remaining_features = [feature for feature in features if feature != best_feature]

    # One subtree per distinct value of the chosen feature.
    tree[best_feature] = {
        value: build_tree_c45(data[data[best_feature] == value], remaining_features, target_name)
        for value in np.unique(data[best_feature])
    }

    return tree


- Apply the `C4.5 Algorithm` to the dataset

In [92]:
# Every column except the last is a feature; the last column is the target.
column_names = data.columns.tolist()
features, target_name = column_names[:-1], column_names[-1]

# Fit the C4.5 tree on the training frame.
tree_c45 = build_tree_c45(data, features, target_name)

# Make the tree JSON-friendly once, then reuse it for the file and the printout.
tree_c45_as_json = convert_keys_to_strings(tree_c45)

with open("soybean_c4_5_tree.json", "w") as outfile:
    json.dump(tree_c45_as_json, outfile)

# Pretty-print the C4.5 tree.
print(json.dumps(tree_c45_as_json, indent=4))

NameError: name 'entropy' is not defined