<a href="https://colab.research.google.com/github/SanjanaSuresh30/ML_LAB_1BM22CS239/blob/main/1BM22CS239_ML_LAB2_ID3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import math
from collections import Counter

# Read the training examples from the CSV file
df = pd.read_csv("/content/id3.csv")

# Discard every row that contains at least one missing value
df = df.dropna()

def entropy(data, target_attribute="label"):
    """Return the Shannon entropy, in bits, of a dataset's class labels.

    Args:
        data: DataFrame holding the examples.
        target_attribute: Name of the column that contains the class labels.
            Defaults to ``"label"``, so existing one-argument calls behave
            as before.

    Returns:
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of each
        distinct label. The result is ``0`` when ``data`` has no rows.

    Raises:
        KeyError: If ``target_attribute`` is not a column of ``data``.
    """
    labels = data[target_attribute].tolist()
    total = len(labels)
    # Counter only stores labels that actually occur, so every probability
    # is strictly positive and log2 is always defined.
    probabilities = [count / total for count in Counter(labels).values()]
    return -sum(p * math.log2(p) for p in probabilities)

def gain(data, feature):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of the whole dataset minus the entropy that
    remains after partitioning the rows by each distinct value of
    ``feature``, where every partition is weighted by its share of the rows.
    """
    baseline = entropy(data)
    column = data[feature]
    row_count = len(data)

    # Entropy left over after the split, accumulated partition by partition
    # in the order the values first appear in the column.
    partitions = (data[column == level] for level in column.unique())
    remainder = sum(
        (len(part) / row_count) * entropy(part) for part in partitions
    )
    return baseline - remainder

def _label_entropy(data, target_attribute):
    """Return the Shannon entropy (bits) of ``data[target_attribute]``; 0 if empty."""
    labels = data[target_attribute].tolist()
    total = len(labels)
    probabilities = [count / total for count in Counter(labels).values()]
    return -sum(p * math.log2(p) for p in probabilities)


def _split_entropy(data, feature, target_attribute):
    """Return the weighted label entropy left after partitioning on ``feature``."""
    total = len(data)
    remainder = 0
    for value in data[feature].unique():
        subset = data[data[feature] == value]
        remainder += (len(subset) / total) * _label_entropy(subset, target_attribute)
    return remainder


def id3(data, features, target_attribute):
    """Build an ID3 decision tree as nested dictionaries.

    Args:
        data: DataFrame with one row per training example.
        features: Names of the columns that may still be used for splitting.
        target_attribute: Name of the column holding the class labels. The
            information gain is measured against this column, so it may have
            any name.

    Returns:
        A leaf, which is a single label value, when all rows share one label
        or no features remain (the most common label is used in that case).
        Otherwise a dict of the form
        ``{best_feature: {feature_value: subtree_or_leaf, ...}}``.
    """
    labels = data[target_attribute]

    # Pure node: every example carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Nothing left to split on: fall back to the majority label.
    if len(features) == 0:
        return labels.mode()[0]

    # The entropy before splitting is the same for every candidate feature,
    # so compute it once instead of once per feature.
    base_entropy = _label_entropy(data, target_attribute)
    best_feature = max(
        features,
        key=lambda feature: base_entropy
        - _split_entropy(data, feature, target_attribute),
    )

    remaining = [f for f in features if f != best_feature]
    branches = {}
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value].drop(columns=[best_feature])
        if subset.empty:
            # Reached only when `value` matches no row under `==`, e.g. a
            # NaN reported by unique(); use the majority label of this node.
            branches[value] = labels.mode()[0]
        else:
            branches[value] = id3(subset, remaining, target_attribute)

    return {best_feature: branches}

# The column to predict; every other column is a candidate split feature
target_attribute = "label"
features = df.columns[df.columns != target_attribute].tolist()

# Build the ID3 tree from the full dataset and display it
decision_tree = id3(df, features, target_attribute)
print(decision_tree)


{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}, 'overcast': 'yes', 'rainy': {'wind': {'weak': 'yes', 'strong': 'no'}}}}


In [4]:
import pandas as pd

# Column order of the sample dataset
column_names = ['outlook', 'temperature', 'humidity', 'wind', 'label']

# One tuple per observation, with fields in the same order as column_names
records = [
    ('sunny', 'hot', 'high', 'weak', 'no'),
    ('sunny', 'hot', 'high', 'strong', 'no'),
    ('overcast', 'hot', 'high', 'weak', 'yes'),
    ('rainy', 'mild', 'high', 'weak', 'yes'),
    ('rainy', 'cool', 'normal', 'weak', 'yes'),
    ('rainy', 'cool', 'normal', 'strong', 'no'),
    ('overcast', 'cool', 'normal', 'strong', 'yes'),
    ('sunny', 'mild', 'high', 'weak', 'no'),
    ('sunny', 'cool', 'normal', 'weak', 'yes'),
    ('rainy', 'mild', 'normal', 'weak', 'yes'),
    ('sunny', 'mild', 'normal', 'strong', 'yes'),
    ('overcast', 'mild', 'high', 'strong', 'yes'),
    ('overcast', 'hot', 'normal', 'weak', 'yes'),
    ('rainy', 'mild', 'high', 'strong', 'no'),
]

# Pivot the records into a mapping of column name -> list of column values
data = {name: list(values) for name, values in zip(column_names, zip(*records))}

# Build the DataFrame from the column mapping
df = pd.DataFrame(data)

# Write the table to disk without the row index
df.to_csv("dataset.csv", index=False)

print("CSV file 'dataset.csv' created successfully.")


CSV file 'dataset.csv' created successfully.
