<a href="https://colab.research.google.com/github/Pallavi20004/level2/blob/main/Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math

import pandas as pd

# Toy "Play Tennis" dataset: ten observations, four categorical features
# and the target column 'Play Tennis'. Each list holds one column.
data = {
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain',
        'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool',
        'Cool', 'Mild', 'Mild', 'Cool', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal',
        'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Wind': [
        'Weak', 'Strong', 'Weak', 'Weak', 'Weak',
        'Strong', 'Strong', 'Weak', 'Weak', 'Weak',
    ],
    'Play Tennis': [
        'No', 'No', 'Yes', 'Yes', 'Yes',
        'No', 'Yes', 'No', 'Yes', 'Yes',
    ],
}

# Build the training DataFrame (one column per dict key) and preview it.
train_data_m = pd.DataFrame(data=data)
train_data_m.head()



Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [None]:


# 2. Calculate Entropy for entire dataset
def calc_total_entropy(train_data, label, class_list):
    """Return the base-2 Shannon entropy of the label column of ``train_data``.

    Args:
        train_data: DataFrame holding the samples.
        label: Name of the column containing the class labels.
        class_list: Iterable of the class values to account for.

    Returns:
        The sum of ``-p * log2(p)`` over every class in ``class_list`` that
        occurs at least once, where ``p`` is that class's share of all rows.
        Returns 0 when none of the classes occur (including an empty frame).
    """
    total_row = train_data.shape[0]
    total_entr = 0
    for c in class_list:
        total_class_count = int((train_data[label] == c).sum())
        if total_class_count == 0:
            # Absent classes contribute nothing; skipping them also avoids
            # log2(0) and a 0/0 division on an empty frame.
            continue
        probability = total_class_count / total_row
        # math.log2 is used because numpy is never imported in this file.
        total_entr -= probability * math.log2(probability)
    return total_entr

# 3. Calculate the entropy of the rows that share a single feature value
def calc_entropy(feature_value_data, label, class_list):
    """Return the base-2 Shannon entropy of the labels in ``feature_value_data``.

    Args:
        feature_value_data: DataFrame of the rows to measure (the caller in
            this file passes the rows sharing one value of a feature).
        label: Name of the column containing the class labels.
        class_list: Iterable of the class values to account for.

    Returns:
        The sum of ``-p * log2(p)`` over every class in ``class_list`` that
        occurs at least once, where ``p`` is that class's share of the rows.
        Returns 0 when none of the classes occur (including an empty frame).
    """
    class_count = feature_value_data.shape[0]
    entropy = 0
    for c in class_list:
        label_class_count = int((feature_value_data[label] == c).sum())
        if label_class_count == 0:
            # Skip absent classes: avoids log2(0) and dividing by zero rows.
            continue
        probability_class = label_class_count / class_count
        # math.log2 is used because numpy is never imported in this file.
        entropy -= probability_class * math.log2(probability_class)
    return entropy

# 4. Calculate Information Gain for a feature
def calc_info_gain(feature_name, train_data, label, class_list):
    """Return the information gain obtained by splitting on ``feature_name``.

    The gain is the entropy of the whole ``train_data`` minus the weighted
    average of the entropies of the subsets produced by each distinct value
    of the feature, each weighted by the fraction of rows in that subset.
    """
    n_rows = train_data.shape[0]
    column = train_data[feature_name]

    # Accumulate the size-weighted entropy of every partition of the data.
    weighted_entropy = 0.0
    for value in column.unique():
        subset = train_data[column == value]
        subset_entropy = calc_entropy(subset, label, class_list)
        weight = subset.shape[0] / n_rows
        weighted_entropy += weight * subset_entropy

    return calc_total_entropy(train_data, label, class_list) - weighted_entropy

# 5. Find the most informative feature
def find_most_informative_feature(train_data, label, class_list):
    """Return the name of the non-label column with the highest information gain.

    Ties go to the column that appears first. ``None`` is returned when no
    column yields a gain above the initial threshold of -1, which includes
    the case where ``train_data`` has no columns other than ``label``.
    """
    best_feature, best_gain = None, -1
    for candidate in train_data.columns.drop(label):
        gain = calc_info_gain(candidate, train_data, label, class_list)
        # Strict comparison keeps the earliest column on equal gains.
        if gain > best_gain:
            best_feature, best_gain = candidate, gain
    return best_feature

# 6. Generate one level of the tree: map each value of the chosen feature to a class label, or to "?" if its rows are mixed
def generate_sub_tree(feature_name, train_data, label, class_list):
    """Build a one-level mapping from each value of ``feature_name`` to a leaf.

    For every distinct value of the feature, the mapped entry is the class
    from ``class_list`` that accounts for all rows having that value, or the
    placeholder string "?" when no single class covers all of those rows.
    """
    branches = {}
    value_counts = train_data[feature_name].value_counts(sort=False)
    for value, n_rows in value_counts.items():
        subset = train_data[train_data[feature_name] == value]
        leaf = "?"  # placeholder until a class is found that covers every row
        for candidate in class_list:
            n_matching = (subset[label] == candidate).sum()
            if n_matching == n_rows:
                leaf = candidate
        branches[value] = leaf
    return branches

# 7. Generate Tree recursively
def make_tree(root, prev_feature_value, train_data, label, class_list):
    """Recursively grow the decision tree in place inside ``root``.

    Args:
        root: Dict to attach the new node to. It is mutated.
        prev_feature_value: Key in ``root`` under which the new node is
            stored, or ``None`` when building the top-level node (the node
            is then stored under the chosen feature's name).
        train_data: DataFrame with the rows that reach this node.
        label: Name of the class-label column.
        class_list: Iterable of the class values.

    Returns:
        None. The result is written into ``root``.
    """
    if train_data.shape[0] == 0:  # Base case: no more data
        return
    max_info_feature = find_most_informative_feature(train_data, label, class_list)
    if max_info_feature is None:
        # Base case: every feature has been used but the rows are still
        # mixed. Replace the "?" placeholder with the majority class so the
        # finished tree never exposes "?" as a prediction.
        if prev_feature_value is not None:
            majority = train_data[label].mode()
            if not majority.empty:
                root[prev_feature_value] = majority.iloc[0]
        return
    tree = generate_sub_tree(max_info_feature, train_data, label, class_list)

    if prev_feature_value is not None:
        root[prev_feature_value] = {max_info_feature: tree}
    else:
        root[max_info_feature] = tree

    # Iterate over a snapshot because the recursive calls overwrite entries
    # of ``tree`` while we walk it.
    for feature_value, sub_tree in list(tree.items()):
        if sub_tree == "?":
            # Drop the column just split on: within this subset it has a
            # single value (zero gain), and if it were selected again the
            # recursion would receive identical data and never terminate.
            feature_value_data = train_data[
                train_data[max_info_feature] == feature_value
            ].drop(columns=[max_info_feature])
            make_tree(tree, feature_value, feature_value_data, label, class_list)

# 8. ID3 algorithm
def id3(train_data_m, label):
    """Build and return an ID3 decision tree (nested dicts) for ``train_data_m``.

    The input frame is copied before use. The class list is taken from the
    distinct values of the ``label`` column.
    """
    working_data = train_data_m.copy()
    classes = working_data[label].unique()
    decision_tree = {}
    # make_tree fills decision_tree in place, starting from the top-level node.
    make_tree(decision_tree, None, working_data, label, classes)
    return decision_tree

# 9. Predicting from the tree
def predict(tree, instance):
    """Walk ``tree`` using the feature values in ``instance`` and return the leaf.

    ``tree`` is either a leaf value or a dict of the form
    ``{feature: {feature_value: subtree_or_leaf}}``. Returns ``None`` when
    the tree is an empty dict or when the instance's value for the tested
    feature has no branch.
    """
    node = tree
    # Descend until a non-dict (leaf) value is reached.
    while isinstance(node, dict):
        if not node:
            return None
        feature = next(iter(node))
        value = instance[feature]
        branches = node[feature]
        if value not in branches:
            return None
        node = branches[value]
    return node

# 10. Evaluate the model and compute its accuracy
def evaluate(tree, test_data_m, label):
    """Return the fraction of rows in ``test_data_m`` that ``tree`` classifies correctly.

    Args:
        tree: Decision tree in the nested-dict form accepted by ``predict``.
        test_data_m: DataFrame of samples, including the ``label`` column.
        label: Name of the column holding the true class.

    Returns:
        Accuracy as a float in [0, 1]. An empty ``test_data_m`` yields 0.0
        instead of raising ZeroDivisionError.
    """
    total = test_data_m.shape[0]
    if total == 0:
        return 0.0

    correct_predict = 0
    # Use the row yielded by iterrows() directly. The index it yields is a
    # *label*, so feeding it to .iloc (positional) reads the wrong row or
    # raises whenever the frame does not have a default 0..n-1 index.
    for _, row in test_data_m.iterrows():
        if predict(tree, row) == row[label]:
            correct_predict += 1
    return correct_predict / total

# Fit an ID3 tree on the synthetic dataset.
# Swap `train_data_m` for your own training DataFrame as needed.
tree = id3(train_data_m, label='Play Tennis')

# Measure accuracy; here the training data doubles as the test data, so
# pass a separate DataFrame for a meaningful estimate.
accuracy = evaluate(tree, test_data_m=train_data_m, label='Play Tennis')
print(f"Accuracy: {accuracy}")


Accuracy: 1.0
