In [1]:
# Step 1: Discretize the feature Temperature
import pandas as pd

# Toy weather dataset, written one observation per row for readability and
# then pivoted into the column-oriented dict that pandas consumes.
column_names = ('Temperature', 'Cloud_Cover', 'Humidity', 'Weather')
records = [
    (75, 'Sunny', 'Low', 'Sunny'),
    (80, 'Partly Cloudy', 'High', 'Sunny'),
    (85, 'Overcast', 'High', 'Rainy'),
    (70, 'Sunny', 'Medium', 'Sunny'),
    (65, 'Overcast', 'Medium', 'Stormy'),
    (60, 'Partly Cloudy', 'Low', 'Sunny'),
    (90, 'Overcast', 'High', 'Rainy'),
]
data = {name: list(column) for name, column in zip(column_names, zip(*records))}

df = pd.DataFrame(data)

# Bucket the numeric temperature into three ordered levels. pd.cut uses
# right-closed intervals by default, so 70 falls in 'Warm' and 80 in 'Hot':
#   (-inf, 70] -> Warm, (70, 80] -> Hot, (80, inf) -> Very Hot
temperature_edges = [-float('inf'), 70, 80, float('inf')]
temperature_labels = ['Warm', 'Hot', 'Very Hot']
df['Temperature_Level'] = pd.cut(
    df['Temperature'],
    bins=temperature_edges,
    labels=temperature_labels,
)

# Show the dataset together with the new categorical column
df


Unnamed: 0,Temperature,Cloud_Cover,Humidity,Weather,Temperature_Level
0,75,Sunny,Low,Sunny,Hot
1,80,Partly Cloudy,High,Sunny,Hot
2,85,Overcast,High,Rainy,Very Hot
3,70,Sunny,Medium,Sunny,Warm
4,65,Overcast,Medium,Stormy,Warm
5,60,Partly Cloudy,Low,Sunny,Warm
6,90,Overcast,High,Rainy,Very Hot


In [2]:
# Step 2: Calculate the Entropy of the target class Weather
import math

# Calculate entropy function
def calculate_entropy(values):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    Parameters
    ----------
    values : pandas.Series or any sequence accepted by ``pd.Series``
        The class labels. Missing labels (``None`` / ``NaN``) are ignored,
        so probabilities are taken over the non-missing labels only.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct labels. ``0.0`` when there
        are no (non-missing) labels.
    """
    # A missing label never compares equal to itself, so it would get a
    # probability of 0 and math.log2(0) would raise ValueError. Drop such
    # entries up front; wrapping in pd.Series also lets plain lists through.
    labels = pd.Series(values).dropna()
    total_records = len(labels)
    if total_records == 0:
        return 0.0

    entropy = 0.0
    # Iterate in order of first appearance so the floating-point summation
    # order (and therefore the exact result) is stable.
    for value in labels.unique():
        probability = int((labels == value).sum()) / total_records
        entropy -= probability * math.log2(probability)

    return entropy

# Entropy of the target column (Weather) over the whole dataset
weather_labels = df['Weather']
weather_entropy = calculate_entropy(weather_labels)
print("Entropy of Weather:", weather_entropy)


Entropy of Weather: 1.3787834934861753


In [3]:
# Step 3: Calculate the Information Gain for each feature
# Step 4: Order the features according to their Information Gain

# Calculate information gain function
def calculate_information_gain(data, feature, target):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column over all rows minus the
    weighted entropy of ``target`` within each distinct value of ``feature``,
    where each weight is the fraction of rows holding that value.

    Parameters
    ----------
    data : DataFrame-like
        Table indexed by column name that holds both columns.
    feature : column label
        Column whose distinct values define the partitions.
    target : column label
        Column whose entropy is measured.
    """
    row_count = len(data)

    # Start from the unsplit entropy and peel off each partition's share.
    gain = calculate_entropy(data[target])
    for level in data[feature].unique():
        partition = data.loc[data[feature] == level]
        partition_entropy = calculate_entropy(partition[target])
        weight = len(partition) / row_count
        gain -= weight * partition_entropy

    return gain

# Information gain of every candidate feature with respect to Weather
features = ['Cloud_Cover', 'Humidity', 'Temperature_Level']
information_gains = {
    name: calculate_information_gain(df, name, 'Weather')
    for name in features
}

# Rank the features from highest to lowest gain; the sort is stable, so
# features with equal gain keep their order from `features`.
sorted_features = sorted(
    information_gains.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report the ranking, one feature per line
print("Features and Information Gains:")
for feature, gain in sorted_features:
    print(f"{feature}: {gain}")


Features and Information Gains:
Cloud_Cover: 0.9852281360342513
Temperature_Level: 0.9852281360342513
Humidity: 0.6995138503199656
