In [1]:
# Define the dataset
data = [
    ['<=30', 'high', 'no', 'fair', 'no'],
    ['<=30', 'high', 'no', 'excellent', 'no'],
    ['31...40', 'high', 'no', 'fair', 'yes'],
    ['>40', 'medium', 'no', 'fair', 'yes'],
    ['>40', 'low', 'yes', 'fair', 'yes'],
    ['>40', 'low', 'yes', 'excellent', 'no'],
    ['31...40', 'low', 'yes', 'excellent', 'yes'],
    ['<=30', 'medium', 'no', 'fair', 'no'],
    ['<=30', 'low', 'yes', 'fair', 'yes'],
    ['>40', 'medium', 'yes', 'fair', 'yes'],
    ['<=30', 'medium', 'yes', 'excellent', 'yes'],
    ['31...40', 'medium', 'no', 'excellent', 'yes'],
    ['31...40', 'high', 'yes', 'fair', 'yes'],
    ['>40', 'medium', 'no', 'excellent', 'no']
]

# Count instances for each class
class_counts = {'yes': 0, 'no': 0}
for instance in data:
    buys_computer = instance[-1]
    class_counts[buys_computer] += 1

# Calculate prior probabilities
total_instances = len(data)
prior_prob_yes = class_counts['yes'] / total_instances
prior_prob_no = class_counts['no'] / total_instances

# Print the results
print(f'Prior Probability for buys_computer = yes: {prior_prob_yes:.3f}')
print(f'Prior Probability for buys_computer = no: {prior_prob_no:.3f}')


Prior Probability for buys_computer = yes: 0.643
Prior Probability for buys_computer = no: 0.357


In [6]:
from scipy.stats import norm
import numpy as np

# Define the dataset
# (your dataset definition here)

# Separate data by class
data_by_class = {'yes': [], 'no': []}
for instance in data:
    buys_computer = instance[-1]
    data_by_class[buys_computer].append(instance)

# Calculate class conditional densities for each feature and class
class_conditional_densities = {}
for feature_index in range(len(data[0]) - 1):  # Exclude the target class in the last column
    for buys_computer in ['yes', 'no']:
        feature_values = [instance[feature_index] for instance in data_by_class[buys_computer]]
        unique_values = set(feature_values)

        for value in unique_values:
            # Filter out '31...40' and non-numeric values
            feature_data = [float(instance[feature_index]) for instance in data_by_class[buys_computer] if instance[feature_index] == value and instance[feature_index].replace('.', '').replace('...','').isdigit() and instance[feature_index] != '31...40']

            if feature_data:
                # For continuous features, fit a normal distribution
                pdf_params = norm.fit(feature_data)
                pdf = norm.pdf(np.linspace(min(feature_data), max(feature_data), 100), *pdf_params)

                # Check if any class conditional density has zero values
                if any(pdf == 0):
                    print(f"Warning: Class conditional density for feature {feature_index + 1}, value {value}, class {buys_computer} has zero values.")

                class_conditional_densities[(feature_index + 1, value, buys_computer)] = pdf
            else:
                # For discrete features, count occurrences
                count = feature_values.count(value)
                class_conditional_densities[(feature_index + 1, value, buys_computer)] = count

# Print the results (class conditional densities)
for key, value in class_conditional_densities.items():
    print(f"Class conditional density for feature {key[0]}, value {key[1]}, class {key[2]}:")
    print(value)
    print()


Class conditional density for feature 1, value 31...40, class yes:
4

Class conditional density for feature 1, value >40, class yes:
3

Class conditional density for feature 1, value <=30, class yes:
2

Class conditional density for feature 1, value >40, class no:
2

Class conditional density for feature 1, value <=30, class no:
3

Class conditional density for feature 2, value high, class yes:
2

Class conditional density for feature 2, value medium, class yes:
4

Class conditional density for feature 2, value low, class yes:
3

Class conditional density for feature 2, value high, class no:
2

Class conditional density for feature 2, value medium, class no:
2

Class conditional density for feature 2, value low, class no:
1

Class conditional density for feature 3, value no, class yes:
3

Class conditional density for feature 3, value yes, class yes:
6

Class conditional density for feature 3, value no, class no:
4

Class conditional density for feature 3, value yes, class no:
1

Class

In [11]:
import numpy as np
from scipy.stats import chi2_contingency

# Define the dataset
# (your dataset definition here)

# Select the relevant columns for testing independence
selected_features = [0, 1, 2, 3]  # Adjust these indices based on your dataset
selected_data = np.array([[instance[i] for i in selected_features] for instance in data])

# Create a contingency table
contingency_table = []
max_len = 0  # Initialize the maximum length
for i in range(len(selected_features)):
    feature_values = set(selected_data[:, i])
    counts = [np.sum(selected_data[:, i] == value) for value in feature_values]
    
    # Update the maximum length
    max_len = max(max_len, len(counts))
    
    contingency_table.append(counts)

# Pad the counts lists with zeros to make them uniform
contingency_table = [counts + [0] * (max_len - len(counts)) for counts in contingency_table]

# Convert the contingency table to a NumPy array
contingency_table = np.array(contingency_table)

# Perform the chi-square test of independence
chi2, p, _, _ = chi2_contingency(contingency_table)

# Print the results
print(f"Chi-Square Value: {chi2}")
print(f"P-value: {p}")

# Check the significance level (e.g., 0.05)
alpha = 0.05
if p < alpha:
    print("Reject the null hypothesis: There is evidence of dependence between the features.")
else:
    print("Fail to reject the null hypothesis: There is no significant evidence of dependence between the features.")


Chi-Square Value: 11.277167277167278
P-value: 0.08017877950576319
Fail to reject the null hypothesis: There is no significant evidence of dependence between the features.
