In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif, chi2
import numpy as np

# Sample dataset: each entry pairs a document with its class label
# (1 = Sports, 0 = Politics).
# NOTE(review): 'goal' occurs in two Politics documents here, while the
# recorded output further down (e.g. Gini 0.0 for 'goal') looks like it was
# produced from a corpus where 'goal' was Sports-only -- confirm and re-run.
corpus = [
    ("goal football win", 1),
    ("goal match team", 1),
    ("politics goal government election", 0),
    ("government policy election", 0),
    ("goal team sports", 1),
    ("sports match win", 1),
    ("government politics policy", 0),
    ("election goal campaign government", 0),
]
documents = [text for text, _ in corpus]
labels = [label for _, label in corpus]

# Step 1: Bag-of-Words Representation
# Rows of X are documents, columns are vocabulary words, entries are counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
features = vectorizer.get_feature_names_out()  # column index -> word
print("Features:", features)

# Step 2: Gini Index
def gini_index(word_index, X, labels):
    """Return the Gini impurity of the class labels of documents containing a word.

    A document "contains" the word when its count in column ``word_index`` is
    greater than zero, so a word repeated inside one document still counts
    that document exactly once. The impurity is ``1 - sum(p_c ** 2)``, where
    ``p_c`` is the fraction of those documents that carry class ``c``.
    0.0 means every document containing the word has the same class.

    Args:
        word_index: Column of ``X`` holding the word's counts.
        X: Document-term matrix (documents in rows). Sparse matrices exposing
            ``toarray()`` and plain dense arrays are both accepted.
        labels: One class label per document, in row order.

    Returns:
        The Gini impurity as a float, or ``nan`` when no document contains
        the word (the class proportions are undefined in that case).
    """
    column = X[:, word_index]
    if hasattr(column, "toarray"):  # sparse column -> dense
        column = column.toarray()
    contains_word = np.asarray(column).ravel() > 0

    n_containing = np.count_nonzero(contains_word)
    if n_containing == 0:
        # Avoid a 0/0 division (and its RuntimeWarning); the result is undefined.
        return float("nan")

    # Class distribution restricted to the documents that contain the word.
    _, class_counts = np.unique(
        np.asarray(labels)[contains_word], return_counts=True
    )
    proportions = class_counts / n_containing
    return 1.0 - np.sum(proportions ** 2)

# Example: Gini Index for 'goal'
goal_column = list(features).index("goal")  # vocabulary position of 'goal'
gini_goal = gini_index(goal_column, X, labels)
print("Gini Index for 'goal':", gini_goal)

# Step 3: Information Gain (Mutual Information)
# Word counts are discrete, hence discrete_features=True. One score is
# returned per vocabulary column, in the same order as `features`.
# The scores are in nats (natural log), so a perfect binary split gives ln 2.
info_gain = mutual_info_classif(X, labels, discrete_features=True)
print("\nInformation Gain for each feature:")
for position, feature in enumerate(features):
    ig = info_gain[position]
    print(f"{feature}: {ig}")

# Step 4: Mutual Information (Specific to a class)
def mutual_information(word_index, X, labels):
    """Return the pointwise mutual information between a word and class 1.

    Computes ``log2(P(word, class=1) / (P(word) * P(class=1)))`` where every
    probability is a fraction of documents. A document "contains" the word
    when its count in column ``word_index`` is greater than zero, so ``P(word)``
    and the joint probability are measured on the same document-level basis
    even if a word is repeated inside a document.

    Args:
        word_index: Column of ``X`` holding the word's counts.
        X: Document-term matrix (documents in rows). Sparse matrices exposing
            ``toarray()`` and plain dense arrays are both accepted.
        labels: One class label per document, in row order; documents
            labelled ``1`` form the target class.

    Returns:
        The score in bits as a float. 0.0 is returned when the word never
        co-occurs with class 1 (where the logarithm would be undefined) and
        when there are no documents at all.
    """
    column = X[:, word_index]
    if hasattr(column, "toarray"):  # sparse column -> dense
        column = column.toarray()
    contains_word = np.asarray(column).ravel() > 0
    in_class_1 = np.asarray(labels) == 1

    total_docs = contains_word.size
    if total_docs == 0:
        return 0.0

    p_word = np.count_nonzero(contains_word) / total_docs
    p_class_1 = np.count_nonzero(in_class_1) / total_docs
    p_word_class_1 = np.count_nonzero(contains_word & in_class_1) / total_docs

    # A zero joint probability also covers p_word == 0 and p_class_1 == 0,
    # so the division below is always safe.
    if p_word_class_1 == 0:
        return 0.0
    return np.log2(p_word_class_1 / (p_word * p_class_1))

# Example: Mutual Information for 'goal'
goal_position = list(features).index("goal")  # vocabulary position of 'goal'
mi_goal = mutual_information(goal_position, X, labels)
print(f"\nMutual Information for 'goal': {mi_goal}")

# Step 5: Chi-Square (χ²)
# chi2 returns one statistic and one p-value per vocabulary column, in the
# same order as `features`; only the statistics are printed here.
chi2_values, p_values = chi2(X, labels)
print("\nChi-Square (χ²) for each feature:")
for position, feature in enumerate(features):
    chi2_val = chi2_values[position]
    print(f"{feature}: {chi2_val}")


Features: ['campaign' 'election' 'football' 'goal' 'government' 'match' 'policy'
 'politics' 'sports' 'team' 'win']
Gini Index for 'goal': 0.0

Information Gain for each feature:
campaign: 0.09560258894703255
election: 0.38039566584857787
football: 0.09560258894703255
goal: 0.38039566584857787
government: 0.6931471805599452
match: 0.21576155433883548
policy: 0.21576155433883548
politics: 0.21576155433883548
sports: 0.21576155433883548
team: 0.21576155433883548
win: 0.21576155433883548

Mutual Information for 'goal': 1.0

Chi-Square (χ²) for each feature:
campaign: 1.0
election: 3.0
football: 1.0
goal: 3.0
government: 4.0
match: 2.0
policy: 2.0
politics: 2.0
sports: 2.0
team: 2.0
win: 2.0


Key Insights:
Features:

The feature names correspond to the unique words found in the documents. For instance, goal, football, election, sports, and politics are all present in the dataset.
Gini Index for 'goal':

The Gini index for 'goal' is 0.0, the lowest possible value. A Gini index of 0.0 means that the documents containing 'goal' are perfectly pure: all of them belong to a single class. The word therefore discriminates strongly between the classes (Sports vs Politics) in this dataset.
Information Gain for Each Feature:

The Information Gain values are used to measure how well each feature (word) can distinguish between the classes. Higher values mean the feature is more informative.
For instance, the word government has the highest information gain (0.6931), indicating it is a strong feature for classification between the classes. The words election and goal come next (~0.3804), whereas campaign and football have the lowest values (~0.0956), suggesting they are the least informative.
Mutual Information for 'goal':

The Mutual Information for the word 'goal' is 1.0. This is the pointwise mutual information between 'goal' and the Sports class, measured in bits: a value of 1.0 means 'goal' co-occurs with Sports twice as often as it would if the word and the class were independent. Because half of the documents are Sports, this indicates that knowing the word 'goal' is present guarantees knowledge of the class (Sports).
Chi-Square (χ²) for Each Feature:

The Chi-Square values represent how much the occurrence of a feature (word) deviates from what we would expect if the feature was independent of the class label.
The word government has the highest χ² value (4.0), indicating that it has a stronger association with the class compared to other words with lower χ² values (2.0 for words like match, policy, sports, etc.).
Interpretation:
Gini Index (0.0 for 'goal'): This is a perfect indicator of class, meaning all occurrences of 'goal' belong to a single class. In many real cases words are not this pure and have higher Gini Index values; the lower the Gini Index, the better the discriminatory power.
Information Gain: Words like government and election have the highest information gain, meaning they help the model classify more effectively between the two classes.
Mutual Information: A high value for 'goal' suggests strong dependence with the class, implying it's a useful word for classification.
Chi-Square: Words like government are significantly associated with specific classes, which is useful for classification purposes.