In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Load the training and test CSVs (in that order) into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ml-2024-f/train_final.csv",
        "/kaggle/input/ml-2024-f/test_final.csv",
    )
)

In [5]:
# Impute missing training values with each column's most frequent value.
for col in train_data.columns:
    if not train_data[col].isnull().any():
        continue
    modes = train_data[col].mode()
    if modes.empty:
        # Every entry is missing, so there is no mode to impute with.
        continue
    most_common_value = modes.iloc[0]
    # Assign the filled column back rather than calling
    # `fillna(..., inplace=True)` on the `train_data[col]` selection: that is
    # chained assignment, which warns on pandas 2.x and leaves `train_data`
    # untouched once Copy-on-Write is enabled.
    train_data[col] = train_data[col].fillna(most_common_value)

In [7]:
# Columns that are one-hot encoded below; every distinct value of each column
# becomes its own indicator column.
# NOTE(review): several of these (e.g. 'age', 'fnlwgt', 'hours.per.week') look
# numeric -- confirm that treating them as categorical is intended.
categorical_cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education.num',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
    'native.country',
]

# Encode both frames the same way; each frame is encoded independently, so the
# resulting column sets may differ between train and test.
train_data, test_data = (
    pd.get_dummies(frame, columns=categorical_cols)
    for frame in (train_data, test_data)
)

In [8]:
# Give the test frame exactly the training frame's columns, in the same order:
# columns missing from test are created and filled with 0, and columns that
# exist only in test are dropped.
test_data = test_data.reindex(train_data.columns, axis="columns", fill_value=0)

In [9]:
# Separate the label column from the feature columns.
y_train = train_data.loc[:, 'income>50K']
X_train = train_data.drop(columns=['income>50K'])

In [10]:
def calculate_mean(y):
    """Return the arithmetic mean of ``y`` as computed by ``np.mean``.

    ``build_regression_tree`` stores this value as the prediction of a leaf.
    """
    average = np.mean(y)
    return average

In [12]:
def build_regression_tree(data, attributes, target, depth=0, max_depth=5):
    """Recursively grow a multiway tree whose leaves hold the mean target.

    Parameters
    ----------
    data : frame-like
        Rows to split; indexed by attribute name and filtered with boolean
        masks. Assumed to be a pandas DataFrame (its ``.index`` is used).
    attributes : sequence
        Attribute (column) names still available for splitting.
    target : indexable
        Target values; must be indexable by the index labels of ``data``
        (presumably a pandas Series sharing ``data``'s index).
    depth : int
        Depth of the node being built (0 at the root).
    max_depth : int
        Depth at which growth stops and a leaf is produced.

    Returns
    -------
    Either a leaf (``calculate_mean(target)``) or a node of the form
    ``{attribute: {value: subtree, ...}}`` with one branch for every distinct
    value of the chosen attribute in ``data``.
    """
    # Stop when the target is pure, the depth budget is spent, or no
    # attribute is left to split on.
    target_is_pure = len(np.unique(target)) == 1
    if target_is_pure or depth >= max_depth or len(attributes) == 0:
        return calculate_mean(target)

    # Pick the attribute with the highest information gain; on ties the
    # earliest attribute wins because only a strictly larger gain replaces it.
    split_attr = attributes[0]
    top_gain = -np.inf
    for candidate in attributes:
        candidate_gain = calculate_information_gain(data, candidate, target)
        if candidate_gain > top_gain:
            split_attr, top_gain = candidate, candidate_gain

    # The chosen attribute is not reused further down this path.
    remaining = [attr for attr in attributes if attr != split_attr]

    branches = {}
    for value in np.unique(data[split_attr]):
        rows = data[data[split_attr] == value]
        branches[value] = build_regression_tree(
            rows, remaining, target[rows.index], depth + 1, max_depth
        )

    return {split_attr: branches}

In [13]:
def predict_probability(tree, example):
    """Predict the target for one example by walking a nested-dict tree.

    Parameters
    ----------
    tree : dict or scalar
        A leaf prediction (any non-dict value) or an internal node of the
        form ``{attribute: {value: subtree, ...}}``.
    example : mapping-like
        Object exposing ``.get(attribute, default)``, e.g. a dict or a pandas
        Series holding one row.

    Returns
    -------
    The value of the leaf reached. When the example's value for a split
    attribute has no matching branch (or the attribute is absent), the
    unweighted mean of the predictions of all branches of that node is
    returned instead of NaN. NaN is returned only for a node that has no
    branches at all.
    """
    missing = object()  # sentinel: distinguishes "no such branch" from any stored value
    node = tree
    while isinstance(node, dict):
        attribute = next(iter(node))
        branches = node[attribute]
        child = branches.get(example.get(attribute, None), missing)
        if child is missing:
            if not branches:
                return np.nan
            # Unseen value: average over every branch so the caller gets a
            # usable number instead of NaN.
            return np.mean(
                [predict_probability(subtree, example) for subtree in branches.values()]
            )
        node = child
    return node

In [22]:
def calculate_information_gain(data, attribute, target):
    """Return the information gain of splitting ``target`` on ``attribute``.

    The gain is the entropy of ``target`` minus the size-weighted entropy of
    the target subsets obtained by grouping rows on each distinct value of
    ``data[attribute]``. ``target`` must accept the boolean mask produced by
    comparing ``data[attribute]`` with a value.
    """
    entropy_before = calculate_entropy(target)

    column = data[attribute]
    distinct_values, occurrences = np.unique(column, return_counts=True)
    row_total = np.sum(occurrences)

    entropy_after = 0
    for value, occurrence in zip(distinct_values, occurrences):
        weight = occurrence / row_total
        entropy_after += weight * calculate_entropy(target[column == value])

    return entropy_before - entropy_after

In [23]:
def calculate_entropy(y):
    """Return the base-2 Shannon entropy of the labels in ``y``.

    ``y`` is passed to ``np.bincount``, so it must contain non-negative
    integers. Labels that never occur contribute nothing to the sum.
    """
    class_counts = np.bincount(y)
    sample_count = len(y)

    terms = []
    for count in class_counts:
        p = count / sample_count
        if p > 0:
            terms.append(p * math.log2(p))

    return -np.sum(terms)

In [24]:
# Fit the tree on the training features, offering every feature column as a
# candidate split and leaving depth/max_depth at their defaults.
tree = build_regression_tree(
    data=X_train,
    attributes=X_train.columns,
    target=y_train,
)

In [26]:
# Score the test rows one at a time, keeping the row order of `test_data`.
predictions = [
    predict_probability(tree, example)
    for _row_label, example in test_data.iterrows()
]

In [27]:
# Assemble the submission table: IDs run 1..len(test_data) in row order,
# paired with the prediction for the corresponding row.
submission = pd.DataFrame(
    data={
        'ID': np.arange(len(test_data)) + 1,
        'Prediction': predictions,
    }
)

In [28]:
# Write the ID and Prediction columns (in that order) without the index.
submission.to_csv(
    '/kaggle/working/submission.csv',
    columns=['ID', 'Prediction'],
    index=False,
)