In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-2024-f/train_final.csv
/kaggle/input/ml-2024-f/test_final.csv


In [11]:
# Read the training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv("/kaggle/input/ml-2024-f/train_final.csv"),
    pd.read_csv("/kaggle/input/ml-2024-f/test_final.csv"),
)

In [12]:
# Fill missing values in each training column with that column's most frequent value.
for col in train_data.columns:
    if not train_data[col].isnull().any():
        continue
    modes = train_data[col].mode()
    if modes.empty:
        # Every value in the column is missing, so there is no mode to impute with.
        continue
    # mode() returns a Series (ties yield several rows); take the first entry as a scalar.
    # Passing the Series itself to fillna would align on the index and leave most NaNs untouched.
    most_common_value = modes.iloc[0]
    # Assign back explicitly: fillna(inplace=True) on a column selection is chained
    # assignment and is not guaranteed to modify train_data.
    train_data[col] = train_data[col].fillna(most_common_value)

In [13]:
# Every listed column is expanded into one indicator column per distinct value.
# NOTE(review): this includes numeric columns such as 'age' and 'fnlwgt', which is
# why the encoded training frame ends up with thousands of columns.
categorical_cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education.num',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
    'native.country',
]
# Encode both frames the same way; each is replaced by its one-hot encoded version.
train_data, test_data = (
    pd.get_dummies(frame, columns=categorical_cols)
    for frame in (train_data, test_data)
)

In [14]:
# Split the training frame into the feature matrix and the label column.
X_train = train_data.drop('income>50K', axis=1)
y_train = train_data['income>50K']
# Align the test frame to the training *feature* columns only, so it does not gain a
# fabricated all-zero 'income>50K' column. Indicator columns absent from the test set
# are added and filled with 0; columns that exist only in the test set are dropped.
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

In [15]:
# Show the column index of the encoded training frame.
print(
    "Columns in train_data:",
    train_data.columns,
)

Columns in train_data: Index(['income>50K', 'age_17', 'age_18', 'age_19', 'age_20', 'age_21',
       'age_22', 'age_23', 'age_24', 'age_25',
       ...
       'native.country_Portugal', 'native.country_Puerto-Rico',
       'native.country_Scotland', 'native.country_South',
       'native.country_Taiwan', 'native.country_Thailand',
       'native.country_Trinadad&Tobago', 'native.country_United-States',
       'native.country_Vietnam', 'native.country_Yugoslavia'],
      dtype='object', length=18359)


In [16]:
# Label column and feature matrix taken from the encoded training frame.
y = train_data['income>50K']
X = train_data.drop(columns='income>50K')
# 5-fold splitter with shuffling and a fixed seed.
# NOTE(review): kf is not used by any of the cells shown here — confirm it is still needed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [17]:
def calc_entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels (list, NumPy array or pandas Series). Any label type that
        ``np.unique`` can handle is accepted, not only non-negative integers.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed label frequencies; ``0.0`` for
        an empty input.
    """
    n_samples = len(y)
    if n_samples == 0:
        return 0.0
    # np.unique only reports labels that occur, so every probability is > 0
    # and log2 is always defined.
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / n_samples
    # Uses np.log2 so the function does not depend on `math` being imported elsewhere.
    return -np.sum(probabilities * np.log2(probabilities))

In [18]:
def info_gain(data, attribute, target):
    """Return the information gain from splitting ``target`` on ``data[attribute]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the column ``attribute``.
    attribute : hashable
        Name of the column to split on.
    target : pandas.Series
        Labels, indexed so that a boolean mask built from ``data[attribute]``
        selects the matching rows.

    Returns
    -------
    float
        Entropy of ``target`` minus the size-weighted entropy of the label
        subsets produced by each distinct value of the attribute.
    """
    # The original called `calculate_entropy`, which is not defined in this notebook.
    total_entropy = calc_entropy(target)

    column = data[attribute]
    values, counts = np.unique(column, return_counts=True)
    weights = counts / counts.sum()
    weighted_entropy = sum(
        weight * calc_entropy(target[column == value])
        for weight, value in zip(weights, values)
    )

    return total_entropy - weighted_entropy

In [19]:
def build_tree(data, attributes, target, depth=0, max_depth=5):
    """Recursively grow a decision tree by splitting on the highest-gain attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature rows for the current node.
    attributes : sequence
        Column names still available for splitting.
    target : pandas.Series
        Labels for the rows in ``data``, selectable by ``data``'s index.
    depth : int
        Depth of the current node (0 at the root).
    max_depth : int
        Depth at which splitting stops and a leaf is returned.

    Returns
    -------
    dict or scalar
        A leaf label, or ``{attribute: {value: subtree, ...}}`` for a split node.
    """
    distinct_labels = np.unique(target)
    # Pure node: every row has the same label.
    if len(distinct_labels) == 1:
        return distinct_labels[0]
    # Nothing left to split on, or depth limit reached: fall back to the majority label.
    if len(attributes) == 0 or depth >= max_depth:
        return target.mode()[0]

    # Pick the attribute with the largest information gain (first one wins ties).
    best_attr = max(attributes, key=lambda attr: info_gain(data, attr, target))
    remaining = [attr for attr in attributes if attr != best_attr]

    branches = {}
    for value in np.unique(data[best_attr]):
        subset = data[data[best_attr] == value]
        branches[value] = build_tree(
            subset, remaining, target[subset.index], depth + 1, max_depth
        )

    return {best_attr: branches}

In [20]:
def predict(tree, example):
    """Walk ``tree`` using the attribute values in ``example`` and return the leaf label.

    ``tree`` is either a leaf value or a single-key dict ``{attribute: {value: subtree}}``.
    ``example`` must support ``example[attribute]`` lookup. If the example's value has
    no branch at some node, 0 is returned.
    """
    node = tree
    while isinstance(node, dict):
        # A split node holds exactly one key: the attribute it tests.
        attribute = next(iter(node))
        node = node[attribute].get(example[attribute])
        if node is None:
            # Value never seen at this node during training: default to class 0.
            return 0
    return node

In [22]:
import math
# Grow the decision tree on all training features, using build_tree's default depth limit.
tree = build_tree(data=X_train, attributes=X_train.columns, target=y_train)

In [23]:
# Classify every test row by walking the fitted tree.
predictions = [
    predict(tree, features)
    for _row_label, features in test_data.iterrows()
]

In [27]:
# Assemble the submission table: IDs are 1..N in test-row order.
# NOTE(review): IDs are generated here rather than read from the test file —
# confirm that matches the IDs the competition expects.
submission = pd.DataFrame(
    {
        'ID': np.arange(1, len(test_data) + 1),
        'Prediction': predictions,
    }
)

In [28]:
# Write the ID and Prediction columns to the Kaggle working directory, without the index.
submission.to_csv(
    '/kaggle/working/submission.csv',
    columns=['ID', 'Prediction'],
    index=False,
)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
