In [97]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-2024-f/train_final.csv
/kaggle/input/ml-2024-f/test_final.csv


In [98]:
train_data = pd.read_csv("/kaggle/input/ml-2024-f/train_final.csv")
test_data = pd.read_csv("/kaggle/input/ml-2024-f/test_final.csv")

In [99]:
for col in train_data.columns:
    if train_data[col].isnull().any():
        most_common_value = train_data[col].mode()[0]
        train_data[col].fillna(most_common_value, inplace=True)

In [100]:
for col in test_data.columns:
    if test_data[col].isnull().any():
        most_common_value = test_data[col].mode()[0]
        test_data[col].fillna(most_common_value, inplace=True)

In [101]:
categorical_cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 
                    'capital.gain', 'capital.loss', 'hours.per.week', 'native.country']
train_data = pd.get_dummies(train_data, columns=categorical_cols)
test_data = pd.get_dummies(test_data, columns=categorical_cols)

In [102]:
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [103]:
X = train_data.drop('income>50K', axis=1)  
y = train_data['income>50K']  
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [104]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  

X_train = pd.DataFrame(X_train, columns=X.columns)

X_val = scaler.transform(X_val)  
X_val = pd.DataFrame(X_val, columns=X.columns)

X_test = scaler.transform(test_data.drop('income>50K', axis=1))  
X_test = pd.DataFrame(X_test, columns=X.columns)


In [124]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization

model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    BatchNormalization(),
    LeakyReLU(negative_slope=0.2),
    #Dropout(0.1),  
    Dense(64, kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    BatchNormalization(),
    LeakyReLU(negative_slope=0.2),
    #Dropout(0.1),  
    Dense(32, kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    LeakyReLU(negative_slope=0.2),
    Dense(1, activation='sigmoid')
])


In [131]:
from tensorflow.keras.optimizers import Nadam
model.compile(
    optimizer=Nadam(learning_rate=0.0005), 
    loss='binary_crossentropy',
    metrics=['accuracy']
)
#from tensorflow.keras.callbacks import EarlyStopping
#early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [132]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30, 
    batch_size=32,
    verbose=1
)

Epoch 1/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 28ms/step - accuracy: 0.9207 - loss: 0.2278 - val_accuracy: 0.8604 - val_loss: 0.3413
Epoch 2/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 26ms/step - accuracy: 0.9548 - loss: 0.1711 - val_accuracy: 0.8508 - val_loss: 0.4857
Epoch 3/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.9775 - loss: 0.1191 - val_accuracy: 0.8528 - val_loss: 0.5506
Epoch 4/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.9809 - loss: 0.0978 - val_accuracy: 0.8520 - val_loss: 0.6425
Epoch 5/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.9809 - loss: 0.0948 - val_accuracy: 0.8526 - val_loss: 0.6640
Epoch 6/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.9799 - loss: 0.0883 - val_accuracy: 0.8488 - val_loss: 0.6540
Epoch 7/30
[1m6

In [133]:
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Accuracy: 0.8580


In [134]:
predictions = model.predict(X_test).flatten()

[1m746/746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step


In [135]:
submission = pd.DataFrame({
    'ID': np.arange(1, len(predictions) + 1),  
    'Prediction': predictions
})