In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-2024-f/train_final.csv
/kaggle/input/ml-2024-f/test_final.csv


In [45]:
train_data = pd.read_csv("/kaggle/input/ml-2024-f/train_final.csv")
test_data = pd.read_csv("/kaggle/input/ml-2024-f/test_final.csv")

In [47]:
for col in train_data.columns:
    if train_data[col].isnull().any():
        most_common_value = train_data[col].mode()[0]
        train_data[col].fillna(most_common_value, inplace=True)

In [48]:
for col in test_data.columns:
    if test_data[col].isnull().any():
        most_common_value = test_data[col].mode()[0]
        test_data[col].fillna(most_common_value, inplace=True)

In [49]:
categorical_cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 
                    'capital.gain', 'capital.loss', 'hours.per.week', 'native.country']
train_data = pd.get_dummies(train_data, columns=categorical_cols)
test_data = pd.get_dummies(test_data, columns=categorical_cols)

In [50]:
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [51]:
X = train_data.drop('income>50K', axis=1) 
y = train_data['income>50K']  
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [53]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test_data.drop('income>50K', axis=1).values)

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

model = Sequential([
    Input(shape=(X_train.shape[1],)),  
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid') 
])

In [57]:
model.compile(
    optimizer='adam',  
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

In [58]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,  
    batch_size=32,
    verbose=1
)

Epoch 1/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 20ms/step - accuracy: 0.8182 - loss: 0.4111 - val_accuracy: 0.7312 - val_loss: 0.5897
Epoch 2/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9406 - loss: 0.1245 - val_accuracy: 0.8298 - val_loss: 0.5382
Epoch 3/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9576 - loss: 0.0765 - val_accuracy: 0.8314 - val_loss: 0.5731
Epoch 4/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - accuracy: 0.9674 - loss: 0.0561 - val_accuracy: 0.8336 - val_loss: 0.5853
Epoch 5/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9756 - loss: 0.0457 - val_accuracy: 0.8286 - val_loss: 0.6410
Epoch 6/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - accuracy: 0.9777 - loss: 0.0403 - val_accuracy: 0.8274 - val_loss: 0.6629
Epoch 7/30
[1m6

In [59]:
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Accuracy: 0.7986


In [60]:
predictions = model.predict(X_test).flatten()

[1m746/746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step


In [62]:
submission = pd.DataFrame({
    'ID': np.arange(1, len(predictions) + 1),  
    'Prediction': predictions
})

In [64]:
submission.to_csv('/kaggle/working/submission.csv', index=False)