In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input, concatenate
from tensorflow.keras.callbacks import EarlyStopping

# Read the file with specified column names
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'label'
]

# Assuming you have your DataFrame loaded into df
# Replace the following line with your actual data loading process
df = pd.read_csv("adult.csv", header=None, names=column_names, index_col=False)

# Assume X contains features and y contains labels
X = df.drop(columns='label')
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define your numeric and categorical features
numeric_features = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week', 'education_num']
categorical_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

# Define your preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Transform the training data
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test data using the same preprocessing steps
X_test_processed = preprocessor.transform(X_test)

# Assuming vocab_size is the number of unique values in your categorical features
vocab_size = X_train['native_country'].nunique()

# Define and compile your deep learning model
embedding_dim = min((vocab_size + 1) // 2, 50)

numeric_input = Input(shape=(len(numeric_features),))
categorical_input = Input(shape=(X_train_processed.shape[1] - len(numeric_features),))

embedding_layers = []
for i in range(X_train_processed.shape[1] - len(numeric_features)):
    embedding_layers.append(Embedding(input_dim=vocab_size, output_dim=embedding_dim)(categorical_input[:, i]))

categorical_output = Flatten()(concatenate(embedding_layers))
merged = concatenate([numeric_input, categorical_output])

output = Dense(1, activation='sigmoid')(merged)

model = Sequential([numeric_input, categorical_input], output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train your model
model.fit([X_train_processed[:, :len(numeric_features)], X_train_processed[:, len(numeric_features):]], y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(patience=3)])

# Evaluate on test data
y_pred = model.predict([X_test_processed[:, :len(numeric_features)], X_test_processed[:, len(numeric_features):]])
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Test Accuracy: {accuracy}')


2
4
7
13
7
14
6
11
3
5
2
3
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2



ValueError: `validation_split` is only supported for Tensors or NumPy arrays, found following types in the input: [<class 'scipy.sparse._csr.csr_matrix'>, <class 'scipy.sparse._csr.csr_matrix'>]