In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy

# --- 1. Load and Clean Data ---
# Load the dataset. We assume the 'AdultUCI.csv' file is in the same directory.
# The data description suggests some columns might have trailing spaces/periods.
# We will use the 'na_values' parameter to treat ' ?' (common in this dataset) as NaN,
# although the initial snippet didn't show it, it is a common issue with this dataset.
try:
    # Using the 'python' engine for potentially better handling of irregular lines
    df = pd.read_csv('AdultUCI.csv', skipinitialspace=True, na_values=['?'], engine='python')
except FileNotFoundError:
    print("Error: 'AdultUCI.csv' not found. Please ensure the file is in the correct path.")
    exit()

# The dataset snippet shows 'NA' in the income column for some rows.
# Let's check for and remove rows with missing values, as requested.
# We strip whitespace from all string columns to ensure proper cleaning and encoding.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()
    # Replace ' ?' values found in some columns with NaN
    df[col] = df[col].replace('?', np.nan)

# Drop all cases (rows) with empty values (NaNs)
df_cleaned = df.dropna()
print(f"Original shape: {df.shape}")
print(f"Cleaned shape after dropping NaNs: {df_cleaned.shape}")

# Separate features (X) and target (y)
X = df_cleaned.drop('income', axis=1)
# The target variable needs to be binary: 1 for >50K, 0 for <=50K.
# Based on the snippet 'small' seems to be <=50K and 'large' or implied >50K.
# We'll map the income column to 0 and 1. 'small' or '<=50K' to 0, and the rest ('>50K') to 1.
# The actual labels in this specific file are often '<=50K' and '>50K', but
# the snippet shows 'small'. We'll check the unique values and map them.
if '<=50K' in df_cleaned['income'].unique():
    # Use standard labels if they exist
    y = (df_cleaned['income'] == '>50K').astype(int)
else:
    # Use 'small' as a proxy for the lower income if standard labels aren't present
    # Assuming 'small' means <=50K and anything else is >50K (or 'large' which isn't explicit but implied)
    y = (df_cleaned['income'] != 'small').astype(int)

# --- 2. Encode the numeric and categorical features appropriately ---

# Identify feature types
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

# Define preprocessor: Standard Scaler for numeric, One-Hot Encoder for categorical
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (none in this case, but good practice)
)

# Fit and transform the data
X_processed = preprocessor.fit_transform(X)

# Determine the number of features after one-hot encoding
input_dim = X_processed.shape[1]
print(f"Number of features after encoding: {input_dim}")

# --- 3. Split the dataset as training (80%) and test data (20%) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training data size (80%): {X_train.shape[0]}")
print(f"Test data size (20%): {X_test.shape[0]}")

# --- 4. Build an MLP model with one hidden layer ---
# A common choice for a binary classification hidden layer is a power of 2,
# or a size between the input and output size. We'll use 64 nodes.
# Output layer uses sigmoid for binary probability prediction.
# Loss is binary_crossentropy, and the metric is accuracy.
model = Sequential([
    # Hidden layer: use ReLU activation, standard for hidden layers
    Dense(64, activation='relu', input_shape=(input_dim,)),
    # Output layer: 1 unit with sigmoid activation for probability
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=[BinaryAccuracy(name='accuracy')])

print("\nModel Summary:")
model.summary()

# Train the model for 50 epochs
print("\nTraining the MLP model (50 epochs)...")
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    verbose=0, # Set to 1 or 2 to see training progress
    validation_data=(X_test, y_test)
)

# Report the accuracy on the test data after 50 epochs
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = accuracy * 100
print(f"\n--- Model Evaluation ---")
print(f"Test Data Accuracy after 50 epochs: {test_accuracy:.4f}%")
print(f"Test Data Loss after 50 epochs: {loss:.4f}")

# --- 5. Predict the probability for a new sample ---
new_sample_data = {
    'age': 42,
    'workclass': 'State-gov',
    'fnlwgt': 57223,
    'education': 'Bachelors',
    'education-num': 13,
    'marital-status': 'Never-married',
    'occupation': 'Adm-clerical',
    'relationship': 'Not-in-family',
    'race': 'White',
    'sex': 'Female',
    'capital-gain': 2714,
    'capital-loss': 0,
    'hours-per-week': 40,
    'native-country': 'United-States'
}

# Convert the new sample to a DataFrame
new_sample = pd.DataFrame([new_sample_data])

# Pre-process the new sample using the fitted preprocessor
# The column order must match the training data before processing.
# We explicitly define the columns here to ensure the order is correct.
original_columns = X.columns.tolist()
new_sample_processed = preprocessor.transform(new_sample[original_columns])

# Predict the probability
# The prediction returns the probability P(income > $50K)
probability_prediction = model.predict(new_sample_processed, verbose=0)[0][0]

print(f"\n--- New Sample Prediction ---")
print(f"New Sample Data:")
for key, value in new_sample_data.items():
    print(f"  {key}: {value}")

print(f"\nPredicted probability that the person’s income is greater than $50K/yr: {probability_prediction:.4f}")

# Optional: Predict the class (0 or 1)
predicted_class = (probability_prediction > 0.5).astype(int)
print(f"Predicted Class (1 for >$50K, 0 for <=$50K): {predicted_class}")

Original shape: (48842, 15)
Cleaned shape after dropping NaNs: (30162, 15)
Number of features after encoding: 104
Training data size (80%): 24129
Test data size (20%): 6033

Model Summary:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training the MLP model (50 epochs)...

--- Model Evaluation ---
Test Data Accuracy after 50 epochs: 84.3030%
Test Data Loss after 50 epochs: 0.3462

--- New Sample Prediction ---
New Sample Data:
  age: 42
  workclass: State-gov
  fnlwgt: 57223
  education: Bachelors
  education-num: 13
  marital-status: Never-married
  occupation: Adm-clerical
  relationship: Not-in-family
  race: White
  sex: Female
  capital-gain: 2714
  capital-loss: 0
  hours-per-week: 40
  native-country: United-States

Predicted probability that the person’s income is greater than $50K/yr: 0.0152
Predicted Class (1 for >$50K, 0 for <=$50K): 0
