<a href="https://colab.research.google.com/github/Sreyareddy13/DL-CA1/blob/main/CA2_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import balanced_accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import h5py
import joblib
# Load and prepare data
df = pd.read_csv('Group A Dataset.csv')
df['label'] = df['label'].str.replace('.', '')  # Clean labels

# Remove specified columns
df = df.drop(['fnlwgt', 'education'], axis=1)

# Separate features and target
X = df.drop('label', axis=1)
y = df['label'].map({'<=50K': 0, '>50K': 1})  # Binary encoding

# Identify categorical and numerical columns
cat_cols = ['workclass', 'marital_status', 'occupation',
            'relationship', 'race', 'sex', 'native_country']
num_cols = ['age', 'education_num', 'capital_gain',
            'capital_loss', 'hour_per_week']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

# Preprocess data
X_processed = preprocessor.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42)

# Calculate maximum parameters (1000)
# Architecture designed to stay under 1000 parameters
n_features = X_train.shape[1]
hidden1_units = 8  # (n_features+1)*8 < 500
hidden2_units = 4  # (8+1)*4 = 36
output_units = 1   # (4+1)*1 = 5
total_params = (n_features+1)*hidden1_units + (hidden1_units+1)*hidden2_units + (hidden2_units+1)*output_units
print(f"Total trainable parameters: {total_params} (must be <= 1000)")

# Build neural network with L2 regularization
model = Sequential([
    Dense(hidden1_units, activation='relu',
          input_shape=(n_features,),
          kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(hidden2_units, activation='relu',
          kernel_regularizer=l2(0.001)),
    Dropout(0.2),
    Dense(output_units, activation='sigmoid')
])

# Compile with AdamW optimizer
optimizer = AdamW(learning_rate=0.001, weight_decay=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train with early stopping
history = model.fit(X_train, y_train,
                    epochs=100,
                    batch_size=128,
                    validation_split=0.2,
                    verbose=1)

model.save('income_classifier_model.h5')

# Save preprocessor
import joblib
joblib.dump(preprocessor, 'preprocessor.pkl')

# Test script would look like this:
def load_and_test(model_path, test_data_path):
    # Load model
    model = tf.keras.models.load_model(model_path)

    # Load and preprocess test data (same preprocessing as training)
    df_test = pd.read_csv(test_data_path)
    df_test['label'] = df_test['label'].str.replace('.', '')
    X_test = df_test.drop(['label', 'fnlwgt', 'education'], axis=1)
    y_test = df_test['label'].map({'<=50K': 0, '>50K': 1})

    # Preprocess
    X_test_processed = preprocessor.transform(X_test)

    # Predict and calculate balanced accuracy
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.4f}")
# Example usage:
# bal_acc = load_and_test('income_predictor.h5', 'test_data.csv')

Total trainable parameters: 769 (must be <= 1000)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.7159 - loss: 0.6310 - val_accuracy: 0.8288 - val_loss: 0.4304
Epoch 2/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8032 - loss: 0.4581 - val_accuracy: 0.8501 - val_loss: 0.3627
Epoch 3/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8248 - loss: 0.4115 - val_accuracy: 0.8552 - val_loss: 0.3488
Epoch 4/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8344 - loss: 0.3884 - val_accuracy: 0.8570 - val_loss: 0.3429
Epoch 5/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8306 - loss: 0.3905 - val_accuracy: 0.8577 - val_loss: 0.3393
Epoch 6/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8333 - loss: 0.3797 - val_accuracy: 0.8567 - val_loss: 0.3369
Epoch 7/100
[1m123/1



In [17]:
y_pred = (model.predict(X_test) > 0.5).astype(int)
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.4f}")

[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Balanced Accuracy: 0.7669
