In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [None]:
# Load data
# Neither file carries a header row, so pandas labels the columns 0, 1, 2, ...
data_train = pd.read_csv('/mnt/data/adult.data', header=None)
# The first line of the test file is skipped before parsing.
data_test = pd.read_csv('/mnt/data/adult.test', skiprows=1, header=None)

# Stack both frames row-wise under a fresh 0..n-1 index so that they can be
# preprocessed together.
data = pd.concat((data_train, data_test), ignore_index=True)


In [None]:
# Data preprocessing
# The last column is treated as the target; every other column is a feature.
target_col = data.columns[-1]

# Normalise the label text before encoding so that spelling variants of one
# class (surrounding whitespace, a trailing '.') are not encoded as separate
# classes.
# NOTE(review): the UCI Adult test file is reported to write its labels with a
# trailing '.', unlike the train file -- confirm against the raw files. The
# logistic-regression code further down expects exactly two classes (0 / 1).
label_text = data[target_col].astype(str).str.strip().str.rstrip('.')
label_enc = LabelEncoder()
# Replace the whole column (rather than assigning through .iloc) so that it
# ends up with an integer dtype instead of staying an object column.
data[target_col] = label_enc.fit_transform(label_text)

# One-hot encode the non-numeric feature columns. StandardScaler only accepts
# numeric input, so raw string categories would make fit_transform fail.
feature_frame = pd.get_dummies(data.drop(columns=[target_col]), dtype=float)
data_encoded = pd.concat([feature_frame, data[target_col]], axis=1)

# Random 80/20 split of the combined rows. Note that this does NOT reproduce
# the original train-file / test-file boundary.
train_data, test_data = train_test_split(data_encoded, test_size=0.2, random_state=42)

# Feature scaling: fit the scaler on the training rows only and reuse those
# statistics for the test rows. Plain float arrays are passed so the mixed
# int/str column labels produced by get_dummies never reach scikit-learn.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_data.iloc[:, :-1].to_numpy(dtype=float))
test_features = scaler.transform(test_data.iloc[:, :-1].to_numpy(dtype=float))

# Targets as plain integer arrays (the last column of each split).
train_labels = train_data.iloc[:, -1].to_numpy(dtype=int)
test_labels = test_data.iloc[:, -1].to_numpy(dtype=int)


In [None]:
# Model utilities
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated element-wise.

    The value is computed as exp(-log(1 + exp(-z))) through ``np.logaddexp``.
    The naive form evaluates ``exp(-z)`` directly, which overflows to ``inf``
    (with a RuntimeWarning) for large negative ``z``; this form does not.

    Parameters
    ----------
    z : scalar or array-like of numbers
        Input value(s), e.g. the logits ``X @ weights``.

    Returns
    -------
    numpy scalar or ndarray
        Values in [0, 1] with the same shape as ``z``.
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)), computed
    # without ever materialising exp(-z) on its own.
    return np.exp(-np.logaddexp(0.0, np.negative(z)))

def compute_cost(X, y, weights):
    """Mean binary cross-entropy of a logistic model on (X, y).

    Mathematically this equals
    ``-1/m * sum(y*log(p) + (1-y)*log(1-p))`` with ``p = sigmoid(X @ weights)``,
    but it is evaluated from the logits as ``log(1 + e^z) - y*z``. That form
    stays finite when the predictions saturate to exactly 0 or 1, where the
    textbook expression hits ``log(0)`` and yields ``inf`` / ``nan``.

    Parameters
    ----------
    X : array of shape (m, n)
        Feature matrix.
    y : array-like of shape (m,)
        Binary targets (0 or 1).
    weights : array of shape (n,)
        Model coefficients.

    Returns
    -------
    numpy float
        Average loss over the ``m`` samples.
    """
    m = len(y)
    # Coerce targets to float so object/int label arrays behave uniformly.
    targets = np.asarray(y, dtype=float)
    logits = np.dot(X, weights)
    # log(1 + e^z) - y*z == -(y*log(s) + (1-y)*log(1-s)) with s = sigmoid(z).
    per_sample = np.logaddexp(0.0, logits) - targets * logits
    return np.sum(per_sample) / m

def gradient_descent(X, y, weights, learning_rate, iterations):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : array of shape (m, n)
        Feature matrix.
    y : array-like of shape (m,)
        Binary targets (0 or 1).
    weights : array-like of shape (n,)
        Initial coefficients. The caller's array is NOT modified; the
        optimisation runs on a float copy.
    learning_rate : float
        Step size applied to the averaged gradient.
    iterations : int
        Number of full-batch update steps.

    Returns
    -------
    (weights, cost_history)
        The optimised weight vector and a list holding the cost measured
        after each update (``iterations`` entries).
    """
    m = len(y)
    # Float copies: keeps the caller's initial weights untouched and lets the
    # in-place update below work even for integer weights or object-dtype
    # label arrays.
    weights = np.array(weights, dtype=float)
    targets = np.asarray(y, dtype=float)
    step = learning_rate / m  # loop-invariant scale of the summed gradient
    cost_history = []
    for _ in range(iterations):
        residuals = sigmoid(np.dot(X, weights)) - targets
        weights -= step * np.dot(X.T, residuals)
        # Record the cost at the freshly updated weights.
        cost_history.append(compute_cost(X, targets, weights))
    return weights, cost_history
