In [0]:
import numpy as np
import pandas as pd

In [0]:
def sigmoid(z):
  """Logistic function 1 / (1 + e^-z), applied element-wise by numpy.

  Args:
    z: a scalar or array-like accepted by np.exp.

  Returns:
    The sigmoid of z, with the same shape numpy gives np.exp(-z).
  """
  return 1 / (np.exp(-z) + 1)

In [0]:
def predict(features, weights):
  """Return the model's probability for each row of `features`.

  Computes sigmoid(np.dot(features, weights)): a linear score per row
  squashed through the logistic function.
  """
  return sigmoid(np.dot(features, weights))

In [0]:
def cost_function(features, labels, weights):
  """Mean binary cross-entropy (log loss) of the current weights.

  Cost = -sum(labels*log(p) + (1 - labels)*log(1 - p)) / len(labels)
  where p = predict(features, weights).

  Args:
    features: matrix of inputs, one row per observation.
    labels: 0/1 target for each observation.
    weights: one coefficient per feature column.

  Returns:
    The average cost over all observations (non-negative).
  """
  observations = len(labels)
  Pred = predict(features, weights)

  # Keep predictions strictly inside (0, 1) so np.log never sees 0, which
  # would otherwise turn the cost into inf or NaN (0 * -inf).
  eps = 1e-15
  Pred = np.clip(Pred, eps, 1 - eps)

  # Error contributed by observations whose label is 1
  cost_label1 = -labels * np.log(Pred)

  # Error contributed by observations whose label is 0.  This term must be
  # negated as well; adding it with a positive sign cancels part of the loss.
  cost_label0 = -(1 - labels) * np.log(1 - Pred)

  total = (cost_label1 + cost_label0).sum()
  return total / observations

In [0]:
def update_weights(features, labels, weights, lr):
  """Perform one gradient-descent step on the logistic-regression weights.

  The gradient is features.T . (predictions - labels), averaged over the
  number of rows in `features` and scaled by the learning rate `lr`.

  Note: `weights` is updated in place (via `-=`) and the same object is
  returned.
  """
  n_samples = len(features)

  # Difference between predicted probabilities and the true labels
  residual = predict(features, weights) - labels

  # Average cost derivative per feature, scaled by the learning rate
  step = (np.dot(features.T, residual) / n_samples) * lr

  # Move against the gradient to reduce the cost
  weights -= step
  return weights

In [0]:
# With Sigmoid ranging from 0 to 1, the probability of 0.5 can be used as the decision boundary or threshold to determine the classes.

def decision_boundary(prob, threshold=0.5):
  """Convert a single probability into a hard class label.

  Args:
    prob: a scalar probability.
    threshold: cut-off at or above which class 1 is chosen.  Defaults to
      0.5, the value that used to be hard-coded, so existing one-argument
      calls (including use with map()) behave exactly as before.

  Returns:
    1 if prob >= threshold, otherwise 0.
  """
  return 1 if prob >= threshold else 0

In [0]:
def accuracy(predicted_labels, actual_labels):
    """Fraction of positions where the predicted label equals the actual one.

    A position counts as wrong when predicted - actual is non-zero.
    """
    delta = predicted_labels - actual_labels
    wrong = float(np.count_nonzero(delta))
    error_rate = wrong / len(delta)
    return 1.0 - error_rate

In [0]:
def train_with_file(data_file, iters, lr=0.1):
  """Fit logistic-regression weights on a headerless CSV of census-style records.

  The file is read with the 15 fixed column names listed below; the last
  column ('income') is the label and is encoded as 1 when it equals '>50K'
  and 0 otherwise.  '?' entries are treated as missing, every numerical
  column is min-max scaled, and the remaining string columns are one-hot
  encoded with pandas.get_dummies.

  Side effect: the one-hot encoded training column index is stored in the
  module-level global `clf`, which `classify` reads to align test columns.

  NOTE(review): the label comparison is an exact string match, so values
  with surrounding whitespace would be encoded as 0 - confirm against the
  data file.

  Args:
    data_file: path or buffer accepted by pandas.read_csv.
    iters: number of gradient-descent iterations to run.
    lr: learning rate for each step; defaults to 0.1, the value that used
      to be hard-coded.

  Returns:
    (weights, p) where `weights` is a 1-D numpy array with one entry per
    encoded column and `p` maps each numerical column name to the
    (min, max) pair used to scale it.
  """
  global clf

  col = ['age','workclass','fnlgwt','education','education-num','marital-status',
          'occupation','relationship','race','sex','capital-gain','capital-loss',
          'hours-per-week','native-country','income']

  Train = pd.read_csv(data_file, header=None, names=col)

  # replace() returns a new frame, so later column assignments never write
  # into a slice of Train.
  X_train = Train.iloc[:, :-1].replace('?', np.nan)
  y_train = (Train.iloc[:, -1] == '>50K').astype(int).to_numpy()

  # Give missing categorical values their own '0' category.  Assigning the
  # result back (instead of chained inplace fillna) works on every pandas
  # version, including copy-on-write mode.
  for name in ('workclass', 'occupation', 'native-country'):
    X_train[name] = X_train[name].fillna('0')

  numerical = ['age', 'fnlgwt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

  # Min-max scale EVERY numerical column before encoding and training, and
  # remember the bounds so the same scaling can be applied to test data.
  p = {}
  for name in numerical:
    column = X_train[name]
    # .item() keeps the stored bounds as plain Python numbers.
    low, high = column.min().item(), column.max().item()
    p[name] = (low, high)
    span = high - low
    # A constant column carries no information; map it to 0 rather than 0/0.
    X_train[name] = (column - low) / span if span else 0.0

  # dtype=float keeps the whole design matrix numeric; bool dummies mixed
  # with float columns would become an object array that np.exp rejects.
  X_train = pd.get_dummies(X_train, dtype=float)
  clf = X_train.columns

  features = X_train.to_numpy(dtype=float)
  weights = np.random.rand(features.shape[1])

  for _ in range(iters):
    weights = update_weights(features, y_train, weights, lr)

  return weights, p

In [0]:
def classify(data_file, weights, normalization_params):
  """Predict 0/1 income labels for the records in a CSV file.

  The first line of the file is skipped and the remaining rows are read with
  the 15 fixed column names listed below; the last column is dropped.  The
  rows are then prepared the same way as the training data: '?' is treated
  as missing, numerical columns are scaled with `normalization_params`, and
  string columns are one-hot encoded.

  Relies on the module-level global `clf` (the encoded training columns set
  by `train_with_file`); it must exist before this function is called.

  Args:
    data_file: path or buffer accepted by pandas.read_csv.
    weights: trained weight vector, one entry per column in `clf`.
    normalization_params: mapping from numerical column name to the
      (min, max) pair used for min-max scaling.

  Returns:
    A numpy array with one 0/1 prediction per row.
  """
  col = ['age','workclass','fnlgwt','education','education-num','marital-status',
          'occupation','relationship','race','sex','capital-gain','capital-loss',
          'hours-per-week','native-country','income']

  Test = pd.read_csv(data_file, names=col, skiprows=1)

  # replace() returns a new frame, so later column assignments never write
  # into a slice of Test.
  X_test = Test.iloc[:, :-1].replace('?', np.nan)

  # Assign the filled column back instead of chained inplace fillna, which
  # is unreliable on a slice and a no-op under pandas copy-on-write.
  for name in ('workclass', 'occupation', 'native-country'):
    X_test[name] = X_test[name].fillna('0')

  numerical = ['age', 'fnlgwt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

  for name in numerical:
    bounds = normalization_params[name]
    low, high = bounds[0], bounds[1]
    span = high - low
    # Guard against a zero range so a constant feature cannot divide by zero.
    X_test[name] = (X_test[name] - low) / span if span else 0.0

  # dtype=float keeps the matrix numeric; bool dummies mixed with float
  # columns would become an object array that np.exp rejects.
  X_test = pd.get_dummies(X_test, dtype=float)

  # Align with the training layout in one step: categories unseen in the
  # test file become all-zero columns, extra ones are dropped, order matches.
  X_test = X_test.reindex(columns=clf, fill_value=0.0)

  probabilities = predict(X_test.to_numpy(dtype=float), weights)
  return np.array([decision_boundary(prob) for prob in probabilities])
    

In [0]:
def sample_main():
  """Example run: train on 'adult-training.csv' for 1000 iterations, then
  classify 'adult-test.csv' with the learned weights and scaling bounds.

  The predicted labels are computed but not returned.
  """
  learned_weights, scaling_bounds = train_with_file('adult-training.csv', 1000)
  predicted = classify('adult-test.csv', learned_weights, scaling_bounds)