# Dependencies

In [3]:
# array processing for numbers, strings, records, objects
import numpy as np

# high performance, data structures and data analysis tools
import pandas as pd

# publication quality figures in python
import matplotlib.pyplot as plt
%matplotlib inline

# statistical data visualisation
import seaborn as sns

# generate pseudo-random numbers
import random

# pretty print arbitrary data structures
from pprint import pprint

# Load Data
This `adult.csv` was made from the `adult.data` file using Visual Studio Code: trailing spaces after commas were removed and a header row was added.

In [4]:
# read the prepared CSV (path is relative to the notebook's working directory)
# into a DataFrame; every later cell works from this `data` variable
data = pd.read_csv('../data/adult.csv')

# Dataset Information
A short summary of the dataset's attributes.

In [5]:
# dataset information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
class             32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


# Sample Data - Before Preparation
This is a sample of the data before any processing has been applied.

In [6]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Prepare Data
We must prepare the data for processing before prediction by:

### Initial Preparation
* Remove rows that have ? values
* Drop redundant columns
* Make class column have numeric values

In [7]:
# remove question mark rows (a '?' in any column disqualifies the whole row)
data = data[(data.astype(str) != '?').all(axis = 1)]

# drop education number - since it maps to education
data = data.drop('education-num', axis = 1)

# make class have numeric values, 1 (>50k) : 0 (<=50k)
# a vectorised substring test replaces the row-wise apply, and building the
# new frame with assign (instead of writing a column into the filtered slice)
# avoids pandas' SettingWithCopyWarning
is_over_50k = data['class'].str.contains('>50K', regex = False)

# swap the alphabetic class column for the numeric one, kept as the last column
data = data.drop(columns = ['class']).assign(**{'class': is_over_50k.astype('int64')})

# drop unnecessary attributes
# data = data.drop(columns = ['fnlwgt', 'capital-gain', 'capital-loss', 'native-country'])

### Further Preparation
* Check Best Feature using entropy and gain
* Drop columns with lowest gain that would lead to over-fitting

In [8]:
# NumPy view of the prepared frame's values (one row per record);
# NOTE(review): not referenced by any other cell visible here - confirm it is still needed
data_array = data.values

# Sample Data - After Preparation
This is a sample of the data after the initial preparation has been applied.

In [9]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


# Split Data
We split the data into training and test sets at random, so that both sets are drawn fairly from the whole dataset.

In [10]:
# function that splits data
def data_split(data, size, seed = 0):
    """Randomly split a data frame into training and test partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected by index label.
    size : int or float
        Number of test rows. A value strictly between 0 and 1 is treated as
        a proportion of ``len(data)`` and truncated to a whole row count.
    seed : int or None, optional
        Seed passed to ``random.seed`` before sampling so the split is
        reproducible (default 0, the value that used to be hard-coded).
        Pass ``None`` to leave the global random state untouched.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        ``test_data`` holds the sampled rows, ``train_data`` the remainder.

    Raises
    ------
    ValueError
        If ``size`` is negative, not a whole number, or larger than the
        number of rows in ``data``.
    """
    total_rows = len(data)

    # account for proportional values
    if 0 < size < 1:
        size = int(size * total_rows)

    # fail early with a clear message instead of an opaque error from random.sample
    if size != int(size) or not 0 <= size <= total_rows:
        raise ValueError(
            'size must be a fraction in (0, 1) or a whole number between 0 and {}, got {!r}'
            .format(total_rows, size)
        )
    size = int(size)

    # index of row in data
    indices = data.index.tolist()

    # set seed to generate same random data
    if seed is not None:
        random.seed(seed)

    # get random number of indices
    test_indices = random.sample(indices, size)

    # create testing data frame
    test_data = data.loc[test_indices]

    # create training data from every row that was not sampled
    train_data = data.drop(test_indices)

    return train_data, test_data

# set train and test data: 0.2 is treated by data_split as a proportion,
# so 20% of the prepared rows are held out for testing
train_data, test_data = data_split(data, size = 0.2)

In [11]:
train_data.values

array([[39, 'State-gov', 77516, ..., 40, 'United-States', 0],
       [50, 'Self-emp-not-inc', 83311, ..., 13, 'United-States', 0],
       [53, 'Private', 234721, ..., 40, 'United-States', 0],
       ...,
       [58, 'Private', 151910, ..., 40, 'United-States', 0],
       [22, 'Private', 201490, ..., 20, 'United-States', 0],
       [52, 'Self-emp-inc', 287927, ..., 40, 'United-States', 1]],
      dtype=object)

# Information Gain
We want the feature **F** which gives the largest reduction in entropy over data.

### Attribute
Get unique values of feature **F**

In [12]:
# distinct values taken by the 'age' feature (np.unique returns them sorted)
np.unique(data['age'])

array([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 88, 90], dtype=int64)

### Feature
Several functions to help select feature
* Get all rows of feature and class
* Get only certain value rows of feature and class

In [13]:
# function that gets a data frame with feature and class
def feature_and_class(feature, data_set = None):
    """Return a two-column frame holding one feature alongside the class label.

    Parameters
    ----------
    feature : column label
        Name of the feature column to select, e.g. ``'age'``.
    data_set : pandas.DataFrame, optional
        Frame to select from; defaults to the notebook-level ``data``.

    Returns
    -------
    pandas.DataFrame
        All rows of ``data_set`` restricted to ``[feature, 'class']``.
    """
    if data_set is None:
        data_set = data

    # use the label as given: wrapping it in literal quote characters made
    # .loc look up "'age'" instead of "age" and raise KeyError
    return data_set.loc[:, [feature, 'class']]

In [14]:
# rows whose age equals 39
# NOTE(review): the name `black` does not describe these rows, and it is
# reassigned by a later cell - consider renaming
black = data[data['age'] == 39]

# NOTE(review): `value` is not referenced by any other cell visible here
value = 29

# show every row whose education level is 'Bachelors'
data[data["education"] == 'Bachelors']

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
9,42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1
11,30,State-gov,141297,Bachelors,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,1
12,23,Private,122272,Bachelors,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0
25,56,Local-gov,216851,Bachelors,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,1
32,45,Private,386940,Bachelors,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40,United-States,0
41,53,Self-emp-not-inc,88506,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,0
42,24,Private,172987,Bachelors,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,50,United-States,0


In [30]:
# keep only the rows labelled >50K (class == 1)
black = data.loc[data['class'] == 1]

# np.unique hands back (distinct labels, their counts); only the counts matter here
_, awe = np.unique(black['class'].values, return_counts = True)

# number of rows in the >50K class
print(awe[0])

7508


In [None]:
# get certain rows and the class probabilities within them
def probability_of_feature_value(value, feature, data_set = None):
    """Estimate the class distribution among rows where ``feature == value``.

    Parameters
    ----------
    value : scalar
        Feature value to condition on.
    feature : column label
        Name of the feature column.
    data_set : pandas.DataFrame, optional
        Frame to analyse; must contain ``feature`` and a 0/1 ``'class'``
        column. Defaults to the notebook-level ``data``.

    Returns
    -------
    (prob_of_0, prob_of_1) : tuple of float
        Share of matching rows with class 0 and class 1 respectively.
        ``(0.0, 0.0)`` is returned when no row matches, instead of dividing
        by zero.
    """
    if data_set is None:
        data_set = data

    # class labels of the rows that carry the requested feature value
    labels = data_set.loc[data_set[feature] == value, 'class']
    total = len(labels)

    # nothing to condition on: avoid ZeroDivisionError
    if total == 0:
        return 0.0, 0.0

    # count each class directly rather than via np.unique on a sub-frame
    prob_of_0 = (labels == 0).sum() / total
    prob_of_1 = (labels == 1).sum() / total

    return prob_of_0, prob_of_1

In [None]:
# class distribution for every distinct value of every feature
data_no_last_column = data.drop('class', axis = 1)

per_feature_tables = []
for i in data_no_last_column.columns:
    # class is 0/1, so its mean within a group is the share of class-1 rows;
    # one groupby per feature replaces one full scan of the frame per value
    prob_of_1 = data.groupby(i)['class'].mean()
    per_feature_tables.append(pd.DataFrame({
        'feature': i,
        'value': prob_of_1.index,
        'prob_of_0': 1 - prob_of_1.values,
        'prob_of_1': prob_of_1.values,
    }))

# a single tidy table instead of one printed line per feature value
feature_value_probabilities = pd.concat(per_feature_tables, ignore_index = True)
feature_value_probabilities.head()

### Class
Determine the class of feature

In [17]:
# function that determines class and returns data frames
def determine_class(data_set):
    """Partition a data set by its class label.

    Returns a pair of frames: first the rows whose 'class' is 0 (<=50k),
    then the rows whose 'class' is 1 (>50k).
    """
    labels = data_set['class']

    # rows at or below the 50k threshold
    low_income_rows = data_set.loc[labels == 0]

    # rows above the 50k threshold
    high_income_rows = data_set.loc[labels == 1]

    return low_income_rows, high_income_rows

### Entropy
Used to measure how good a certain feature is

In [18]:
def calculate_entroy():
    """Placeholder for the entropy calculation; it currently only prints a marker string.

    NOTE(review): no entropy is computed yet, and the name looks like a
    misspelling of "calculate_entropy" - left unchanged here so existing
    calls keep working.
    """
    print('awe')