In [1]:
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
import os

In [6]:
# URLs of the two raw files to download: `adult.data` and `adult.test`, both
# located in the same `adult/` directory of the UCI archive. Used as the
# default `urls` argument of download_data().
CENSUS_DATASET = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
)

In [8]:
def download_data(path='data', urls=CENSUS_DATASET):
    """Download every URL in ``urls`` into the directory ``path``.

    Each file is saved under the last path component of its URL
    (``os.path.basename(url)``). An existing file with the same name is
    overwritten.

    Parameters
    ----------
    path : str
        Target directory. Created (including missing parents) if absent.
    urls : iterable of str
        URLs to fetch; defaults to ``CENSUS_DATASET``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status for any URL.
    """
    if not os.path.exists(path):
        # makedirs (rather than mkdir) so a nested target such as 'data/raw' works.
        os.makedirs(path)

    for url in urls:
        response = requests.get(url)
        # Fail loudly instead of saving an HTTP error page as if it were data.
        response.raise_for_status()
        name = os.path.basename(url)
        # response.content is raw bytes: the file must be opened in binary
        # mode ('wb'). Text mode raises TypeError on Python 3 and may rewrite
        # line endings on Windows.
        with open(os.path.join(path, name), 'wb') as f:
            f.write(response.content)

In [9]:
# Fetch with the defaults: every URL in CENSUS_DATASET is saved into 'data'.
# Note that this issues the HTTP requests each time the cell is run.
download_data()

In [3]:
# Column labels for the downloaded files, in file order; passed as `names=`
# to pd.read_csv when the two files are loaded.
cols = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 
        'marital-status', 'occupation', 'relationship', 'race', 'gender', 
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

In [4]:
# Load both files using the explicit labels in `cols`; because `names` is
# given, pandas treats every line as data rather than as a header row.
# NOTE(review): string fields keep the space that follows each comma, which is
# why later cells compare against values such as ' ?' and ' <=50K'.
df_train = pd.read_csv('data/adult.data', sep=",", names = cols)
df_test = pd.read_csv('data/adult.test', sep=",", names = cols)

In [5]:
# Bring the income labels of both frames to the same spelling. The test file
# writes them with a leading space and a trailing dot, the training file with
# a leading space only. Values not listed in a mapping are left untouched.
df_test['income'] = df_test['income'].replace({' <=50K.': '<=50K', ' >50K.': '>50K'})
df_train['income'] = df_train['income'].replace({' <=50K': '<=50K', ' >50K': '>50K'})

In [6]:
# Stack the training rows on top of the test rows (row-wise concatenation is
# the default axis). No `ignore_index` is passed, so each frame's original
# row labels are retained in the combined frame.
data = pd.concat([df_train, df_test])

In [12]:
# Partition the column names by dtype: columns stored as 'object' are treated
# as categorical, every other column as numerical.
categorical_columns = [x for x in data.columns if data[x].dtype.name == 'object']
numerical_columns = [x for x in data.columns if data[x].dtype.name != 'object']
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(categorical_columns)
print(numerical_columns)

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [9]:
# Discard every row in which workclass, occupation or native-country holds
# the ' ?' placeholder (note the leading space). A row is kept only when none
# of the three columns equals it.
data = data[~(data[['workclass', 'occupation', 'native-country']] == ' ?').any(axis=1)]

In [10]:
# Show the distinct values of each categorical column, one array per column,
# in the order of `categorical_columns`.
for x in categorical_columns:
    # Function-call form of print: identical output on Python 2, and valid
    # syntax on Python 3 (the `print x` statement is not).
    print(data[x].unique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay']
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' 7th-8th' ' Doctorate' ' Assoc-voc' ' Prof-school'
 ' 5th-6th' ' 10th' ' Preschool' ' 12th' ' 1st-4th']
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Transport-moving' ' Farming-fishing'
 ' Machine-op-inspct' ' Tech-support' ' Craft-repair' ' Protective-serv'
 ' Armed-Forces' ' Priv-house-serv']
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
[' Male' ' Female']
[' United-States' ' Cuba' ' Jamaica' ' India' ' Mexico' ' Puerto-Rico'
 ' Honduras' ' England' ' Canada' ' Germany' ' Iran' ' Philippines'
 ' Poland' ' Colu

In [11]:
# Pairwise correlation of the numeric columns. The numeric columns are
# selected explicitly because `DataFrame.corr()` on pandas >= 2.0 no longer
# drops string columns silently and raises on them instead; older versions
# give the same table either way.
data.select_dtypes(include=['number']).corr()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
age,1.0,-0.075792,0.037623,0.079683,0.059351,0.101992
fnlwgt,-0.075792,1.0,-0.041993,-0.00411,-0.004349,-0.018679
educational-num,0.037623,-0.041993,1.0,0.126907,0.081711,0.146206
capital-gain,0.079683,-0.00411,0.126907,1.0,-0.032102,0.08388
capital-loss,0.059351,-0.004349,0.081711,-0.032102,1.0,0.054195
hours-per-week,0.101992,-0.018679,0.146206,0.08388,0.054195,1.0


In [None]:
col1 = 