In [1]:
import pathlib
import urllib
import sys
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, iqr, skew, kurtosis

# 1. Collect data

In [5]:
path = pathlib.Path('../data/adult.csv')
if path.exists():
    print('adult dataset found!')
else:
    sys.stdout.write('Downloading the adult dataset from the Internet...')
    ADULTURL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    urllib.request.urlretrieve(ADULTURL, path.absolute())
    sys.stdout.write('Done!')

adult dataset found!


# 2. Understand the data in context


In [12]:
# Load the adult dataset into a Pandas dataframe
adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', '>50K']
adult = pd.read_csv(path.absolute(),names=adult_columns)

# 3. Understand the data
### 3.1 Describe the meansing and type for each attribute
 - **age**: numeric, continuous. 
 - **workclass**: categorical, discrete.
 - **fnlwgt**: numeric, continuous
 - **education**: categorical, discrete
 - **education-num**: numeric, continuous
 - **marital-status**: categorical, discrete
 - **occupation**: categorical, discrete
 - **relationship**: categorical, discrete
 - **race**: categorical, discrete
 - **sex**: categorical, binary
 - **capital-gain**: numeric, continuous
 - **capital-loss**: numeric, continuous
 - **hours-per-week**: numeric, continuous
 - **native-country**: categorical, discrete
 - **>50k**: categorical, binary

*3.1 subsection: explanation for non-self-explanatory attributes*
 - **fnlwgt**: represents final weight. according to [2], it's the number of units that this record could represent in the 
 target population. In [1], the author explained that the final weight is controlled by three factors: a single cell estimatation
 of the population 16+ for each state; controls for Hispanic Origin by age and sex; controls by race, age and sex.
 - **education_num**: represents the number of years of education in total.[2]
 - **relationship**: represents the individual's role in it's family.[2]
 - **capital_gain** and **capital_loss**: represents the income and loss from non-salary ways, e.g. investment.[2]
 
 
### 3.2 Verify data quality
#### 3.2.1 duplication

In [14]:
# Check duplication and drop duplication via pandas built in function
check_duplication = adult.duplicated()
duplicated = adult[check_duplication]
print("There are "+str(len(duplicated.index))+" duplicated data entries found.(One copy of the data entry will be kept in the dataset)")
adult = adult.drop_duplicates()
print("After drop duplicate, there are "+str(len(adult.index))+ " data entries remains.")

There are 0 duplicated data entries found.(One copy of the data entry will be kept in the dataset)
After drop duplicate, there are 32537 data entries remains.


#### 3.2.2 missing values

In [9]:
# drop the indexes for which column has value '?'
unknown_row =[]

for i in range(len(adult.index)):
    if ('?' in adult.iloc[i]['workclass']) or ('?' in adult.iloc[i]['occupation']) or ('?' in adult.iloc[i]['native-country']):
        unknown_row.append(i)
adult = adult.drop(index = unknown_row)
adult = adult.reset_index(drop=True)

32561
30162


In [10]:
# print(set(adult['education']))

{' Some-college', ' Bachelors', ' 1st-4th', ' 10th', ' 7th-8th', ' Prof-school', ' Assoc-voc', ' 11th', ' Assoc-acdm', ' Preschool', ' 9th', ' Masters', ' 5th-6th', ' 12th', ' HS-grad', ' Doctorate'}


In [300]:
#change >50k col to binary values
for i in range(len(adult.index)):
    if '>50K' in adult.iloc[i]['>50K']:
        adult.iloc[i, adult.columns.get_loc('>50K')] = 1
    else:
        adult.iloc[i,adult.columns.get_loc('>50K')] = 0

In [303]:
adult.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,>50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


### 1. Statistics about Numeric values

In [304]:
# select all columns with numeric values
adult_numeric = adult.select_dtypes(include=np.number)

In [305]:
adult_numeric.head(5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,>50K
0,39,77516,13,2174,0,40,0
1,50,83311,13,0,0,13,0
2,38,215646,9,0,0,40,0
3,53,234721,7,0,0,40,0
4,28,338409,13,0,0,40,0


In [306]:
cnames = ['mean', 'median', 'mode', 'trimmed mean (p=20%)', 'min', 'max', 'range', 'std']
adult_numeric.agg(lambda x: pd.Series([np.mean(x), np.median(x),x.mode()[0], trim_mean(x, 0.2),x.min(),x.max(),x.max()-x.min(),x.std()], index=cnames))

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,>50K
mean,38.437902,189793.8,10.121312,1092.007858,88.372489,40.931238,0.248922
median,37.0,178425.0,10.0,0.0,0.0,40.0,0.0
mode,36.0,203488.0,9.0,0.0,0.0,40.0,0.0
trimmed mean (p=20%),37.360206,177534.8,10.024423,0.0,0.0,40.859653,0.081556
min,17.0,13769.0,1.0,0.0,0.0,1.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0
range,73.0,1470936.0,15.0,99999.0,4356.0,98.0,1.0
std,13.134665,105653.0,2.549995,7406.346497,404.29837,11.979984,0.432396


### 2. Statistics about Categorical values

In [307]:
# select all columns with categorical values
adult_categorical = adult.select_dtypes(include=['object'])

In [308]:
adult_categorical.head(5)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [309]:
cnames = adult_categorical.columns.tolist()
cnames

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [310]:
adult_categorical['workclass'].value_counts()

 Private             22286
 Self-emp-not-inc     2499
 Local-gov            2067
 State-gov            1279
 Self-emp-inc         1074
 Federal-gov           943
 Without-pay            14
Name: workclass, dtype: int64

In [311]:
adult_categorical['education'].value_counts()

 HS-grad         9840
 Some-college    6678
 Bachelors       5044
 Masters         1627
 Assoc-voc       1307
 11th            1048
 Assoc-acdm      1008
 10th             820
 7th-8th          557
 Prof-school      542
 9th              455
 12th             377
 Doctorate        375
 5th-6th          288
 1st-4th          151
 Preschool         45
Name: education, dtype: int64

In [312]:
adult['marital-status'].value_counts()

 Married-civ-spouse       14065
 Never-married             9726
 Divorced                  4214
 Separated                  939
 Widowed                    827
 Married-spouse-absent      370
 Married-AF-spouse           21
Name: marital-status, dtype: int64

In [313]:
adult_categorical['occupation'].value_counts()

 Prof-specialty       4038
 Craft-repair         4030
 Exec-managerial      3992
 Adm-clerical         3721
 Sales                3584
 Other-service        3212
 Machine-op-inspct    1966
 Transport-moving     1572
 Handlers-cleaners    1350
 Farming-fishing       989
 Tech-support          912
 Protective-serv       644
 Priv-house-serv       143
 Armed-Forces            9
Name: occupation, dtype: int64

In [314]:
adult['relationship'].value_counts()

 Husband           12463
 Not-in-family      7726
 Own-child          4466
 Unmarried          3212
 Wife               1406
 Other-relative      889
Name: relationship, dtype: int64

In [315]:
adult['race'].value_counts()

 White                 25933
 Black                  2817
 Asian-Pac-Islander      895
 Amer-Indian-Eskimo      286
 Other                   231
Name: race, dtype: int64

In [316]:
adult['sex'].value_counts()

 Male      20380
 Female     9782
Name: sex, dtype: int64

In [317]:
adult['native-country'].value_counts()

 United-States                 27504
 Mexico                          610
 Philippines                     188
 Germany                         128
 Puerto-Rico                     109
 Canada                          107
 El-Salvador                     100
 India                           100
 Cuba                             92
 England                          86
 Jamaica                          80
 South                            71
 China                            68
 Italy                            68
 Dominican-Republic               67
 Vietnam                          64
 Guatemala                        63
 Japan                            59
 Poland                           56
 Columbia                         56
 Haiti                            42
 Taiwan                           42
 Iran                             42
 Portugal                         34
 Nicaragua                        33
 Peru                             30
 Greece                           29
 

In [319]:
adult.groupby('workclass').agg(['mean', lambda x: trim_mean(x, 0.2), 'median', 'std']).T

Unnamed: 0,workclass,Federal-gov,Local-gov,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
age,mean,42.577943,41.743106,36.794355,46.027933,45.011605,39.362002,47.785714
age,<lambda_0>,42.433862,41.235294,35.464403,45.595975,44.182545,38.738622,49.1
age,median,43.0,41.0,35.0,46.0,44.0,39.0,57.0
age,std,11.53284,12.267402,12.842129,12.689685,13.364069,12.386871,21.07561
fnlwgt,mean,185717.107105,188422.886309,192719.985776,175775.219739,175222.506603,184440.42455,174267.5
fnlwgt,<lambda_0>,171778.003527,178795.264303,180036.951914,163314.314241,165205.450366,170642.979194,172164.1
fnlwgt,median,176904.0,178383.0,181059.5,164614.0,168098.0,170091.0,171531.5
fnlwgt,std,117766.713221,100654.252367,105981.547011,97355.381441,100657.849119,111202.987584,85536.385921
education-num,mean,10.948038,11.036768,9.871085,11.167598,10.211285,11.349492,9.071429
education-num,<lambda_0>,10.749559,11.099114,9.76892,11.071207,10.094604,11.240572,9.2


In [320]:
adult.cov()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,>50K
age,172.519419,-106175.3,1.457834,7797.393,319.498166,15.986877,1.374398
fnlwgt,-106175.339198,11162550000.0,-12121.42954,329877.5,-416454.241511,-28966.974882,-409.21021
education-num,1.457834,-12121.43,6.502474,2349.739,82.112167,4.659381,0.369689
capital-gain,7797.393396,329877.5,2349.738644,54853970.0,-96506.651513,7136.545913,708.37522
capital-loss,319.498166,-416454.2,82.112167,-96506.65,163457.172378,253.881355,26.231868
hours-per-week,15.986877,-28966.97,4.659381,7136.546,253.881355,143.520022,1.188729
>50K,1.374398,-409.2102,0.369689,708.3752,26.231868,1.188729,0.186966


# Reference
[1] Kaggle adult census income dataset. Last access: Sept. 2019. url: https://www.kaggle.com/uciml/adult-census-income
[2] [2] Haojun Zhu, Predicting Earning Potential using the Adult Dataset. Dec. 2016. url: https://rstudio-pubs-static.s3.amazonaws.com/235617_51e06fa6c43b47d1b6daca2523b2f9e4.html