In [40]:
import pathlib
import urllib
import sys
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, iqr, skew, kurtosis

In [41]:
path = pathlib.Path('../data/adult.csv')
if path.exists():
    print('adult dataset found!')
else:
    sys.stdout.write('Downloading the adult dataset from the Internet...')
    ADULTURL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    urllib.request.urlretrieve(ADULTURL, path.absolute())
    sys.stdout.write('Done!')

Downloading the adult dataset from the Internet...Done!

In [42]:
# Load the adult dataset into a Pandas dataframe
adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', '>50K, <=50K']
adult = pd.read_csv(path.absolute(),names=adult_columns)

In [43]:
adult.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,">50K, <=50K"
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## 1. Statistics about Numeric values

In [30]:
# select all columns with numeric values
adult_numeric = adult.select_dtypes(include=np.number)

In [31]:
adult_numeric.head(5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [32]:
cnames = ['mean', 'median', 'mode', 'trimmed mean (p=20%)', 'min', 'max', 'range', 'std']
adult_numeric.agg(lambda x: pd.Series([np.mean(x), np.median(x),x.mode()[0], trim_mean(x, 0.2),x.min(),x.max(),x.max()-x.min(),x.std()], index=cnames))

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
median,37.0,178356.0,10.0,0.0,0.0,40.0
mode,36.0,123011.0,9.0,0.0,0.0,40.0
trimmed mean (p=20%),37.348518,177541.6,9.996878,0.0,0.0,40.590725
min,17.0,12285.0,1.0,0.0,0.0,1.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0
range,73.0,1472420.0,15.0,99999.0,4356.0,98.0
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429


## 2. Statistics about Categorical values

In [33]:
# select all columns with categorical values
adult_categorical = adult.select_dtypes(include=['object'])

In [34]:
adult_categorical.head(5)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,">50K, <=50K"
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [39]:
# cnames = adult_categorical.columns.tolist()
# cnames
adult_categorical.loc['?' in adult_categorical['workclass']]
# for i in range(len(adult_categorical.index)):
#     if '?' in adult_categorical.iloc[i]['workclass']:
#         print(adult_categorical.iloc[i])

KeyError: False

In [36]:
for cname in cnames:
    
    pass

In [37]:

pass