In [1]:
import numpy as np
import pandas as pd

# Show up to 100 columns when rendering wide DataFrames.
# Use the registered option key: the dotted spelling "display.max.columns" is
# only resolved through pandas' regex fallback (where "." matches "_").
pd.set_option("display.max_columns", 100)
# to draw pictures in jupyter notebook
%matplotlib inline
# we don't like warnings
# you can comment the following 2 lines if you'd like to
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

In [2]:
# Folder holding the course datasets, relative to this notebook.
data_path = "../mlcourse.ai_Dataset/"

# Read the adult census CSV into the frame every later cell works on.
adult_csv = f"{data_path}adult.data.csv"
data = pd.read_csv(adult_csv)

# Preview the first rows.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
### Task 1
# Number of rows for each value of the `sex` column.

sex_counts = data['sex'].value_counts()

print(f'The count of male and female are: \n {sex_counts}')

The count of male and female are: 
 sex
Male      21790
Female    10771
Name: count, dtype: int64


In [4]:
### Task 2
# Mean of `age` over the rows whose `sex` is 'Female', shown with two decimals.

# The value is computed outside the f-string: reusing the enclosing quote
# character inside an f-string expression only parses on Python 3.12+.
is_female = data['sex'] == 'Female'
female_mean_age = data.loc[is_female, 'age'].mean()

print(f'The average age of woman: {female_mean_age:.2f}')

The average age of woman: 36.86


In [5]:
### Task 3
# Percentage of rows whose `native-country` is 'Germany'.

# Relative frequency of every country (fractions summing to 1).
country_share = data['native-country'].value_counts(normalize=True)

# .get() yields 0.0 instead of raising KeyError when the country is absent.
# Computed outside the f-string because nesting the same quote character
# inside an f-string expression only parses on Python 3.12+.
german_pct = country_share.get('Germany', 0.0) * 100

print(f'The distribution of Germans citizens: {german_pct:.2f}%')

The distribution of Germans citizens: 0.42%


In [6]:
### Task 4-5
# Mean (rounded to a whole number) and standard deviation of `age`,
# reported separately for the '>50K' and '<=50K' salary groups.

age_gt_50k = data.loc[data['salary'] == '>50K', 'age']
age_lte_50k = data.loc[data['salary'] == '<=50K', 'age']

# Same report line for both groups; only the label and the ages differ.
for salary_label, ages in (('>50k', age_gt_50k), ('<=50k', age_lte_50k)):
    mean_age = round(ages.mean(), 0)
    print(f'For adults who earn {salary_label} the mean age is: {mean_age} and std: {ages.std()}')

For adults who earn >50k the mean age is: 44.0 and std: 10.519027719851826
For adults who earn <=50k the mean age is: 37.0 and std: 14.02008849082488


In [7]:
### Task 6
# Check whether every education level found among '>50K' earners belongs to
# the list of degrees below.

hs_education = [
    'Bachelors',
    'Prof-school',
    'Assoc-acdm',
    'Assoc-voc',
    'Masters',
    'Doctorate',
]

high_earners = data['salary'] == ">50K"
unique_education = data.loc[high_earners, 'education'].unique().tolist()

# True only when no education level falls outside `hs_education`.
only_listed_degrees = set(unique_education).issubset(hs_education)

print(f'Do people with salary >50K have only these degrees? {only_listed_degrees}')

Do people with salary >50K have only these degrees? False


In [8]:
### Task 7
# Descriptive statistics of `age` for every (race, sex) pair, followed by the
# maximum age within the ('Amer-Indian-Eskimo', 'Male') group.

groupby = data.groupby(['race', 'sex'])['age']
age_summary = groupby.describe()

print(f'Age statistic for each race and gender:\n {age_summary}')

eskimo_men_key = ('Amer-Indian-Eskimo', 'Male')
eskimo_men_ages = groupby.get_group(eskimo_men_key)
max_male_eskimo = eskimo_men_ages.max()

print(f'The maximum age of men of "Amer-Indian-Eskimo": {max_male_eskimo}')

Age statistic for each race and gender:
                              count       mean        std   min   25%   50%  \
race               sex                                                       
Amer-Indian-Eskimo Female    119.0  37.117647  13.114991  17.0  27.0  36.0   
                   Male      192.0  37.208333  12.049563  17.0  28.0  35.0   
Asian-Pac-Islander Female    346.0  35.089595  12.300845  17.0  25.0  33.0   
                   Male      693.0  39.073593  12.883944  18.0  29.0  37.0   
Black              Female   1555.0  37.854019  12.637197  17.0  28.0  37.0   
                   Male     1569.0  37.682600  12.882612  17.0  27.0  36.0   
Other              Female    109.0  31.678899  11.631599  17.0  23.0  29.0   
                   Male      162.0  34.654321  11.355531  17.0  26.0  32.0   
White              Female   8642.0  36.811618  14.329093  17.0  25.0  35.0   
                   Male    19174.0  39.652498  13.436029  17.0  29.0  38.0   

                      

In [9]:
### Task 8
# Compare the share of '>50K' earners between married and non-married men.

married = ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']


def share_above_50k(frame):
    """Return the fraction of rows in `frame` whose `salary` is '>50K'."""
    return frame['salary'].value_counts(normalize=True)['>50K']


# Row masks shared by both subsets.
is_male = data['sex'] == 'Male'
is_married = data['marital-status'].isin(married)

married_men = data[is_married & is_male]
non_married_men = data[~is_married & is_male]

salary_gt_50K_married_men = share_above_50k(married_men)
salary_gt_50K_non_married_men = share_above_50k(non_married_men)

# Pick the sentence matching whichever group has the larger share
# (a tie falls through to the "not married" wording).
if salary_gt_50K_married_men > salary_gt_50K_non_married_men:
    verdict = f'Men who are married tend to earn more: {salary_gt_50K_married_men} vs who is not married: {salary_gt_50K_non_married_men}'
else:
    verdict = f'Men who are not married tend to earn more: {salary_gt_50K_non_married_men} vs who is married: {salary_gt_50K_married_men}'

print(verdict)


Men who are married tend to earn more: 0.4405139945351156 vs who is not married: 0.08449509031397745


In [10]:
### Task 9
# Largest `hours-per-week` value, how many rows have it, and what percentage
# of those rows has a `salary` of '>50K'.

max_work_hours = data['hours-per-week'].max()

people_who_work_max_hours = data[data['hours-per-week']==max_work_hours]

# Relative frequency of each salary label among the max-hours rows.
distribution_of_salary = people_who_work_max_hours['salary'].value_counts(normalize=True)

# .get() reports 0% instead of raising KeyError when nobody in this subset
# earns '>50K'. Computed outside the f-string because nesting the same quote
# character inside an f-string expression only parses on Python 3.12+.
gt_50k_pct = distribution_of_salary.get('>50K', 0.0) * 100

print(f'The maximum # of hours of work per week is: {max_work_hours}')

print(f'People who work such # of hours {people_who_work_max_hours.shape[0]}')

print(f'Percentage of those who earn ">50K" {gt_50k_pct:.2f}%')

The maximum # of hours of work per week is: 99
People who work such # of hours 85
Percentage of those who earn ">50K" 29.41%


In [11]:
### Task 10
# Mean `hours-per-week` for every (native-country, salary) pair, then the
# two salary rows for Japan.

# Despite its name, `groupby` holds the resulting Series of means
# (indexed by native-country and salary), not a GroupBy object.
groupby = data.groupby(by=['native-country', 'salary'])['hours-per-week'].mean()

print(f'Count of average work hours based on "native-country" and their "salary":\n {groupby}\n')

# Selecting on the first index level leaves a Series indexed by salary.
# Done outside the f-string: nesting the same quote character inside an
# f-string expression only parses on Python 3.12+.
japan_hours = groupby.loc['Japan']

print(f'Data for "Japan": \n{japan_hours}')

Count of average work hours based on "native-country" and their "salary":
 native-country  salary
?               <=50K     40.164760
                >50K      45.547945
Cambodia        <=50K     41.416667
                >50K      40.000000
Canada          <=50K     37.914634
                            ...    
United-States   >50K      45.505369
Vietnam         <=50K     37.193548
                >50K      39.200000
Yugoslavia      <=50K     41.600000
                >50K      49.500000
Name: hours-per-week, Length: 82, dtype: float64

Data for "Japan": 
salary
<=50K    41.000000
>50K     47.958333
Name: hours-per-week, dtype: float64
