In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): installs a global 'ignore' filter with no category or module
# restriction, so every warning raised after this cell (including pandas
# deprecation/future warnings) is suppressed. Consider narrowing the filter.
warnings.filterwarnings('ignore') 

In [18]:
# Column labels applied to the 'adult.data' CSV, in file order.
columns = [
    'age', 'workclass', 'nn', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain',
    'capital_loss', 'hrs_week', 'native_country', 'salary',
]
data = pd.read_csv('adult.data', names=columns)

# Trim surrounding whitespace from every text (object-dtype) column so that
# later equality comparisons such as data.sex == 'Male' match exactly.
text_cols = data.select_dtypes(['object']).columns
data[text_cols] = data[text_cols].apply(lambda col: col.str.strip())
data.head()

Unnamed: 0,age,workclass,nn,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [31]:
# Distinct labels present in the salary column.
data['salary'].unique()

array(['<=50K', '>50K'], dtype=object)

1. How many men and women (sex feature) are represented in this dataset?

In [20]:
# Count rows per sex and report each group's share of the dataset.
# The counts are looked up by label rather than by position: value_counts()
# orders rows by frequency, so .iloc[0]/.iloc[1] would silently swap the two
# groups if the more frequent category ever changed.
total = len(data)
sex_counts = data['sex'].value_counts()
men = sex_counts['Male']
women = sex_counts['Female']

ptg_men = men/total
ptg_women = women/total


print('N° of male adults: %d' % men)
print('Percentage male: %.2f%s' % (ptg_men*100,'%'))

print('N° of female adults: %d' % women)
print('Percentage female: %.2f%s' % (ptg_women*100,'%'))


N° of male adults: 21790
Percentage male: 66.92%
N° of female adults: 10771
Percentage female: 33.08%


2. What is the average age (age feature) of women?

In [21]:
# Mean age per sex. Select 'age' before aggregating: averaging the whole
# frame also tries to average the text columns, which raises TypeError on
# pandas >= 2.0 (numeric_only now defaults to False).
avg_age = data.groupby('sex')['age'].mean()
# Pick the group by label instead of relying on 'Female' sorting first.
print('The average age of women is %.2f years old' % (avg_age.loc['Female']))

The average age of women is 36.86 years old


3. What is the percentage of German citizens (native-country feature)?

In [22]:
# Rows whose native country is Germany, as a percentage of all rows.
# `total` is the row count computed in the earlier sex-count cell.
is_german = data['native_country'] == 'Germany'
german = is_german.sum()
ptg_german = german / total * 100
print('The percentage of german citizens is %.2f%s' % (ptg_german,'%'))

The percentage of german citizens is 0.42%


4-5. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?

In [23]:
# Extract the ages of each salary bracket once, then summarise each group.
high_ages = data.loc[data['salary'] == '>50K', 'age']
low_ages = data.loc[data['salary'] == '<=50K', 'age']

high_salary_mean, high_salary_std = high_ages.mean(), high_ages.std()
low_salary_mean, low_salary_std = low_ages.mean(), low_ages.std()

print('High salary age mean: %.2f' % high_salary_mean)
print('High salary age std: %.2f' % high_salary_std)
print('Low salary age mean: %.2f' % low_salary_mean)
print('Low salary age std: %.2f' % low_salary_std)

High salary age mean: 44.25
High salary age std: 10.52
Low salary age mean: 36.78
Low salary age std: 14.02


6. Is it true that people who earn more than 50K have at least high school education? (education – Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate feature)

In [24]:
# Education levels that occur among people earning more than 50K.
data.loc[data['salary'] == '>50K', 'education'].unique()

array(['HS-grad', 'Masters', 'Bachelors', 'Some-college', 'Assoc-voc',
       'Doctorate', 'Prof-school', 'Assoc-acdm', '7th-8th', '12th',
       '10th', '11th', '9th', '5th-6th', '1st-4th'], dtype=object)

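It is false: the list above also contains education levels below high school (for example '7th-8th', '9th', '5th-6th' and '1st-4th'), so some people who earn more than 50K did not finish high school.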

7. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.

In [25]:
# Summary statistics of age for every (race, sex) combination.
data.groupby(['race', 'sex'])['age'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
race,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Amer-Indian-Eskimo,Female,119.0,37.117647,13.114991,17.0,27.0,36.0,46.0,80.0
Amer-Indian-Eskimo,Male,192.0,37.208333,12.049563,17.0,28.0,35.0,45.0,82.0
Asian-Pac-Islander,Female,346.0,35.089595,12.300845,17.0,25.0,33.0,43.75,75.0
Asian-Pac-Islander,Male,693.0,39.073593,12.883944,18.0,29.0,37.0,46.0,90.0
Black,Female,1555.0,37.854019,12.637197,17.0,28.0,37.0,46.0,90.0
Black,Male,1569.0,37.6826,12.882612,17.0,27.0,36.0,46.0,90.0
Other,Female,109.0,31.678899,11.631599,17.0,23.0,29.0,39.0,74.0
Other,Male,162.0,34.654321,11.355531,17.0,26.0,32.0,42.0,77.0
White,Female,8642.0,36.811618,14.329093,17.0,25.0,35.0,46.0,90.0
White,Male,19174.0,39.652498,13.436029,17.0,29.0,38.0,49.0,90.0


The maximum age of men of the Amer-Indian-Eskimo race is 82 (see the `max` column of the `Amer-Indian-Eskimo, Male` row above).

8. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.

In [26]:
# Distinct marital-status labels, used to decide who counts as married.
data['marital_status'].unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [33]:
# Compare the share of high earners (>50K) among married and single men.
# Married = any marital_status beginning with 'Married'; every other man is
# treated as single. One boolean mask splits the men into the two groups.
male = data[data.sex == 'Male']
is_married = male.marital_status.str.startswith('Married', na=False)
maried = male[is_married]
single = male[~is_married]

maried_low = len(maried[maried.salary == '<=50K'])
maried_high = len(maried[maried.salary == '>50K'])
single_low = len(single[single.salary == '<=50K'])
single_high = len(single[single.salary == '>50K'])

print(maried_low,maried_high)
print(single_low,single_high)

# Share of high earners within each group (NaN when a group is empty), so the
# conclusion is derived from the data instead of being hard-coded.
maried_ratio = maried_high / len(maried) if len(maried) else float('nan')
single_ratio = single_high / len(single) if len(single) else float('nan')
if maried_ratio > single_ratio:
    print('Among the maried males the proportion is greater')
elif single_ratio > maried_ratio:
    print('Among the single males the proportion is greater')
else:
    # Equal shares, or one group is empty and cannot be compared.
    print('Neither group has a greater proportion')

7576 5965
7552 697
Among the maried males the proportion is greater


9. What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?

In [39]:
# Take the maximum weekly hours from the data instead of hard-coding 99, so
# the message and the filter below stay correct if the dataset changes.
# (The former bare data.describe() call was dropped: it was not the last
# expression of the cell, so its result was never displayed.)
max_hrs = data.hrs_week.max()
print('The maximum number of hour a person works per week is %d' % max_hrs)

# Everyone who works exactly that maximum number of hours.
max_hours = data[data.hrs_week == max_hrs]
print('The number of people working those hours are: %d' % len(max_hours))

# Share of high earners (>50K) within that group.
max_hours_high = max_hours[max_hours.salary == '>50K']
print('The percentage of high earners among this people is: %.2f%s' % ((len(max_hours_high)/len(max_hours))*100,'%'))

The maximum number of hour a person works per week is 99
The number of people working those hours are: 85
The percentage of high earners among this people is: 29.41%


10. Count the average time of work (hours-per-week) for those who earn a little and a lot (salary) for each country (native-country). What will these be for Japan?

In [41]:
# Mean weekly hours for every (country, salary) pair; show the Japan rows.
(
    data
    .groupby(['native_country', 'salary'])['hrs_week']
    .mean()
    .loc['Japan']
)

salary
<=50K    41.000000
>50K     47.958333
Name: hrs_week, dtype: float64