# Demographic Data Analyzer

In [2]:
import pandas as pd

In [3]:
# Load the dataset from a CSV file addressed relative to the notebook's
# working directory; every later cell reads from this `df` frame.
df = pd.read_csv(filepath_or_buffer=r"../data/adult.data.csv")

def rd(value: float) -> float:
    """Round ``value`` to one decimal place.

    Shorthand for the built-in ``round(value, 1)``, used to present the
    averages and percentages computed in this notebook.

    Args:
        value: Number to round.

    Returns:
        ``value`` rounded to one decimal place. The result is numeric
        (whatever ``round`` returns for the input type), not a string.
    """
    return round(value, 1)

### Race Count

In [122]:
# Number of rows per distinct value of the `race` column, most frequent first.
df['race'].value_counts()

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

### Average age of men

In [123]:
# Mean of the `age` column restricted to rows whose `sex` is 'Male'.
mean_age = df.loc[df['sex'] == 'Male', 'age'].mean()
rd(mean_age)

39.4

### Percentage of people with a bachelor's degree

In [124]:
# Relative frequency of every education level, then pick out 'Bachelors'.
bachelors_frac = df['education'].value_counts(normalize=True)['Bachelors']
rd(100 * bachelors_frac)

16.4

### Percentage of people with advanced education (Bachelor's and above) making more than $50K

In [125]:
# Education levels treated as "advanced education" by the cells that
# filter on `degrees`.
degrees = [
    'Bachelors',
    'Masters',
    'Doctorate',
]

In [126]:
# Salary distribution (as fractions) among rows whose education is listed
# in `degrees`; keep the '>50K' share.
he_frac = (
    df.loc[df['education'].isin(degrees), 'salary']
    .value_counts(normalize=True)
    .loc['>50K']
)
rd(100 * he_frac)

46.5

### Percentage of people without advanced education (Bachelor's and above) making more than $50K

In [127]:
# Same computation as for `degrees` holders, but on the complement: rows
# whose education is NOT listed in `degrees`.
le_frac = (
    df.loc[~df['education'].isin(degrees), 'salary']
    .value_counts(normalize=True)
    .loc['>50K']
)
rd(100 * le_frac)

17.4

### Minimum number of hours a person works per week

In [128]:
# Smallest value found in the `hours-per-week` column.
df.loc[:, 'hours-per-week'].min()

1

### Percentage of people working the minimum number of hours per week who have a salary of >50K

In [129]:
# Keep only rows whose weekly hours equal the column minimum, then take the
# fraction of those rows with salary '>50K'.
rich_frac = (
    df.loc[df['hours-per-week'].eq(df['hours-per-week'].min()), 'salary']
    .value_counts(normalize=True)
    .loc['>50K']
)
rd(100 * rich_frac)

10.0

### Country with the highest percentage of people who earn >50K

In [130]:
# Per-country count of '>50K' rows divided by the per-country row total.
# A country with no '>50K' rows is missing from the numerator and therefore
# comes out as NaN after index alignment; `idxmax` skips NaN by default.
highest_rich_series = (
    df.loc[df['salary'] == '>50K', 'native-country']
    .value_counts()
    .div(df['native-country'].value_counts())
)
highest_rich_country = highest_rich_series.idxmax()
highest_rich_country

'Iran'

In [131]:
# Share of '>50K' earners in the top-ranked country, expressed as a percentage.
rd(100 * highest_rich_series[highest_rich_country])

41.9

### Most popular occupation for those who earn >50K in India

In [132]:
# Occupations of rows with native country 'India' and salary '>50K';
# `idxmax` on the counts returns the most frequent occupation label.
(
    df.loc[
        (df['native-country'] == 'India') & (df['salary'] == '>50K'),
        'occupation',
    ]
    .value_counts()
    .idxmax()
)

'Prof-specialty'