In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('./adult.data.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [29]:
# How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
race_count = df["race"].value_counts().values.tolist()
race_count

[27816, 3124, 1039, 311, 271]

In [6]:
# What is the average age of men?
average_age_men = df[df["sex"] == "Male"]["age"].mean().round(1)
average_age_men

39.4

In [8]:
# What is the percentage of people who have a Bachelor's degree?
percentage_bachelors = round((df[df["education"] == "Bachelors"].shape[0] / df.shape[0]) * 100, 1)
percentage_bachelors

16.4

In [21]:
 # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
# What percentage of people without advanced education make more than 50K?
# with and without `Bachelors`, `Masters`, or `Doctorate`
higher_education = df[df["education"].isin(['Bachelors', 'Masters', 'Doctorate'])]
lower_education = df[~df["education"].isin(['Bachelors', 'Masters', 'Doctorate'])]

higher_education_rich = round((higher_education[higher_education["salary"] == ">50K"].shape[0] / higher_education.shape[0]) * 100, 1)
lower_education_rich = round((lower_education[lower_education["salary"] == ">50K"].shape[0] / lower_education.shape[0]) * 100, 1)

print(higher_education_rich)
print(lower_education_rich)

46.5
17.4


In [12]:
# What is the minimum number of hours a person works per week (hours-per-week feature)?
min_work_hours = df["hours-per-week"].min()
min_work_hours

1

In [20]:
# What percentage of the people who work the minimum number of hours per week have a salary of >50K?
num_min_workers = df[df["hours-per-week"] == min_work_hours]
rich_percentage = int((num_min_workers[num_min_workers["salary"] == ">50K"].shape[0] / num_min_workers.shape[0]) * 100)

rich_percentage

10

In [9]:
# What country has the highest percentage of people that earn >50K?

'''
Grouping and Counting: The size() method counts the occurrences of each combination of native-country and salary. 
The unstack() method reshapes the DataFrame to have salary values as columns.
'''
gc = df.groupby(['native-country', 'salary']).size().unstack(fill_value=0)

# Step 2: Calculate the total number of people from each country
gc['total'] = gc.sum(axis=1)

gc['percentage_earning_above_50K'] = (gc['>50K'] / gc['total']) * 100

highest_earning_country = gc[gc['percentage_earning_above_50K'] == gc['percentage_earning_above_50K'].max()].index.tolist()[0]
highest_earning_country_percentage = gc['percentage_earning_above_50K'].max().round(1)

print(highest_earning_country)
print(highest_earning_country_percentage)

Iran
41.9


In [14]:
# Identify the most popular occupation for those who earn >50K in India.
filtered = df[(df["native-country"] == "India") & (df["salary"] == ">50K")]
occupations_IN = filtered.groupby(["occupation"]).size().reset_index(name='count')
max_IN_occupation = occupations_IN["count"].max()
top_IN_occupation = occupations_IN[occupations_IN["count"] == max_IN_occupation].iloc[0]["occupation"]

top_IN_occupation

'Prof-specialty'

In [None]:
def calculate_demographic_data(print_data=True):
    # Read data from file
    df = pd.read_csv('')

    # How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
    race_count = None

    # What is the average age of men?
    average_age_men = None

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = None

    # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    # What percentage of people without advanced education make more than 50K?

    # with and without `Bachelors`, `Masters`, or `Doctorate`
    higher_education = None
    lower_education = None

    # percentage with salary >50K
    higher_education_rich = None
    lower_education_rich = None

    # What is the minimum number of hours a person works per week (hours-per-week feature)?
    min_work_hours = None

    # What percentage of the people who work the minimum number of hours per week have a salary of >50K?
    num_min_workers = None

    rich_percentage = None

    # What country has the highest percentage of people that earn >50K?
    highest_earning_country = None
    highest_earning_country_percentage = None

    # Identify the most popular occupation for those who earn >50K in India.
    top_IN_occupation = None

    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        print("Number of each race:\n", race_count) 
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(f"Percentage with higher education that earn >50K: {higher_education_rich}%")
        print(f"Percentage without higher education that earn >50K: {lower_education_rich}%")
        print(f"Min work time: {min_work_hours} hours/week")
        print(f"Percentage of rich among those who work fewest hours: {rich_percentage}%")
        print("Country with highest percentage of rich:", highest_earning_country)
        print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
        print("Top occupations in India:", top_IN_occupation)

    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'higher_education_rich': higher_education_rich,
        'lower_education_rich': lower_education_rich,
        'min_work_hours': min_work_hours,
        'rich_percentage': rich_percentage,
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage':
        highest_earning_country_percentage,
        'top_IN_occupation': top_IN_occupation
    }
