In [84]:
import pandas as pd

In [85]:
df = pd.DataFrame


## Reading Data

In [86]:
df = pd.read_csv('adultdata.csv', index_col = False)

## Checking if the Data needs Cleaning

In [87]:
df.size

488415

In [88]:
df.shape

(32561, 15)

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [90]:
race = df['race'].unique()
race

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)

# Analyzing Data

# Q) How many people of each race are represented in this dataset

In [91]:
df['race'].value_counts()


White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [92]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

# Q)  What is the average age of men


In [93]:
male = df[(df['sex'] == 'Male')]
male.age.mean()

39.43354749885268

# Q) What percentage of people with Bachelors Degree

In [94]:
total_education = len(df['education'])
num_bachelors = len(df[(df['education'] == 'Bachelors')])
round(num_bachelors/ total_education * 100, 1) 

16.4

# Q) What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?

In [96]:
total_advanced_education = df[df['education'].isin(['Bachelors', 'Masters', 'Doctrate'])]
round(len(total_advanced_education[total_advanced_education.salary == '>50K'])/len(total_advanced_education) * 100, 1)


44.9

# Q) What percentage of people without advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?

In [97]:
total_lower_education = df[~df['education'].isin(['Bachelors', 'Masters', 'Doctrate'])]
round(len(total_lower_education[total_lower_education.salary == '>50K'])/len(total_lower_education) * 100, 1)

18.3

# Q) What is the minimum number of hours a person works per week (hours-per-week feature)?

In [98]:
df['hours-per-week'].min()

1

# Q) What percentage of the people who work the minimum number of hours per week have a salary of >50K?

In [99]:
min_hour = df['hours-per-week'].min()
min_hour_people = df[df['hours-per-week'] == min_hour]
round(len(min_hour_people[min_hour_people.salary == '>50K'])/len(min_hour_people) * 100, 1)

10.0

# Q) What country has the highest percentage of people that earn >50K?

In [100]:
country = df['native-country'].value_counts()
country_rich = df[df['salary'] == '>50K']['native-country'].value_counts()
(country_rich / country * 100).idxmax()


'Iran'

In [101]:
(country_rich / country * 100).max()

41.86046511627907

In [106]:
df['native-country'].unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [108]:

import pandas as pd

def calculate_demographic_data(print_data=True):
  # Read data from file
  df = pd.read_csv('adultdata.csv')
  
  race_count = df['race'].value_counts()

  # What is the average age of men?
  male = df[(df['sex'] == 'Male')]
  average_age_men = male.age.mean()

  # What is the percentage of people who have a Bachelor's degree?
  total_education = len(df['education'])
  num_bachelors = len(df[(df['education'] == 'Bachelors')])
  percentage_bachelors = round(num_bachelors / total_education * 100, 1) 

  # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`)   make more than 50K?
  

  # What percentage of people without advanced education make more than 50K?

  # with and without `Bachelors`, `Masters`, or `Doctorate`
  higher_education = df[df['education'].isin(['Bachelors', 'Masters', 'Doctrate'])]

  lower_education = df[~df['education'].isin(['Bachelors', 'Masters', 'Doctrate'])]

  # percentage with salary >50K
  higher_education_rich = round(len(higher_education[higher_education.salary == '>50K'])/len(higher_education) * 100, 1)
  
  lower_education_rich = round(len(lower_education[lower_education.salary == '>50K'])/len(lower_education) * 100, 1)

  min_work_hours = df['hours-per-week'].min()

  min_hour = df['hours-per-week'].min()
  min_hour_people = df[df['hours-per-week'] == min_work_hours]

  rich_percentage = round(len(min_hour_people[min_hour_people.salary == '>50K'])/len(min_hour_people) * 100, 1)


  country = df['native-country'].value_counts()
  country_rich = df[df['salary'] == '>50K']['native-country'].value_counts()
  highest_earning_country = (country_rich / country * 100).idxmax()
  highest_earning_country_percentage = round(country_rich / country * 100, 1).max()

  people_of_use = df[(df['native-country'] == 'United-States') & (df['salary'] == '>50K')]
  occupation_count = people_of_use['occupation'].value_counts()
  top_IN_occupation = occupation_count.idxmax()


  if print_data:
      print("Number of each race:\n", race_count) 
      print()
      print("Average age of men:", average_age_men)
      print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
      print()
      print(f"Percentage with higher education that earn >50K: {higher_education_rich}%")
      print()
      print(f"Percentage without higher education that earn >50K: {lower_education_rich}%")
      print()
      print(f"Min work time: {min_work_hours} hours/week")
      print()
      print(f"Percentage of rich among those who work fewest hours: {rich_percentage}%")
      print()
      print("Country with highest percentage of rich:", highest_earning_country)
      print()
      print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
      print()
      print("Top occupations in United States:", top_IN_occupation)


calculate_demographic_data()

Number of each race:
 White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

Average age of men: 39.43354749885268
Percentage with Bachelors degrees: 16.4%

Percentage with higher education that earn >50K: 44.9%

Percentage without higher education that earn >50K: 18.3%

Min work time: 1 hours/week

Percentage of rich among those who work fewest hours: 10.0%

Country with highest percentage of rich: Iran

Highest percentage of rich people in country: 41.9%

Top occupations in United States: Exec-managerial
