<a href="https://colab.research.google.com/github/Thushara-Mohan/demographic_data_analyzer/blob/main/demographic_data_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install ucimlrepo



In [2]:
from ucimlrepo import fetch_ucirepo

# Download dataset number 2 from the UCI repository (network access required).
adult = fetch_ucirepo(id=2)

# Pull the feature table and the target table out of the fetched bundle.
X, y = adult.data.features, adult.data.targets

# Print the dataset-level metadata followed by the per-variable description,
# each on its own line.
print(adult.metadata, adult.variables, sep='\n')


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [4]:
import numpy as np
import pandas as pd
# Work on an independent copy of the fetched feature table. Later cells add and
# rewrite a 'salary' column on df, and wrapping X with pd.DataFrame(X) would not
# copy its data, leaving df tied to the object held by `adult`.
df = X.copy()

# Preview the first rows (the last expression of a cell is rendered).
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [7]:
# Count how many rows carry each distinct 'race' label; value_counts orders the
# result from most to least frequent. The bare name below renders the Series.
race_counts = df.loc[:, 'race'].value_counts()
race_counts

Unnamed: 0_level_0,count
race,Unnamed: 1_level_1
White,41762
Black,4685
Asian-Pac-Islander,1519
Amer-Indian-Eskimo,470
Other,406


In [15]:
# Pick the 'age' values of the rows whose 'sex' is 'Male' in a single .loc
# lookup (row mask + column label), then take their mean.
avg_age_of_men = df.loc[df['sex'] == 'Male', 'age'].mean()
print(f"Average age of men: {avg_age_of_men}")

Average age of men: 39.49439509954058


In [14]:
# Number of rows whose 'education' equals 'Bachelors', as a percentage of all rows.
percentage_bachelors = df['education'].eq('Bachelors').sum() / df.shape[0] * 100
print(f"Percentage of people who have bachelors: {percentage_bachelors:.2f}%")

Percentage of people who have bachelors: 16.43%


In [20]:
# Wrap the fetched targets in a DataFrame; its 'income' column holds the label.
df1 = pd.DataFrame(y)

# Attach the label to the feature table under the name 'salary'. Column
# assignment from a Series aligns on the index.
df['salary'] = df1['income']

# Render the combined frame (pandas truncates the display to the first and last
# rows), which makes the differently formatted labels near the end visible.
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [22]:
# Normalise the salary labels: some rows carry a trailing '.' (e.g. '<=50K.'),
# so every literal '.' is removed before the labels are compared.
df['salary'] = df['salary'].str.replace('.', '', regex=False)

# Boolean mask: education is one of the three degree levels (OR-ed via isin)
# AND the salary label is '>50K'.
higher_education_high_salary = (
    df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
    & df['salary'].eq('>50K')
)

# Matching rows as a percentage of ALL rows in df.
# NOTE(review): because the denominator is the whole dataset, this is the joint
# share, not the rate among degree holders — confirm that is the intended metric.
percentage = higher_education_high_salary.sum() / len(df) * 100

print(f"Percentage of people with higher education and high salary: {percentage:.2f}%")

Percentage of people with higher education and high salary: 10.65%


In [24]:
# Boolean mask: education is none of Bachelors / Masters / Doctorate
# AND the salary label is '>50K'.
low_education_high_salary = (
    ~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
    & df['salary'].eq('>50K')
)

# Matching rows as a percentage of ALL rows in df.
# NOTE(review): the denominator is the whole dataset, not only the rows without
# one of those degrees — confirm that is the intended metric.
percentage = low_education_high_salary.sum() / len(df) * 100
print(f"Percentage of people with low education and salary above 50k: {percentage:.2f}%")

Percentage of people with low education and salary above 50k: 13.28%


In [25]:
# Smallest value found in the 'hours-per-week' column.
min_work_hours_per_week = df.loc[:, 'hours-per-week'].min()
print(f"Minimum number of hours a person works per week: {min_work_hours_per_week}")

Minimum number of hours a person works per week: 1


In [33]:
# Count the rows that both work the minimum weekly hours (computed in an
# earlier cell as min_work_hours_per_week) and have the '>50K' salary label.
min_work_more_salary = (
    df['hours-per-week'].eq(min_work_hours_per_week)
    & df['salary'].eq('>50K')
).sum()
print(f"Number of people who work the minimum hours per week and earn more than 50k: {min_work_more_salary}")

# Express that count as a percentage of ALL rows in df (printed unrounded).
percentage = min_work_more_salary / len(df) * 100
print(f"Percentage of people who work the minimum hours per week and earn more than 50k: {percentage}")

Number of people who work the minimum hours per week and earn more than 50k: 3
Percentage of people who work the minimum hours per week and earn more than 50k: 0.0061422546169280536


In [45]:
# Rows per native country, over everyone and over the '>50K' earners only.
country_counts = df['native-country'].value_counts()
rich_country_counts = df.loc[df['salary'] == '>50K', 'native-country'].value_counts()

# Per-country percentage of '>50K' earners, highest first. The division aligns
# the two Series on the country label; a country missing from
# rich_country_counts becomes NaN, which sort_values places at the end.
rich_country_percentage = (
    rich_country_counts
    .div(country_counts)
    .mul(100)
    .sort_values(ascending=False)
)

# The first entry of the sorted Series is the top country and its percentage.
print(f"Country with highest percentage of people earning >50k: {rich_country_percentage.index[0]}")
print(f"Percentage: {rich_country_percentage.iloc[0]:.4f}")

Country with highest percentage of people earning >50k: France
Percentage: 42.1053


In [50]:
# Keep only the rows with the '>50K' salary label whose native country is 'India'.
high_sal_india = df.loc[df['salary'].eq('>50K') & df['native-country'].eq('India')]

# Occupation frequencies within that subset, most common first.
pop_occ = high_sal_india.loc[:, 'occupation'].value_counts()
print(pop_occ)

# The first index label of the sorted counts is the most frequent occupation.
print(f"Most Popular occupation for those who earn >50k in India: {pop_occ.index[0]}")

occupation
Prof-specialty      35
Exec-managerial     12
Craft-repair         4
Tech-support         3
Adm-clerical         3
Other-service        2
Sales                2
Transport-moving     1
Name: count, dtype: int64
Most Popular occupation for those who earn >50k in India: Prof-specialty
