In [3]:
import pandas as pd


# Load the dataset. If the CSV is missing, print a hint and re-raise so the
# cell stops with a visible traceback; calling exit() inside a notebook ends
# the interpreter/kernel session instead of simply failing this cell.
try:
    data = pd.read_csv('adult.data.csv')
except FileNotFoundError:
    print("Error: The file 'adult.data.csv' was not found. Please ensure it's in the correct directory.")
    raise

# Remove leading/trailing whitespace from the column names.
data.columns = data.columns.str.strip()

# Remove leading/trailing whitespace from the values of every object-dtype
# column so later equality checks (e.g. == 'Male') match exactly.
for column in data.select_dtypes(include='object').columns:
    data[column] = data[column].str.strip()

print("--- Data Loading and Initial Inspection ---")
print("First 5 rows of the dataset:")
print(data.head())
print("\nDataFrame Information (Dtype, Non-Null Count):")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()
print("\n" + "="*50 + "\n")

--- Data Loading and Initial Inspection ---
First 5 rows of the dataset:
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country salary  
0          2174             0          

In [4]:
# Task 1: how many men and women appear in the dataset (feature 'sex')
print("--- Task 1: Count of men and women ---")
# value_counts() tallies the rows for each distinct value of the column.
sex_counts = data.loc[:, 'sex'].value_counts()
# One print call with a newline separator emits the caption and the table.
print("Number of individuals by sex:", sex_counts, sep="\n")
print("\n" + "=" * 50 + "\n")


--- Task 1: Count of men and women ---
Number of individuals by sex:
sex
Male      21790
Female    10771
Name: count, dtype: int64




In [5]:
# Task 2: mean age of the male respondents (feature 'age')
print("--- Task 2: Average age of men ---")
# A single .loc lookup selects the 'age' values of the rows whose 'sex' is
# 'Male'; the mean is then taken over that selection.
average_age_men = data.loc[data['sex'].eq('Male'), 'age'].mean()
print(f"The average age of men in the dataset is: {average_age_men:.2f} years")
print("\n" + "=" * 50 + "\n")


--- Task 2: Average age of men ---
The average age of men in the dataset is: 39.43 years




In [6]:
# Task 3: share of rows whose 'native-country' is 'United-States'
print("--- Task 3: Proportion of United States citizens ---")
# Summing the boolean match flags counts the matching rows.
us_citizens = data['native-country'].eq('United-States').sum()
# The denominator is the number of rows in the frame.
total_citizens = data.shape[0]
us_proportion = us_citizens / total_citizens
print(f"The proportion of United States citizens in the dataset is: {us_proportion:.2f}")
print("\n" + "=" * 50 + "\n")

--- Task 3: Proportion of United States citizens ---
The proportion of United States citizens in the dataset is: 0.90




In [7]:
# Task 4-5: mean and standard deviation of age within each 'salary' value
print("--- Task 4-5: Age statistics for high and low earners ---")
# Group the 'age' column by the 'salary' column, then compute both
# statistics for every group in one aggregation.
salary_age_stats = data['age'].groupby(data['salary']).agg(['mean', 'std'])
print("Average age and standard deviation for salary groups:", salary_age_stats, sep="\n")
print("\n" + "=" * 50 + "\n")

--- Task 4-5: Age statistics for high and low earners ---
Average age and standard deviation for salary groups:
             mean        std
salary                      
<=50K   36.783738  14.020088
>50K    44.249841  10.519028




In [8]:
# Task 6: do all people earning >50K hold one of the listed education levels?
# (features 'education' and 'salary')
print("--- Task 6: Do high earners have higher education? ---")
# Education values that this task counts as higher education.
higher_education_levels = [
    'Bachelors', 'Prof-school', 'Assoc-acdm',
    'Assoc-voc', 'Masters', 'Doctorate',
]
# Rows of the people whose salary is '>50K'.
high_earners = data.loc[data['salary'] == '>50K']
# Membership test per row; .all() is True only if every row passes.
has_listed_education = high_earners['education'].isin(higher_education_levels)
all_higher_education = has_listed_education.all()
print(f"Is it true that all high earners have at least a higher education? {all_higher_education}")
print("Note: 'True' means *all* satisfy the condition; 'False' means at least one does not.")
print("\n" + "=" * 50 + "\n")


--- Task 6: Do high earners have higher education? ---
Is it true that all high earners have at least a higher education? False
Note: 'True' means *all* satisfy the condition; 'False' means at least one does not.




In [9]:
# Task 7: descriptive age statistics for every (race, sex) pair, plus the
# maximum age among 'Asian-Pac-Islander' men.
print("--- Task 7: Age statistics by race and sex ---")
# describe() on the grouped 'age' column yields one row of summary
# statistics per (race, sex) combination.
age_stats_by_race_sex = data.groupby(['race', 'sex'])['age'].describe()
print("Age statistics grouped by race and sex:", age_stats_by_race_sex, sep="\n")
# Take the 'max' column first, then look up the (race, sex) key in it.
max_age_asian_male = age_stats_by_race_sex['max'].loc[('Asian-Pac-Islander', 'Male')]
print(f"\nMaximum age of men of Asian-Pac-Islander race: {max_age_asian_male:.0f}")
print("\n" + "=" * 50 + "\n")


--- Task 7: Age statistics by race and sex ---
Age statistics grouped by race and sex:
                             count       mean        std   min   25%   50%  \
race               sex                                                       
Amer-Indian-Eskimo Female    119.0  37.117647  13.114991  17.0  27.0  36.0   
                   Male      192.0  37.208333  12.049563  17.0  28.0  35.0   
Asian-Pac-Islander Female    346.0  35.089595  12.300845  17.0  25.0  33.0   
                   Male      693.0  39.073593  12.883944  18.0  29.0  37.0   
Black              Female   1555.0  37.854019  12.637197  17.0  28.0  37.0   
                   Male     1569.0  37.682600  12.882612  17.0  27.0  36.0   
Other              Female    109.0  31.678899  11.631599  17.0  23.0  29.0   
                   Male      162.0  34.654321  11.355531  17.0  26.0  32.0   
White              Female   8642.0  36.811618  14.329093  17.0  25.0  35.0   
                   Male    19174.0  39.652498  13.43602

In [10]:
# Task 8: Compare proportion of high earners among married vs. single men
# (features 'marital-status', 'sex', 'salary')
print("--- Task 8: Proportion of high earners among married vs. single men ---")
# Filter for only male individuals. Using .copy() to avoid SettingWithCopyWarning
# when the helper column is added below.
male_data = data[data['sex'] == 'Male'].copy()

# A man counts as married when his 'marital-status' starts with 'Married';
# everyone else is treated as single. The vectorised .str accessor with
# na=False classifies a missing status as single instead of raising the way
# a per-row x.startswith() call would on a non-string value.
is_married = male_data['marital-status'].str.startswith('Married', na=False)
male_data['marital_status_simplified'] = is_married.map({True: 'Married', False: 'Single'})

# Separate married and single men.
married_men = male_data[is_married]
single_men = male_data[~is_married]

# Proportion of high earners in each group: the mean of the boolean
# "salary is >50K" flags (NaN when the group is empty).
proportion_married = (married_men['salary'] == '>50K').mean()
proportion_single = (single_men['salary'] == '>50K').mean()

print(f"Proportion of high earners among married men: {proportion_married:.2f}")
print(f"Proportion of high earners among single men: {proportion_single:.2f}")

# Report the comparison explicitly for every case; a plain if/else would
# claim "greater among single men" even when the proportions are equal or
# one of the groups is empty.
if married_men.empty or single_men.empty:
    print("Cannot compare the groups: at least one of them is empty.")
elif proportion_married > proportion_single:
    print("The proportion of high earners is greater among married men.")
elif proportion_married < proportion_single:
    print("The proportion of high earners is greater among single men.")
else:
    print("The proportion of high earners is the same among married and single men.")
print("\n" + "="*50 + "\n")


--- Task 8: Proportion of high earners among married vs. single men ---
Proportion of high earners among married men: 0.44
Proportion of high earners among single men: 0.08
The proportion of high earners is greater among married men.




In [11]:
# Task 9: the largest 'hours-per-week' value, how many people work exactly
# that much, and what percentage of them earn >50K
# (features 'hours-per-week', 'salary')
print("--- Task 9: Max hours per week, count, and high earner percentage ---")
max_hours = data['hours-per-week'].max()
# Summing the boolean flags counts the rows that hit the maximum.
people_with_max_hours = data['hours-per-week'].eq(max_hours).sum()
# Rows that both hit the maximum and have salary '>50K'; len() of the
# selection is the number of such rows.
high_earners_with_max_hours = len(
    data.loc[data['hours-per-week'].eq(max_hours) & data['salary'].eq('>50K')]
)

# Share of high earners among the max-hours workers, as a percentage.
percent_high_earners = high_earners_with_max_hours / people_with_max_hours * 100

print(f"Maximum hours per week worked by anyone: {max_hours}")
print(f"Number of people working this maximum amount of hours: {people_with_max_hours}")
print(f"Percentage of high earners among those working maximum hours: {percent_high_earners:.2f}%")
print("\n" + "=" * 50 + "\n")

--- Task 9: Max hours per week, count, and high earner percentage ---
Maximum hours per week worked by anyone: 99
Number of people working this maximum amount of hours: 85
Percentage of high earners among those working maximum hours: 29.41%




In [12]:
# Task 10: mean 'hours-per-week' for every ('native-country', 'salary') pair
print("--- Task 10: Average working hours by country and salary ---")
# Group the hours column by country and salary, average each group, then
# move the 'salary' level into the columns so each country is one row.
avg_hours_by_country_salary = (
    data['hours-per-week']
    .groupby([data['native-country'], data['salary']])
    .mean()
    .unstack(level='salary')
)
print(
    "Average working hours per week by native country and salary level:",
    avg_hours_by_country_salary,
    sep="\n",
)
print("\n" + "=" * 50 + "\n")

--- Task 10: Average working hours by country and salary ---
Average working hours per week by native country and salary level:
salary                          <=50K       >50K
native-country                                  
?                           40.164760  45.547945
Cambodia                    41.416667  40.000000
Canada                      37.914634  45.641026
China                       37.381818  38.900000
Columbia                    38.684211  50.000000
Cuba                        37.985714  42.440000
Dominican-Republic          42.338235  47.000000
Ecuador                     38.041667  48.750000
El-Salvador                 36.030928  45.000000
England                     40.483333  44.533333
France                      41.058824  50.750000
Germany                     39.139785  44.977273
Greece                      41.809524  50.625000
Guatemala                   39.360656  36.666667
Haiti                       36.325000  42.750000
Holand-Netherlands          40.000000  

In [13]:
# Task 11: Create AgeGroup column
print("--- Task 11: Create AgeGroup column ---")


def age_grouper(age):
    """Return the age-group label for a single age value.

    Labels and their ranges (bounds as written in the comparisons below):
      * 'young'   -- 16 <= age <= 35
      * 'adult'   -- 35 <  age <= 70
      * 'retiree' -- 70 <  age <= 100
      * 'other'   -- anything that matches none of the ranges above
    """
    # Pair every label with the result of its range test; the first pair
    # whose test is true decides the label.
    brackets = (
        ('young', 16 <= age <= 35),
        ('adult', 35 < age <= 70),
        ('retiree', 70 < age <= 100),
    )
    return next((label for label, in_range in brackets if in_range), 'other')

# Label every row by passing its 'age' value through age_grouper and store
# the labels in a new 'AgeGroup' column.
data['AgeGroup'] = data['age'].map(age_grouper)
# Preview the source column next to the derived one.
print("First 5 rows showing 'age' and new 'AgeGroup' column:", data.head()[['age', 'AgeGroup']], sep="\n")
print("\n" + "=" * 50 + "\n")

--- Task 11: Create AgeGroup column ---
First 5 rows showing 'age' and new 'AgeGroup' column:
   age AgeGroup
0   39    adult
1   50    adult
2   38    adult
3   53    adult
4   28    young




In [14]:
# Task 12-13: Determine high earners per age group and the group with the highest proportion
# (features 'AgeGroup', 'salary')
print("--- Task 12-13: High earners by age group and most frequent group ---")
# Count high earners in each 'AgeGroup' value.
high_earners_by_agegroup = data[data['salary'] == '>50K']['AgeGroup'].value_counts()
print("Number of high earners in each age group:")
print(high_earners_by_agegroup)

# Count total individuals in each age group.
total_by_agegroup = data['AgeGroup'].value_counts()
# Proportion of high earners within each age group, sorted from highest to
# lowest. An age group without any high earner is missing from
# high_earners_by_agegroup, so align the counts on the full set of groups
# with a zero fill; otherwise its proportion would come out as NaN, not 0.0.
proportion_high_earners = (
    high_earners_by_agegroup.reindex(total_by_agegroup.index, fill_value=0)
    / total_by_agegroup
).sort_values(ascending=False)
print("\nProportion of high earners within each age group:")
print(proportion_high_earners)

# Identify the age group with the highest proportion of high earners.
most_frequent_group = proportion_high_earners.idxmax()
print(f"\nThe age group with the highest proportion of high earners is: {most_frequent_group}")
print("\n" + "="*50 + "\n")

--- Task 12-13: High earners by age group and most frequent group ---
Number of high earners in each age group:
AgeGroup
adult      6042
young      1705
retiree      94
Name: count, dtype: int64

Proportion of high earners within each age group:
AgeGroup
adult      0.353416
retiree    0.174074
young      0.114238
Name: count, dtype: float64

The age group with the highest proportion of high earners is: adult




In [15]:
# Task 14: keep only the occupations whose group passes a test on average
# age and minimum weekly hours
# (features 'occupation', 'age', 'hours-per-week')
print("--- Task 14: Filter groups by occupation ---")
# One group per distinct 'occupation' value.
grouped_by_occupation = data.groupby(by='occupation')

# Predicate used with groupby(...).filter(): it receives one group at a time.
def filter_func(group):
    """Tell whether a group passes both occupation criteria.

    Parameters
    ----------
    group : the rows of one group; must provide 'age' and 'hours-per-week'.

    Returns
    -------
    Truthy when the group's mean 'age' is at most 40 and its smallest
    'hours-per-week' value is greater than 5; falsy otherwise.
    """
    mean_age_ok = group['age'].mean() <= 40
    min_hours_ok = group['hours-per-week'].min() > 5
    return mean_age_ok and min_hours_ok

# Run the predicate on every occupation group; .filter() keeps the rows of
# the groups for which filter_func returns a truthy value.
filtered_groups_df = grouped_by_occupation.filter(filter_func)

# Counting the 'occupation' values that remain shows which occupations
# passed and how many rows each one contributes.
surviving_occupations = filtered_groups_df['occupation'].value_counts()
print(
    "Occupations that meet the filtering criteria (average age <= 40 AND all workers work > 5 hours per week):",
    surviving_occupations,
    sep="\n",
)
print("\n" + "=" * 50 + "\n")


--- Task 14: Filter groups by occupation ---
Occupations that meet the filtering criteria (average age <= 40 AND all workers work > 5 hours per week):
occupation
Armed-Forces    9
Name: count, dtype: int64


