In [1]:
# Install and load the necessary libraries
library(readr)
library(tidyverse)
library(tidymodels)
options(repr.matrix.max.rows = 6)

zip_file_path <- "census+income.zip"

extracted_dir <- "data/"

# Unzip the file
unzip(zip_file_path, exdir = extracted_dir)

# Read the adult.data file into a dataframe
adult_data_path <- file.path(extracted_dir, "adult.data")
adult_df <- read.table(adult_data_path, header = FALSE, sep = ",", quote = "\"")

# Add appropriate column names
colnames(adult_df) <- c('age','working_class','financial_weight','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hrs_per_week','native_country','income')

# Display the first few rows of the dataframe
head(adult_df)

# Create test and train splits
adult_split = initial_split(adult_df, prop = 0.75, strata = income)
adult_training = training(adult_split)
adult_testing = testing(adult_split)

# Create a table summarizing the number of observations for each class in income, sex, race
summary_table <- adult_training |>
  group_by(income, sex, race) |>
  summarize(count = n())

# Display the summary table
print("Summary Table:")
print(summary_table)

# Produce means for age, education_num, hours_per_week (vars we will likely use)
means_table <- adult_training |>
    group_by(income, sex, race) |>
    summarize(
        mean_age = mean(age),
        mean_education_num = mean(education_num),
        mean_hours_per_week = mean(hrs_per_week)
      )

print("Means Table:")
print(means_table)

# Find the number of rows with missing values
missing_rows_training <- adult_training %>%
  summarise_all(~sum(is.na(.)))

# Display the number of rows with missing values
print("Number of Rows with Missing Values:")
print(sum(missing_rows_training))

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mpurrr    [39m 1.0.2
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.5     [32m✔[39m [34mrsample     [39m 1.2.0
[32m✔[39m [34mdials       [39m 

Unnamed: 0_level_0,age,working_class,financial_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
Unnamed: 0_level_1,<int>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>
1,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
2,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
3,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
4,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
5,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
6,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


[1m[22m`summarise()` has grouped output by 'income', 'sex'. You can override using the
`.groups` argument.


[1] "Summary Table:"
[90m# A tibble: 20 × 4[39m
[90m# Groups:   income, sex [4][39m
   income   sex       race                  count
   [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m     [3m[90m<chr>[39m[23m                 [3m[90m<int>[39m[23m
[90m 1[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Amer-Indian-Eskimo[90m"[39m    82
[90m 2[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Asian-Pac-Islander[90m"[39m   222
[90m 3[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Black[90m"[39m               [4m1[24m078
[90m 4[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Other[90m"[39m                 71
[90m 5[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m White[90m"[39m               [4m5[24m712
[90m 6[39m [90m"[39m <=50K[90m"[39m [90m"[39m Male[90m"[39m   [90m"[39m Amer-Indian-Eskimo[90m"[39m   13

[1m[22m`summarise()` has grouped output by 'income', 'sex'. You can override using the
`.groups` argument.


[1] "Means Table:"
[90m# A tibble: 20 × 6[39m
[90m# Groups:   income, sex [4][39m
   income   sex       race       mean_age mean_education_num mean_hours_per_week
   [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m     [3m[90m<chr>[39m[23m         [3m[90m<dbl>[39m[23m              [3m[90m<dbl>[39m[23m               [3m[90m<dbl>[39m[23m
[90m 1[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Amer-In…     37.2               9.21                35.5
[90m 2[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Asian-P…     33.7              10.3                 37.4
[90m 3[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Black[90m"[39m       37.9               9.44                36.7
[90m 4[39m [90m"[39m <=50K[90m"[39m [90m"[39m Female[90m"[39m [90m"[39m Other[90m"[39m       29.6               8.89                35.1
[90m 5[39m [90m"[39m <=50K[90m"[39m [90m"[39m 