In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pytz
import sklearn
import scipy

In [14]:
#check versions of packages
import sys
print(sys.version)
print('pandas version:', pd.__version__)
print('numpy version:', np.__version__)
print('seaborn:', sns.__version__)
print('re version:', re.__version__)
print('pytz version:', pytz.__version__)
print('sklearn version:', sklearn.__version__)
print('scipy version:', scipy.__version__)

3.11.8 | packaged by Anaconda, Inc. | (main, Feb 26 2024, 21:34:05) [MSC v.1916 64 bit (AMD64)]
pandas version: 2.2.1
numpy version: 1.26.4
seaborn: 0.12.2
re version: 2.2.1
pytz version: 2023.3.post1
sklearn version: 1.2.2
scipy version: 1.11.4


In [15]:
data = pd.read_csv('CPS.csv')

In [16]:
# Select and rename relevant columns for analysis
selected_columns = {
    # Household Information
    'hrhhid2': 'household_id',
    'HUFINAL': 'interview_outcome',
    'hrnumhou': 'household_members',  # Number of members in a household

    # Education and Certification Details
    'peschlvl': 'highest_level_of_school',  # Highest level of school 
    'peeduca': 'education_level',  # Highest level of education completed
    'peedegr': 'highest_degree_earned',  # Highest degree earned
    'petrain': 'received_job_training',  # Received job training past high school
    'pecert1': 'professional_certification1',  # Professional certification 1
    'pecert2': 'cert_issued_by_state_or_gov',  # Is certificate issued by the state or the federal government?
    'pecert3': 'is_cert_required',  # Is certification needed for the job?

    # Employment and Income Details
    'pemlr': 'employment_status',  # Employment status
    'prunedor': 'duration_of_unemployment',  # Duration of unemployment
    'hefaminc': 'family_income',  # Family income
    'primind1': 'primary_industry_code_job1',  # Primary industry code for job 1
    'peio1icd': 'industry_code_job1',  # Industry code for job 1
    'ptio1ocd': 'occupation_code_job1',  # Occupation code for job 1
    'primind2': 'primary_industry_code_job2',  # Primary industry code for job 2
    'peio2icd': 'industry_code_job2',  # Industry code for job 2
    'ptio2ocd': 'occupation_code_job2',  # Occupation code for job 2
    'puwk': 'did_work_last_week',  # Did the individual work last week
    
    # Earnings
    'peernhry': 'hourly_earnings',  # Hourly earnings
    'pternh1c': 'earnings_first_job',  # Earnings from the first job
    'pthr': 'total_hours_worked',  # Total hours worked
    'pternwa': 'weekly_earnings',  # Weekly earnings

    # Demographic Variables
    'ptdtrace': 'race',  # Race
    'pesex': 'gender',  # Gender
    'prcitshp': 'citizenship_status',  # Citizenship status

     # Work Arrangement and Preferences
    'pehruslt': 'usual_hours_worked',  # Usual hours worked per week
    'pewa': 'work_at_home',  # Ability or frequency of working from home
    'pewm': 'work_multiple_jobs',  # Working multiple jobs
    
   
     # Job Search and Unemployment Details
    'pruntype': 'reason_for_unemployment',  # Reason for unemployment
    'pulaydtx': 'weeks_looking_for_work',  # Weeks spent looking for work
    'pulkm1': 'primary_method_of_job_search',  # Primary method of job search
    'puhrck12': 'job_search_method',  # Job search method

     # Health and Disability
    'pedisabl': 'disability_status',  # Disability status
    'pehspnon': 'health_status_impacting_work',  # Health status impacting work
    
}

In [17]:
# Filter the dataset to keep only the selected columns, verifying they exist
data_filtered = data[[col for col in selected_columns if col in data.columns]].rename(columns=selected_columns)

In [18]:
data_filtered

Unnamed: 0,household_id,interview_outcome,household_members,highest_level_of_school,education_level,professional_certification1,cert_issued_by_state_or_gov,is_cert_required,employment_status,family_income,...,hourly_earnings,earnings_first_job,total_hours_worked,weekly_earnings,race,gender,citizenship_status,usual_hours_worked,reason_for_unemployment,health_status_impacting_work
0,15011,201,3,-1.0,37.0,2.0,-1.0,-1.0,4.0,12,...,-1.0,-1.0,0,-1.0,1.0,2.0,1.0,-1.0,2.0,2.0
1,15011,201,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,12,...,-1.0,-1.0,0,-1.0,1.0,1.0,1.0,-1.0,-1.0,2.0
2,15011,201,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,12,...,-1.0,-1.0,0,-1.0,1.0,2.0,1.0,-1.0,-1.0,2.0
3,16011,201,1,-1.0,40.0,2.0,-1.0,-1.0,1.0,13,...,1.0,-1.0,0,115400.0,1.0,2.0,1.0,40.0,-1.0,2.0
4,15011,201,2,-1.0,40.0,2.0,-1.0,-1.0,1.0,16,...,1.0,-1.0,0,72000.0,1.0,1.0,1.0,40.0,-1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126779,17111,218,0,,,,,,,-1,...,,,0,,,,,,,
126780,15111,218,5,,,,,,,-1,...,,,0,,,,,,,
126781,15111,218,0,,,,,,,-1,...,,,0,,,,,,,
126782,15111,218,5,,,,,,,-1,...,,,0,,,,,,,
