# Data Exploration for Census Income Dataset

In [22]:
import pandas as pd
import os
import numpy as np

# Load the CSV file into a DataFrame
file_path = os.path.join("..", "data", "income", "adult.data") # Replace with your actual file path
df = pd.read_csv(file_path)

# Define column names
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country', 'income']

# Assign column names to the DataFrame
df.columns = column_names

# Display the first few rows of the DataFrame
#print(df.head(4))
print("\nNumber of Rows: ", df.shape[0])
print("Number of Features: ", df.shape[1])

# Divide the features into numerical and non-numerical lists
# Extract numerical and string features
num_features = df.select_dtypes(include=['number']).columns.tolist()
str_features = df.select_dtypes(include=['object', 'string']).columns.tolist()

# Display the feature lists
print("Numerical Features:", num_features)
print("String Features:", str_features)


Number of Rows:  32560
Number of Features:  15
Numerical Features: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
String Features: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


In [None]:
# Convert income column to binary, flag errors
def convert_income(value):
    value = str(value).strip()
    if value == '>50K':
        return 1
    elif value == '<=50K':
        return 0
    else:
        return np.nan  # Flag invalid values as NaN (or set a custom error flag)

df['income'] = df['income'].apply(convert_income)

# Identify and display rows with errors
error_rows = df[df['income'].isna()]
if not error_rows.empty:
    print("Invalid income values found in ", error_rows.size, "rows: ")
    print(error_rows)



In [24]:
df.head(4)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
