In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

def run_income_classification_project(file_path='adult 3.csv'):
    """
    Runs an employee income classification project using the adult 3.csv dataset.

    This script performs:
    1. Data Loading and Initial Inspection
    2. Data Cleaning (handling '?' as missing values)
    3. Target Variable Preparation (converting income to binary 0/1)
    4. Feature Engineering and Preprocessing (One-Hot Encoding, Standardization)
    5. Model Training (Random Forest Classifier)
    6. Model Evaluation (Accuracy, Classification Report, Confusion Matrix)

    Args:
        file_path (str): The path to the 'adult 3.csv' dataset.
    """
    print(f"--- Starting Employee Income Classification Project with {file_path} ---")

    # --- 1. Data Loading and Initial Inspection ---
    try:
        # Load the dataset
        # The 'adult.csv' dataset typically does not have a header row,
        # so we define column names manually based on common knowledge of the dataset.
        column_names = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
            'income'
        ]
        df = pd.read_csv(file_path, names=column_names, skipinitialspace=True)
        print("\nDataset loaded successfully. First 5 rows:")
        print(df.head())
        print("\nDataset Info:")
        df.info()
        print("\nMissing values (represented as '?'):")
        # Replace '?' with NaN for proper missing value handling
        df = df.replace('?', np.nan)
        print(df.isnull().sum())

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure the file is in the correct directory.")
        return
    except Exception as e:
        print(f"An error occurred during data loading: {e}")
        return

    # --- 2. Data Cleaning and Preprocessing ---

    # Drop 'fnlwgt' as it's typically a sampling weight and not predictive for individual income
    if 'fnlwgt' in df.columns:
        df = df.drop('fnlwgt', axis=1)
        print("\n'fnlwgt' column dropped.")

    # Identify categorical and numerical features
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Remove 'income' from features list as it's the target
    if 'income' in categorical_features:
        categorical_features.remove('income')

    print(f"\nCategorical features: {categorical_features}")
    print(f"Numerical features: {numerical_features}")

    # --- 3. Target Variable Preparation ---
    # Convert 'income' column to a binary target: 1 for '>50K', 0 for '<=50K'
    df['income_over_50k'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)
    X = df.drop('income', axis=1) # Features
    X = X.drop('income_over_50k', axis=1) # Drop the original income column
    y = df['income_over_50k'] # Target

    print("\nTarget variable 'income_over_50k' created (1 for >50K, 0 for <=50K).")
    print(f"Target distribution:\n{y.value_counts()}")

    # --- 4. Feature Engineering and Preprocessing Pipeline ---
    # Create preprocessing pipelines for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')), # Impute missing numerical values with median
        ('scaler', StandardScaler()) # Scale numerical features
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')), # Impute missing categorical values with mode
        ('onehot', OneHotEncoder(handle_unknown='ignore')) # One-hot encode categorical features
    ])

    # Create a preprocessor using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # --- 5. Model Training ---
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"\nData split into training ({len(X_train)} samples) and testing ({len(X_test)} samples).")

    # Create the full pipeline: preprocessing + model
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))])

    print("\nTraining the Random Forest Classifier...")
    model_pipeline.fit(X_train, y_train)
    print("Model training complete.")

    # --- 6. Model Evaluation ---
    print("\nEvaluating the model on the test set...")
    y_pred = model_pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\n--- Project Finished ---")

# Import SimpleImputer after defining run_income_classification_project
# to ensure it's available for the pipeline.
from sklearn.impute import SimpleImputer

# Run the project
if __name__ == "__main__":
    run_income_classification_project()


--- Starting Employee Income Classification Project with adult 3.csv ---

Dataset loaded successfully. First 5 rows:
   age  workclass  fnlwgt     education    education-num      marital-status  \
0  age  workclass  fnlwgt     education  educational-num      marital-status   
1   25    Private  226802          11th                7       Never-married   
2   38    Private   89814       HS-grad                9  Married-civ-spouse   
3   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
4   44    Private  160323  Some-college               10  Married-civ-spouse   

          occupation  relationship   race     sex  capital-gain  capital-loss  \
0         occupation  relationship   race  gender  capital-gain  capital-loss   
1  Machine-op-inspct     Own-child  Black    Male             0             0   
2    Farming-fishing       Husband  White    Male             0             0   
3    Protective-serv       Husband  White    Male             0             0 