# Load and Preprocess Data

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/Users/noah/Desktop/Work on Resume/adult/adult.data', header=None)  # Specify header=None if there’s no header row

# Define column names since the dataset doesn't include a header row
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# Strip extra spaces from column names
df.columns = df.columns.str.strip()

# Inspect the first few rows of the dataset
print(df.head())

# Check the columns and their names to ensure there's no mismatch
print(df.columns)

# Check for Missing Data

In [None]:
# Check for missing values
print(df.isnull().sum())

# Impute missing values (if any) for categorical columns with the most frequent value
from sklearn.impute import SimpleImputer

# Imputer for categorical variables (works for 'workclass', 'occupation', etc.)
imputer = SimpleImputer(strategy="most_frequent")
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# After imputation, check again for missing values
print(df.isnull().sum())


In [None]:
# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, drop_first=True)

# Check the first few rows of the encoded dataframe
print(df_encoded.head())

#  Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Separate features (X) and target (y)
X = df.drop('income', axis=1)
y = df['income']

# Label encode the target variable 'income' (since it's categorical)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode income as 0 (<=50K) and 1 (>50K)


# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing pipelines for both numerical and categorical features
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # For missing numerical values
    ('scaler', StandardScaler())  # Scaling numerical features
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # For missing categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # One-hot encoding for categorical features
])

# Combine both pipelines in a column transformer
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Apply the preprocessor to the data
X_processed = preprocessor.fit_transform(X)

# Check the transformed feature set
print(X_processed[:5])



# Split the Data into Training and Test Sets

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Ensure correct separation of features and target
X = df.drop('income', axis=1)
y = df['income']

# Check if the number of rows in X and y are equal
print(len(X), len(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes after the split
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


#  Model Training and Evaluation

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ValueError: could not convert string to float: ' Local-gov'

# Advanced Models (Random Forest, XGBoost)

In [42]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))

Random Forest Accuracy: 0.8610471364962383


32561 32561


32561 32561


(26048, 22144)
(32561,)


# Evaluation and Insights

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Plot confusion matrix
cm = confusion_matrix(y_test, xgb_pred)  # Replace xgb_pred with the model's predictions
plt.matshow(cm, cmap='Blues')
plt.colorbar()
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
