In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [None]:
# Load the census-income dataset from its GitHub mirror.
# skipinitialspace=True discards blanks that directly follow a comma, so a
# field written as ", >50K" is read as '>50K' rather than ' >50K'. Without it,
# the exact string comparison used later to build the target could never match.
# The option is a no-op when the file has no such blanks.
url = 'https://github.com/dsrscientist/dataset1/raw/master/census_income.csv'
data = pd.read_csv(url, skipinitialspace=True)

In [None]:
# Display the first few rows of the dataset.
# display() is required here: a notebook only renders the value of a cell's
# *last* expression, so a bare `data.head()` in the middle of the cell is
# evaluated and silently thrown away.
display(data.head())

# The census-income (UCI Adult) data marks unknown entries with a literal '?'
# rather than leaving the field empty. Turn those placeholders (with or without
# surrounding blanks) into NaN so that isnull()/dropna() below can see them.
data = data.replace(r'^\s*\?\s*$', np.nan, regex=True)

# Check for missing values (per-column NaN counts)
display(data.isnull().sum())

# Data cleaning: remove rows with missing values.
# Rebinding instead of inplace=True keeps the cell safe to re-run, and .copy()
# gives later cells an independent frame they can assign columns on.
data = data.dropna().copy()

In [None]:
# Normalize column names for easier access: trim surrounding whitespace and
# lower-case them. Lower-casing makes later lookups such as data['income']
# independent of how the CSV capitalizes its header (e.g. 'Income').
# Both operations are no-ops on names that are already clean, so re-running
# the cell is harmless.
data.columns = data.columns.str.strip().str.lower()

In [None]:
# Convert target variable to binary: 1 where the label is '>50K', else 0.
# The guard keeps the cell idempotent: once the column holds 0/1 integers,
# comparing it with '>50K' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(data['income']):
    # Trim blanks first so a value stored as ' >50K' is still recognised.
    income_labels = data['income'].astype(str).str.strip()
    data['income'] = (income_labels == '>50K').astype(int)

In [None]:
# Separate the label column (y) from the predictor columns (X)
target_col = 'income'
y = data[target_col]
X = data.drop(columns=[target_col])


In [None]:
# Partition the predictors by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns


In [None]:
# Per-group preprocessing pipelines.
# Numerical columns are standardized by StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' lets the
# encoder accept categories at predict time that were absent during fit.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [None]:
# Route each column group through its own transformer. The order of the
# entries (numerical first, categorical second) fixes the column order of the
# transformed matrix.
column_routes = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes)


In [None]:
# End-to-end estimator: preprocessing followed by a random forest.
# random_state pins the forest's randomness so repeated runs give the same fit.
forest = RandomForestClassifier(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

In [None]:
# Split the data into training (70%) and testing (30%) sets.
# stratify=y keeps the proportion of positive/negative labels the same in both
# partitions, so the held-out metrics are not distorted by an unlucky draw of
# the minority class. random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train the model (fits the preprocessing steps and the classifier together)
model.fit(X_train, y_train)

# Make hard class predictions on the held-out set
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model on the held-out set.
# The confusion matrix and classification report are computed from the hard
# class predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC AUC measures how well the model *ranks* samples, so it must be given
# continuous scores. Feeding it hard 0/1 predictions collapses the ROC curve to
# a single operating point and misstates the AUC. Column 1 of predict_proba
# is the probability of the positive class (label 1).
y_score = model.predict_proba(X_test)[:, 1]
print(f"ROC AUC Score: {roc_auc_score(y_test, y_score)}")


In [None]:
# Build a labelled, descending-sorted Series of feature importances.
fitted_forest = model.named_steps['classifier']
fitted_preprocessor = model.named_steps['preprocessor']
importances = fitted_forest.feature_importances_

# transformers_[1] is the ('cat', pipeline, columns) entry; its one-hot step
# supplies the names of the expanded categorical columns.
fitted_onehot = fitted_preprocessor.transformers_[1][1].named_steps['onehot']
onehot_names = fitted_onehot.get_feature_names_out(categorical_cols)

# Numerical names come first, matching the transformer order in the
# ColumnTransformer, followed by the one-hot names.
feature_names = np.concatenate((numerical_cols, onehot_names))
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)


In [None]:
# Bar chart of the sorted feature importances, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
feature_importances.plot(kind='bar', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
plt.show()
