In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
adult_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"], skipinitialspace=True)

# Split the dataset into features and target
X = adult_df.drop('income', axis=1)
y = adult_df['income']

# Encode categorical variables
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Calculate feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_encoded, y)
importances = rf.feature_importances_

# Calculate cumulative importances
cumulative_importances = np.cumsum(importances)

# Find number of features for cumulative importance of 95%
num_features = np.where(cumulative_importances > 0.95)[0][0] + 1

# Extract the names of the most important features
feature_names = list(X.columns)
important_feature_names = [feature_names[i] for i in np.argsort(importances)[-num_features:]]

# Extract the most important features
important_features = X_encoded[:, np.argsort(importances)[-num_features:]]

In [9]:
# Extract the most important features
important_features = X_encoded[:, np.argsort(importances)[-num_features:]]

# Handle the skewed nature of the data
positive_class = np.where(y == '>50K')[0]
negative_class = np.where(y == '<=50K')[0]

# Oversample the minority class
negative_class = np.random.choice(negative_class, size=len(positive_class), replace=True)

# Combine the positive and negative classes
X_train = np.concatenate([important_features[positive_class], important_features[negative_class]], axis=0)
y_train = np.concatenate([y[positive_class], y[negative_class]], axis=0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model using the most important features
rf = RandomForestClassifier(n_estimators=5000, random_state=84)
rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Calculate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Print the accuracy score and the classification report
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8476251195409628
Classification Report:
 {'<=50K': {'precision': 0.857976653696498, 'recall': 0.8362831858407079, 'f1-score': 0.8469910371318822, 'support': 1582.0}, '>50K': {'precision': 0.8376175548589342, 'recall': 0.8591639871382637, 'f1-score': 0.8482539682539683, 'support': 1555.0}, 'accuracy': 0.8476251195409628, 'macro avg': {'precision': 0.8477971042777162, 'recall': 0.8477235864894859, 'f1-score': 0.8476225026929252, 'support': 3137.0}, 'weighted avg': {'precision': 0.8478847191436093, 'recall': 0.8476251195409628, 'f1-score': 0.8476170677008473, 'support': 3137.0}}


In [10]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the new dataset
test_df = pd.read_csv("adult.test", names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"], skipinitialspace=True)
test_df.drop("sex", axis=1, inplace=True)

# Split the new dataset into features and target
X_new = test_df.drop('income', axis=1)
y_new = test_df['income']
y_new = y_new.str.replace('.', '')

encoder_new = OrdinalEncoder()
input_data_encoded = encoder_new.fit_transform(X_new)
# Make a prediction using the trained model
prediction = rf.predict(input_data_encoded)

print(accuracy_score(y_new, prediction))
print(classification_report(y_new, prediction, output_dict=True))

0.7640808304158221
{'<=50K': {'precision': 0.7642356413725249, 'recall': 0.999437072778448, 'f1-score': 0.8661532564379552, 'support': 12435.0}, '>50K': {'precision': 0.631578947368421, 'recall': 0.0031201248049922, 'f1-score': 0.006209573091849935, 'support': 3846.0}, 'accuracy': 0.7640808304158221, 'macro avg': {'precision': 0.6979072943704729, 'recall': 0.5012785987917201, 'f1-score': 0.4361814147649025, 'support': 16281.0}, 'weighted avg': {'precision': 0.7328986445578461, 'recall': 0.7640808304158221, 'f1-score': 0.6630119625279299, 'support': 16281.0}}
