In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [41]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
adult_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"], skipinitialspace=True)

# Split the dataset into features and target
X = adult_df.drop('income', axis=1)
y = adult_df['income']

# Encode categorical variables
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Calculate feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_encoded, y)
importances = rf.feature_importances_

# Calculate cumulative importances
cumulative_importances = np.cumsum(importances)

# Find number of features for cumulative importance of 95%
num_features = np.where(cumulative_importances > 0.95)[0][0] + 1

# Extract the names of the most important features
feature_names = list(X.columns)
important_feature_names = [feature_names[i] for i in np.argsort(importances)[-num_features:]]

# Extract the most important features
important_features = X_encoded[:, np.argsort(importances)[-num_features:]]

In [42]:
# Extract the most important features
important_features = X_encoded[:, np.argsort(importances)[-num_features:]]

# Handle the skewed nature of the data
positive_class = np.where(y == '>50K')[0]
negative_class = np.where(y == '<=50K')[0]

# Oversample the minority class
negative_class = np.random.choice(negative_class, size=len(positive_class), replace=True)

# Combine the positive and negative classes
X_train = np.concatenate([important_features[positive_class], important_features[negative_class]], axis=0)
y_train = np.concatenate([y[positive_class], y[negative_class]], axis=0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model using the most important features
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Calculate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Print the accuracy score and the classification report
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8536818616512591
Classification Report:
 {'<=50K': {'precision': 0.8639014906027219, 'recall': 0.8426042983565107, 'f1-score': 0.85312, 'support': 1582.0}, '>50K': {'precision': 0.8437892095357591, 'recall': 0.864951768488746, 'f1-score': 0.8542394410924102, 'support': 1555.0}, 'accuracy': 0.8536818616512591, 'macro avg': {'precision': 0.8538453500692404, 'recall': 0.8537780334226284, 'f1-score': 0.8536797205462051, 'support': 3137.0}, 'weighted avg': {'precision': 0.8539319027611131, 'recall': 0.8536818616512591, 'f1-score': 0.8536749030598334, 'support': 3137.0}}


In [46]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the new dataset
test_df = pd.read_csv("adult.test", names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"], skipinitialspace=True)

# Split the new dataset into features and target
X_new = test_df.drop('income', axis=1)
y_new = test_df['income']

# Fit the OrdinalEncoder to the entire dataset, including the new data
encoder = OrdinalEncoder()
encoder.fit(pd.concat([X, X_new]))

# Transform the new data
X_new_encoded = encoder.transform(X_new)

# Extract the most important features for the new data
important_features_new = X_new_encoded[:, np.argsort(importances)[-num_features:]]

# Make predictions on the new data using the trained model
y_new_pred = rf.predict(important_features_new)

# Calculate the accuracy score and the classification report for the new data
new_accuracy = accuracy_score(y_new, y_new_pred)
new_report = classification_report(y_new, y_new_pred, output_dict=True, zero_division=0)

# Print the accuracy score and the classification report for the new data
print("New Data Accuracy:", new_accuracy)
print("New Data Classification Report:\n", new_report)

New Data Accuracy: 0.0
New Data Classification Report:
 {'<=50K': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0.0}, '<=50K.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12435.0}, '>50K': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0.0}, '>50K.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3846.0}, 'accuracy': 0.0, 'macro avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16281.0}, 'weighted avg': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16281.0}}


In [None]:
# Handle the skewed nature of the data
positive_class = np.where(y == '>50K')[0]
negative_class = np.where(y == '<=50K')[0]

# Oversample the minority class
negative_class = np.random.choice(negative_class, size=len(positive_class), replace=True)

# Combine the positive and negative classes
X_train = np.concatenate([important_features[positive_class], important_features[negative_class]], axis=0)
y_train = np.concatenate([y[positive_class], y[negative_class]], axis=0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model using the most important features
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Calculate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Print the accuracy score and the classification report
print("Accuracy:", accuracy)
print("Classification Report:\n", report)