In [1]:
# required models and modules
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
adult_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"], skipinitialspace=True)

# Split the dataset into features and target
X = adult_df.drop('income', axis=1)
y = adult_df['income']

# Encode categorical variables
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Calculate feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_encoded, y)
importances = rf.feature_importances_

# Calculate cumulative importances
cumulative_importances = np.cumsum(importances)

# Find number of features for cumulative importance of 95%
num_features = np.where(cumulative_importances > 0.95)[0][0] + 1

# Extract the names of the most important features
feature_names = list(X.columns)
important_feature_names = [feature_names[i] for i in np.argsort(importances)[-num_features:]]

# Extract the most important features
important_features = X_encoded[:, np.argsort(importances)[-num_features:]]

In [3]:
# Extract the most important features
important_features = X_encoded[:, np.argsort(importances)[-num_features:]]

# Handle the skewed nature of the data by over sampeling on class to match the training points for both the classes, This will help the model pick up the patterns better for both the models.
positive_class = np.where(y == '>50K')[0]
negative_class = np.where(y == '<=50K')[0]

# Oversample the minority class
negative_class = np.random.choice(negative_class, size=len(positive_class), replace=True)

# Combine the positive and negative classes
X_train = np.concatenate([important_features[positive_class], important_features[negative_class]], axis=0)
y_train = np.concatenate([y[positive_class], y[negative_class]], axis=0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model using the most important features
rf = RandomForestClassifier(n_estimators=5000, random_state=84)
rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Calculate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Print the accuracy score and the classification report
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8460312400382531
Classification Report:
 {'<=50K': {'precision': 0.8506700701978303, 'recall': 0.8426042983565107, 'f1-score': 0.8466179739599873, 'support': 1582.0}, '>50K': {'precision': 0.8414012738853504, 'recall': 0.8495176848874598, 'f1-score': 0.84544, 'support': 1555.0}, 'accuracy': 0.8460312400382531, 'macro avg': {'precision': 0.8460356720415902, 'recall': 0.8460609916219852, 'f1-score': 0.8460289869799936, 'support': 3137.0}, 'weighted avg': {'precision': 0.8460755600716249, 'recall': 0.8460312400382531, 'f1-score': 0.8460340563610775, 'support': 3137.0}}


In [4]:
# testing the performance of the new model on unknown dataset

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the new dataset
test_df = pd.read_csv("adult.test", names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"], skipinitialspace=True)
test_df.drop("race", axis=1, inplace=True)

# Split the new dataset into features and target
X_new = test_df.drop('income', axis=1)
y_new = test_df['income']
y_new = y_new.str.replace('.', '')

encoder_new = OrdinalEncoder()
input_data_encoded = encoder_new.fit_transform(X_new)
# Make a prediction using the trained model
prediction = rf.predict(input_data_encoded)

print(accuracy_score(y_new, prediction))
print(classification_report(y_new, prediction, output_dict=True))

0.7639579878385848
{'<=50K': {'precision': 0.764141662567634, 'recall': 0.999437072778448, 'f1-score': 0.8660928952228301, 'support': 12435.0}, '>50K': {'precision': 0.5882352941176471, 'recall': 0.0026001040041601664, 'f1-score': 0.0051773233238415735, 'support': 3846.0}, 'accuracy': 0.7639579878385848, 'macro avg': {'precision': 0.6761884783426406, 'recall': 0.5010185883913041, 'f1-score': 0.4356351092733358, 'support': 16281.0}, 'weighted avg': {'precision': 0.7225879562192126, 'recall': 0.7639579878385848, 'f1-score': 0.6627220156992437, 'support': 16281.0}}


In [5]:
"""
Given the fianl testing results I am looking at a reduced performance of ~-8%.

The model is overfitted in some capacity as clearly it has a low precision of ~59% when it come to detecting the ">50K" class and that degradation is high when it come to detecting the <=50K.

To fix this I applied increased over sampling of the ">50K" class to match it against the "<=50K" class, and while this did result in improved performance it was less than anticipated. I am still looking better ways to train models on skewed datasets.

For both the classes I was trying to find rules in this dataset but came up empty handed. 
"""