Step 1: Clean up the dataset

In [11]:
# Load the datasets 
import pandas as pd

# The column names are not included in the files, so we use the description from the assignment
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
]

# Load the training data
train_data_path = "/home/szende/Spring-2024/Data-Mining/Assignment2/adult.data"
train_data = pd.read_csv(train_data_path, names=columns, na_values="?", skipinitialspace=True)

# Load the test data
test_data_path = "/home/szende/Spring-2024/Data-Mining/Assignment2/adult.test"
# Skip the first line as it's not part of the data (based on common structure of this dataset)
test_data = pd.read_csv(test_data_path, names=columns, na_values="?", skipinitialspace=True, skiprows=1)

# Display the first few rows of each dataset and count missing values
train_missing = train_data.isnull().sum()
test_missing = test_data.isnull().sum()

(train_missing, test_missing, train_data.head(), test_data.head())


(age                  0
 workclass         1836
 fnlwgt               0
 education            0
 education_num        0
 marital_status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital_gain         0
 capital_loss         0
 hours_per_week       0
 native_country     583
 income               0
 dtype: int64,
 age                 0
 workclass         963
 fnlwgt              0
 education           0
 education_num       0
 marital_status      0
 occupation        966
 relationship        0
 race                0
 sex                 0
 capital_gain        0
 capital_loss        0
 hours_per_week      0
 native_country    274
 income              0
 dtype: int64,
    age         workclass  fnlwgt  education  education_num  \
 0   39         State-gov   77516  Bachelors             13   
 1   50  Self-emp-not-inc   83311  Bachelors             13   
 2   38           Private  215646    HS-grad              9   
 3   53      

After the data cleaning process

In [12]:
# Impute missing values with mode for categorical columns
for column in ['workclass', 'occupation', 'native_country']:
    train_mode = train_data[column].mode()[0]
    test_mode = test_data[column].mode()[0]

    train_data[column].fillna(train_mode, inplace=True)
    test_data[column].fillna(test_mode, inplace=True)

# Verify the cleaning by checking the number of missing values again
train_missing_after = train_data.isnull().sum()
test_missing_after = test_data.isnull().sum()

(train_missing_after, test_missing_after)


(age               0
 workclass         0
 fnlwgt            0
 education         0
 education_num     0
 marital_status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital_gain      0
 capital_loss      0
 hours_per_week    0
 native_country    0
 income            0
 dtype: int64,
 age               0
 workclass         0
 fnlwgt            0
 education         0
 education_num     0
 marital_status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital_gain      0
 capital_loss      0
 hours_per_week    0
 native_country    0
 income            0
 dtype: int64)

Step 2: Implement a Classification Method (Random Forest)

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# train_data and test_data are already loaded and cleaned as per the previous steps

# Define the categorical and continuous features
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
continuous_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Separate the features and the target variable
X_train_continuous = train_data[continuous_features]
X_train_categorical = train_data[categorical_features]
X_test_continuous = test_data[continuous_features]
X_test_categorical = test_data[categorical_features]

# Encode the target variable
y_train = (train_data['income'].apply(lambda x: ">50K" in x)).astype(int)
y_test = (test_data['income'].apply(lambda x: ">50K." in x)).astype(int)

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train_categorical)

# Transform the categorical features
X_train_categorical_encoded = encoder.transform(X_train_categorical)
X_test_categorical_encoded = encoder.transform(X_test_categorical)

# Combine continuous and categorical features
X_train_prepared = np.hstack((X_train_continuous, X_train_categorical_encoded))
X_test_prepared = np.hstack((X_test_continuous, X_test_categorical_encoded))

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train_prepared, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test_prepared)

# Calculate the error rate
error_rate = 1 - accuracy_score(y_test, y_pred)

print(f"Classification error rate (as a decimal):  {error_rate}")

# Calculate the number of correct and incorrect predictions
num_correct_predictions = accuracy_score(y_test, y_pred, normalize=False)
num_incorrect_predictions = len(y_test) - num_correct_predictions

print(f"Classification error rate (percentage): {error_rate * 100:.2f}%")
print(f"Number of correct predictions: {num_correct_predictions}")
print(f"Number of incorrect predictions: {num_incorrect_predictions}")
print(f"Total number of instances in the test dataset: {len(y_test)}")

Classification error rate (as a decimal):  0.14882378232295312
Classification error rate (percentage): 14.88%
Number of correct predictions: 13858
Number of incorrect predictions: 2423
Total number of instances in the test dataset: 16281


Step 3: Classification Error Rate & Proposed Improvements

Step 3.2: Hyperparameter Tuning

In [14]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],  # Example: try with 100 and 200 trees
    'max_depth': [None, 10, 20],  # None means no limit; also limiting depth to prevent overfitting
    'max_features': ['sqrt', 'log2']  # Number of features to consider for each split
}

# Initialize GridSearchCV
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train_prepared, y_train)

# Best parameter set
print("Best parameters:", grid_search.best_params_)

# Train a new classifier using the best parameters
best_rf_classifier = grid_search.best_estimator_

# Predict on the test set with the tuned model
y_pred_best = best_rf_classifier.predict(X_test_prepared)

# Calculate the new error rate
new_error_rate = 1 - accuracy_score(y_test, y_pred_best)
print(f"New classification error rate (as a decimal): {new_error_rate}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   3.8s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   3.8s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   3.8s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   3.8s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   3.8s
[CV] END max_depth=None, max_features=log2, n_estimators=100; total time=   1.6s
[CV] END max_depth=None, max_features=log2, n_es

Step 4: Randomly downsampling and Re-training the dataset

In [15]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# X_train_prepared, y_train, and X_test_prepared are correctly set up as before
# Initialize variables to store results
sampling_rates = [50, 60, 70, 80, 90]
error_rates = {rate: [] for rate in sampling_rates}

# Loop over each sampling rate
for rate in sampling_rates:
    for iteration in range(5):  # Repeat the process 5 times for each rate
        # Create StratifiedShuffleSplit instance with different random states
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1-(rate/100), random_state=42+iteration)
        
        for train_index, _ in sss.split(X_train_prepared, y_train):
            # Generate the down-sampled dataset
            X_train_downsampled, y_train_downsampled = X_train_prepared[train_index], y_train[train_index]
            
            # Initialize and train the Random Forest classifier on the down-sampled dataset
            rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=20, max_features='sqrt', random_state=42)
            rf_classifier.fit(X_train_downsampled, y_train_downsampled)
            
            # Predict on the test set
            y_pred = rf_classifier.predict(X_test_prepared)
            
            # Calculate and record the error rate
            error_rate = 1 - accuracy_score(y_test, y_pred)
            error_rates[rate].append(error_rate)

# Calculate the mean and standard deviation of the error rates for each sampling rate
mean_error_rates = {rate: np.mean(errors) for rate, errors in error_rates.items()}
std_dev_error_rates = {rate: np.std(errors) for rate, errors in error_rates.items()}

for rate in sampling_rates:
    print(f"Sampling Rate: {rate}% - Mean Error Rate: {mean_error_rates[rate]:.4f}, Standard Deviation: {std_dev_error_rates[rate]:.4f}")


Sampling Rate: 50% - Mean Error Rate: 0.1390, Standard Deviation: 0.0014
Sampling Rate: 60% - Mean Error Rate: 0.1381, Standard Deviation: 0.0004
Sampling Rate: 70% - Mean Error Rate: 0.1378, Standard Deviation: 0.0004
Sampling Rate: 80% - Mean Error Rate: 0.1371, Standard Deviation: 0.0006
Sampling Rate: 90% - Mean Error Rate: 0.1365, Standard Deviation: 0.0004


Step 5: Proposed Single Classifier that outperform other classic classifiers (Decision Tree)

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Define the categorical and continuous features
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
continuous_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Encode the target variable
y_train = train_data['income'].apply(lambda x: 1 if ">50K" in x else 0)
y_test = test_data['income'].apply(lambda x: 1 if ">50K." in x else 0)

# Preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ])

# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Hyperparameters distribution to sample from
param_distributions = {
    'classifier__max_depth': randint(3, 20),  
    'classifier__min_samples_split': randint(2, 20), 
    'classifier__min_samples_leaf': randint(1, 20), 
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_distributions,
    n_iter=100,  
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# X_train and X_test are already defined
# Fit the model
random_search.fit(train_data.drop('income', axis=1), y_train)

# Print the best parameters and the corresponding score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation accuracy: ", random_search.best_score_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(test_data.drop('income', axis=1))

# Calculate accuracy and error rate
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy

print(f"Optimized Decision Tree Test Accuracy: {accuracy:.4f}")
print(f"Optimized Decision Tree Test Error Rate: {error_rate:.4f}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END classifier__max_depth=9, classifier__min_samples_leaf=15, classifier__min_samples_split=12; total time=   0.2s
[CV] END classifier__max_depth=9, classifier__min_samples_leaf=15, classifier__min_samples_split=12; total time=   0.2s
[CV] END classifier__max_depth=9, classifier__min_samples_leaf=15, classifier__min_samples_split=12; total time=   0.2s
[CV] END classifier__max_depth=9, classifier__min_samples_leaf=15, classifier__min_samples_split=12; total time=   0.2s
[CV] END classifier__max_depth=9, classifier__min_samples_leaf=15, classifier__min_samples_split=12; total time=   0.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=7, classifier__min_samples_split=12; total time=   0.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=7, classifier__min_samples_split=12; total time=   0.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=7, classifier__min_samples_split=