In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from itertools import product
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report

In [28]:
# (a) Load the given dataset.

data = pd.read_csv('adult.data')
data.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [29]:
#adding header names for better practices

attribute_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                   'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                   'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 
                   'income'] #using the data definition from the UCI website

data.columns = attribute_names

print(data.head())

   age          workclass  fnlwgt   education  education-num   
0   50   Self-emp-not-inc   83311   Bachelors             13  \
1   38            Private  215646     HS-grad              9   
2   53            Private  234721        11th              7   
3   28            Private  338409   Bachelors             13   
4   37            Private  284582     Masters             14   

        marital-status          occupation    relationship    race      sex   
0   Married-civ-spouse     Exec-managerial         Husband   White     Male  \
1             Divorced   Handlers-cleaners   Not-in-family   White     Male   
2   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
3   Married-civ-spouse      Prof-specialty            Wife   Black   Female   
4   Married-civ-spouse     Exec-managerial            Wife   White   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0             0             0              13   United-States   <=50

In [30]:
# (b) Exhibit the first few rows of the dataset and show the count of instances and descriptive features in
# the original data.

num_instances, num_features = data.shape
print("\nNumber of instances:", num_instances)
print("Number of descriptive features:", num_features)


Number of instances: 32560
Number of descriptive features: 15


In [31]:
# (c) Eliminate instances containing missing values.

data.replace(' ?', np.nan, inplace=True) #adding space before question mark as all the values have a space before them.

# Drop rows with missing values
data.dropna(inplace=True)

# Display the updated instance count
updated_instance_count = len(data)
print("Updated instance count after removing rows with missing values:", updated_instance_count)

Updated instance count after removing rows with missing values: 30161


In [32]:
# (d) The class feature, INCOME, has two categorical values: Alter the target
# feature to binary 0/1, although it s generally not a requisite for the Gradient Boosting algorithm.

data['income'] = data['income'].map({' <=50K': 0, ' >50K': 1})
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [33]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# We know these are categorical data coloumns from the definition of dataset from the website: https://archive.ics.uci.edu/dataset/2/adult
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex' ,'native-country']

for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

# Encode the target variable 'income'
data['income'] = label_encoder.fit_transform(data['income'])

# Display the first few rows of the updated dataset
print("First few rows after Label Encoding:")
print(data.head())


First few rows after Label Encoding:
   age  workclass  fnlwgt  education  education-num  marital-status   
0   50          4   83311          9             13               2  \
1   38          2  215646         11              9               0   
2   53          2  234721          1              7               2   
3   28          2  338409          9             13               2   
4   37          2  284582         12             14               2   

   occupation  relationship  race  sex  capital-gain  capital-loss   
0           3             0     4    1             0             0  \
1           5             1     4    1             0             0   
2           5             0     2    1             0             0   
3           9             5     2    0             0             0   
4           3             5     4    0             0             0   

   hours-per-week  native-country  income  
0              13              38       0  
1              40          

In [34]:
# (g) Split the data for model training and testing, allocating 30% for testing and the remaining 70% for
# training.

x = data.drop('income', axis = 1) #extracting features
y = data['income'] #extracting target variable


X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

print(f'{X_train.shape} {y_train.shape} {X_test.shape} {y_test.shape}')

(21112, 14) (21112,) (9049, 14) (9049,)


In [35]:
# Applying Gradient Boosting classifier

# Define the values for hyper-parameters
n_estimators_values = [5, 10, 50]  # Number of decision trees
learning_rate_values = [0.01, 0.05, 0.1]

# Initialize a dictionary to store the results
results = {}

# Iterate through all combinations of hyper-parameters
for n_estimators, learning_rate in product(n_estimators_values, learning_rate_values):
    # Initialize and train the Gradient Boosting classifier
    gb_classifier = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)
    gb_classifier.fit(X_train, y_train)

    # Predict on the test set
    y_pred = gb_classifier.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store the accuracy for this combination of hyper-parameters
    results[(n_estimators, learning_rate)] = accuracy

# Print the results
for params, accuracy in results.items():
    print(f"Hyperparameters: n_estimators={params[0]}, learning_rate={params[1]}, Accuracy={accuracy:.4f}")


Hyperparameters: n_estimators=5, learning_rate=0.01, Accuracy=0.7538
Hyperparameters: n_estimators=5, learning_rate=0.05, Accuracy=0.7538
Hyperparameters: n_estimators=5, learning_rate=0.1, Accuracy=0.7993
Hyperparameters: n_estimators=10, learning_rate=0.01, Accuracy=0.7538
Hyperparameters: n_estimators=10, learning_rate=0.05, Accuracy=0.7990
Hyperparameters: n_estimators=10, learning_rate=0.1, Accuracy=0.8343
Hyperparameters: n_estimators=50, learning_rate=0.01, Accuracy=0.7990
Hyperparameters: n_estimators=50, learning_rate=0.05, Accuracy=0.8494
Hyperparameters: n_estimators=50, learning_rate=0.1, Accuracy=0.8583


In [36]:
# (h) performing grid search

# Define the values for hyper-parameters
param_grid = {
    'n_estimators': [5, 10, 50],  # Number of decision trees
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Initialize the GridSearchCV
grid_search = GridSearchCV(gb_classifier, param_grid, cv=3, scoring=make_scorer(accuracy_score))

# Perform the grid search using the training data
grid_search.fit(X_train, y_train)

# Print the best hyper-parameters and the corresponding accuracy
print("Best Hyperparameters:")
print(grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Best Hyperparameters:
{'learning_rate': 0.1, 'n_estimators': 50}
Best Accuracy: 0.8546324291463158


In [37]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test data
y_test_pred = best_model.predict(X_test)

# Calculate accuracy on the test data
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the accuracy on the test data
print("Accuracy on the test data:", test_accuracy)


Accuracy on the test data: 0.8583268869488341


In [38]:

# Get the results of the grid search
results_df = pd.DataFrame(grid_search.cv_results_)

# Display relevant information for each parameter combination
print("Parameter combinations and corresponding scores:")
for i in range(len(results_df)):
    params = results_df.loc[i, 'params']
    mean_test_score = results_df.loc[i, 'mean_test_score']
    std_test_score = results_df.loc[i, 'std_test_score']
    rank_test_score = results_df.loc[i, 'rank_test_score']
    
    print(f"Parameters: {params}, Mean Test Score: {mean_test_score:.4f}, "
          f"Standard Deviation of Test Scores: {std_test_score:.4f}, "
          f"Rank Test Score: {rank_test_score}")


Parameter combinations and corresponding scores:
Parameters: {'learning_rate': 0.01, 'n_estimators': 5}, Mean Test Score: 0.7499, Standard Deviation of Test Scores: 0.0000, Rank Test Score: 7
Parameters: {'learning_rate': 0.01, 'n_estimators': 10}, Mean Test Score: 0.7499, Standard Deviation of Test Scores: 0.0000, Rank Test Score: 7
Parameters: {'learning_rate': 0.01, 'n_estimators': 50}, Mean Test Score: 0.7957, Standard Deviation of Test Scores: 0.0010, Rank Test Score: 6
Parameters: {'learning_rate': 0.05, 'n_estimators': 5}, Mean Test Score: 0.7499, Standard Deviation of Test Scores: 0.0000, Rank Test Score: 7
Parameters: {'learning_rate': 0.05, 'n_estimators': 10}, Mean Test Score: 0.7957, Standard Deviation of Test Scores: 0.0008, Rank Test Score: 5
Parameters: {'learning_rate': 0.05, 'n_estimators': 50}, Mean Test Score: 0.8453, Standard Deviation of Test Scores: 0.0030, Rank Test Score: 2
Parameters: {'learning_rate': 0.1, 'n_estimators': 5}, Mean Test Score: 0.7958, Standard 

In [39]:
# (j) Present the performance report of the model with the superior parameter setting, incorporating
# metrics such as accuracy, precision, recall, F1-score, etc.

y_test_pred = best_model.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_test_pred, target_names=['class 0', 'class 1'])

# Print the classification report
print("Performance Report for the Best Model:")
print(report)


Performance Report for the Best Model:
              precision    recall  f1-score   support

     class 0       0.87      0.95      0.91      6821
     class 1       0.79      0.58      0.67      2228

    accuracy                           0.86      9049
   macro avg       0.83      0.76      0.79      9049
weighted avg       0.85      0.86      0.85      9049

