In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [2]:
df = pd.read_csv('df1.csv')
df.head()

Unnamed: 0,age,fnlwgt,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income_>50K
0,25,226802,40,False,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,38,89814,50,False,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,28,336951,40,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,44,160323,40,False,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,18,103497,30,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [3]:
# Determining the target variable and features, and scaling the features
X = df.drop('income_>50K', axis=1)
y = df['income_>50K']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## The Low Performance Models:

### 1. Decision Tree:

In [5]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Train the Decision Tree model
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predictions
y_pred = decision_tree.predict(X_test)
y_pred_proba = decision_tree.predict_proba(X_test)[:, 1]

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba)
cv_scores = cross_val_score(decision_tree, X, y, cv=5, scoring='accuracy')

print(f"Decision Tree - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}, ROC-AUC Score: {roc_auc:.2f}, Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

Decision Tree - Accuracy: 0.77, Precision: 0.77, Recall: 0.77, F1-score: 0.77, ROC-AUC Score: 0.69, Cross-Validation Accuracy: 0.78 ± 0.00


In [6]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters for Decision Tree:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)


Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Best cross-validation accuracy: 0.8253739614497471


### 2. Support Vector Machine (SVM):

In [None]:
from sklearn.svm import SVC

# Support Vector Machine (SVM)
svm = SVC(probability=True)  # Add probability=True to enable probability estimates for ROC-AUC
svm.fit(X_train, y_train)

# Evaluate the model
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
cv_scores = cross_val_score(svm, X, y, cv=5, scoring='accuracy')

print(f"Support Vector Machine (SVM) - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}, Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

In [None]:
from sklearn.svm import SVC

# Reduced Parameter Grid
param_dist = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Randomized Search with Reduced Grid and 3-Fold CV
random_search = RandomizedSearchCV(
    SVC(probability=True),
    param_distributions=param_dist,
    n_iter=20,  # Number of parameter settings that are sampled
    cv=3,  # Use 3-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    random_state=42,
    verbose=2
)

random_search.fit(X_train, y_train)

print("Best cross-validation accuracy:", random_search.best_score_)


### 4. Neural Networks (Deep Learning):

In [4]:
from sklearn.neural_network import MLPClassifier

# Neural Networks (Deep Learning)
neural_network = MLPClassifier(max_iter=1000)
neural_network.fit(X_train, y_train)

# Evaluate the model
y_pred = neural_network.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
cv_scores = cross_val_score(neural_network, X, y, cv=5, scoring='accuracy')

print(f"Neural Networks (Deep Learning) - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}, Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

Neural Networks (Deep Learning) - Accuracy: 0.82, Precision: 0.81, Recall: 0.82, F1-score: 0.81, Cross-Validation Accuracy: 0.66 ± 0.20


In [None]:
from sklearn.neural_network import MLPClassifier
# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

# Randomized Search
random_search = RandomizedSearchCV(
    MLPClassifier(max_iter=1000),
    param_distributions=param_dist,
    n_iter=20,  # Number of parameter settings that are sampled
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    random_state=42,
    verbose=2
)

random_search.fit(X_train, y_train)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Neural Networks:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.2f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.2f}")
print(f"F1-score: {f1_score(y_test, y_pred, average='weighted'):.2f}")
print("Best cross-validation accuracy:", random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


