In [None]:
# 1. Import Libraries
import os

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# 2. Load the Dataset
# The location of the data file differs per machine, so it can be overridden with the
# ADULT_DATA_PATH environment variable; the original local path stays as the default.
file_path = os.environ.get('ADULT_DATA_PATH', r'C:\Users\ewcub\Downloads\adult\adult.data')
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                'hours-per-week', 'native-country', 'income']
# header=None: the first line is read as data, so the column names are supplied here.
# skipinitialspace=True: spaces that follow a delimiter are dropped from the values.
data = pd.read_csv(file_path, header=None, names=column_names, skipinitialspace=True)

# 3. Create New Features
# Net capital: gains minus losses.
data['net_capital'] = data['capital-gain'] - data['capital-loss']

# Interaction term: age multiplied by years of education.
data['age_education_interaction'] = data['age'] * data['education-num']

# Polynomial Features: degree-2 expansion of `age` with the bias column left out.
# NOTE(review): with a single input column the expansion is [age, age^2], so
# `age_poly_0` repeats `age` -- confirm that duplicate is wanted.
poly = PolynomialFeatures(degree=2, include_bias=False)
age_poly = poly.fit_transform(data[['age']])
age_poly_columns = [f'age_poly_{idx}' for idx, _ in enumerate(age_poly.T)]
age_poly_frame = pd.DataFrame(age_poly, columns=age_poly_columns)
data = data.join(age_poly_frame)

# Binning Numerical Variables: both columns share the same edges and labels.
bin_edges = [0, 20, 40, 60, 80, 100]
bin_labels = ['0-20', '21-40', '41-60', '61-80', '81-100']
for binned_name, source_name in (('age_bin', 'age'), ('hours_per_week_bin', 'hours-per-week')):
    data[binned_name] = pd.cut(data[source_name], bins=bin_edges, labels=bin_labels)

# 4. Preprocessing: Encode Categorical Variables (Including New Bins)
# One-hot encode the raw categorical columns together with the two binned columns;
# every column not listed here is passed through unchanged.
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country', 'age_bin', 'hours_per_week_bin',
]
data_encoded = pd.get_dummies(data, columns=categorical_columns)


# 5. Define Features (X) and Target Variable (y)
X = data_encoded.drop('income', axis=1)
y = data_encoded['income']

# 6. Split the Dataset into Training and Testing Sets
# Stratify on the target so the 20% hold-out keeps the same class proportions as the
# training rows, consistent with the StratifiedKFold used for the grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Feature Scaling
scaler = StandardScaler()
numerical_features = [
    'age', 'fnlwgt', 'education-num', 'capital-gain',
    'capital-loss', 'hours-per-week', 'net_capital'
]
# train_test_split hands back row subsets of X. Take explicit copies so the column
# assignments below write to independent frames rather than to a possible view of X
# (which is what pandas' SettingWithCopyWarning complains about).
X_train = X_train.copy()
X_test = X_test.copy()
# Learn the scaling statistics from the training rows only and reuse them for the
# test rows, so nothing about the test set leaks into the fit.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# 8. Feature Selection
# Rank the columns with a default random forest fitted on the training split and keep
# those whose importance is strictly greater than 0.01.
feature_model = RandomForestClassifier(random_state=42)
feature_model.fit(X_train, y_train)
importances = feature_model.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importances})
is_important = feature_importances['importance'] > 0.01
selected_features = feature_importances.loc[is_important, 'feature']
X_train_selected, X_test_selected = X_train[selected_features], X_test[selected_features]

# 9. Initialize RandomForestClassifier
# Fresh, unfitted estimator that serves as the template for the hyperparameter search.
clf = RandomForestClassifier(random_state=42)

# 10. Progress Bar Setup for GridSearchCV
class TqdmCallback:
    """Callable that mirrors an externally supplied iteration count on a tqdm bar.

    Instances also work as context managers, so the bar is closed even when the
    wrapped work raises::

        with TqdmCallback(total=10) as report:
            report(3)

    NOTE(review): nothing in the cells visible here instantiates this class.
    """

    def __init__(self, total):
        """Create the progress bar.

        Args:
            total: Expected number of iterations, forwarded to ``tqdm(total=...)``.
        """
        self.progress_bar = tqdm(total=total)

    def __call__(self, iteration, *args, **kwargs):
        """Set the bar's position to ``iteration`` and redraw it.

        Any further positional or keyword arguments are accepted and ignored.
        """
        self.progress_bar.n = iteration
        self.progress_bar.refresh()

    def close(self):
        """Close the underlying progress bar."""
        self.progress_bar.close()

    def __enter__(self):
        """Return the callback itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the bar on exit; exceptions from the body are not suppressed."""
        self.close()
        return False

# 11. Initialize and Configure GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
# The grid holds 3 * 3 * 3 * 3 = 81 candidates, i.e. 405 cross-validation fits with
# 5 folds. n_jobs=-1 spreads those fits over all available CPU cores. The selected
# model is unchanged because the estimator has a fixed random_state and the folds
# are not shuffled.
grid_search = GridSearchCV(
    clf,
    param_grid,
    cv=StratifiedKFold(n_splits=5),
    scoring='accuracy',
    n_jobs=-1,
    verbose=0
)

# Fit the model
grid_search.fit(X_train_selected, y_train)

# 12. Retrieve the Best Estimator and Its Hyperparameters
best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# 13. Make Predictions on the Test Set Using the Best Model
y_pred = best_clf.predict(X_test_selected)

# 14. Calculate Accuracy (fraction of test rows predicted correctly, shown as a percentage)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Extract feature importances from the forest fitted in the feature-selection step.
# That model was trained on this same X_train / y_train with a fixed random_state, so
# fitting it again here would only repeat the work and reproduce the same values.
importances = feature_model.feature_importances_

# Rank features from most to least important. A new sorted frame is built instead of
# sorting in place, so re-running the cell always starts from the same state.
feature_importances = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances}
).sort_values('importance', ascending=False)

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importances.head(20))
plt.title('Top 20 important features')
plt.show()


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Predict class labels, plus the probability of the second class in best_clf.classes_
# (column index 1 of predict_proba), which is the score used for ROC-AUC below.
y_pred = best_clf.predict(X_test_selected)
class_probabilities = best_clf.predict_proba(X_test_selected)
y_pred_proba = class_probabilities[:, 1]

# Classification Report: per-class precision, recall and F1
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

# Confusion Matrix: rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC-AUC Score
auc_value = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC Score:", auc_value)


In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validated accuracy of the tuned model
# `clf` is only the untuned template that was handed to GridSearchCV, and `X` holds every
# encoded column, so scoring that pair would not describe the model evaluated above.
# Score the tuned estimator on the selected features instead. cross_val_score clones
# the estimator for each fold, so best_clf itself is left untouched. `X` is unscaled,
# which does not matter for tree splits.
# NOTE(review): the hyperparameters and the feature list were chosen on the training
# split, so folds containing those rows are mildly optimistic -- confirm this is the
# evaluation that is wanted here.
cv_accuracy = cross_val_score(best_clf, X[selected_features], y, cv=5, scoring='accuracy')
print("Cross-Validated Accuracy Scores:", cv_accuracy)
print("Mean CV Accuracy:", cv_accuracy.mean())
