In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data = pd.read_csv("preprocessed.csv")

In [3]:
data.head()

Unnamed: 0,age,le_workclass,le_education,Education_Num,le_occupation,le_race,le_sex,Capital_Gain,Capital_Loss,Hours_per_Week,le_country,le_income
0,39,5,9,13,0,4,1,2174,0,40,38,0
1,50,4,9,13,3,4,1,0,0,13,38,0
2,38,2,11,9,5,4,1,0,0,40,38,0
3,53,2,1,7,5,2,1,0,0,40,38,0
4,28,2,9,13,9,2,0,0,0,40,4,0


In [4]:
features = data[["age", "le_workclass", "le_education", "Education_Num", "le_occupation", 
               "le_race", "le_sex", "Capital_Gain", "Capital_Loss", "Hours_per_Week", 
               "le_country"]]
target = data["le_income"]

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [6]:
# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on tain set
y_pred = model.predict(X_test)

# Calculate prediction accuracy
prediction_accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", prediction_accuracy)

# Predict on training set
y_train_pred = model.predict(X_train)

# Calculate training accuracy
training_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", training_accuracy)


Prediction Accuracy: 0.8278606965174129
Training Accuracy: 0.9570998147994583


In [7]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest Classifier
model = RandomForestClassifier(random_state=42)

# Initialize Grid Search Cross Validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')

# Perform Grid Search Cross Validation
grid_search.fit(X_train, y_train)

# Get the best parameters found by Grid Search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Predict on the testing set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [8]:
# Calculate prediction accuracy
prediction_accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", prediction_accuracy)

# Calculate training accuracy
training_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", training_accuracy)

Prediction Accuracy: 0.8444444444444444
Training Accuracy: 0.9570998147994583


In [9]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90      6745
           1       0.79      0.53      0.64      2300

    accuracy                           0.84      9045
   macro avg       0.82      0.74      0.77      9045
weighted avg       0.84      0.84      0.83      9045

Confusion Matrix:
 [[6414  331]
 [1076 1224]]
