In [6]:
###Importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [21]:
###Load the spreadsheet, derive a binary target, then split and standardise the predictors.
# NOTE(review): absolute, machine-specific path - the cell only runs where this file exists.
df = pd.read_excel(
    '/Users/souradeepchattopadhyay/Documents/neuripsbehaviorml/data1.xlsx'
)

# Drop every row whose entry in the first ('Unnamed: 0') column begins with 'pp'.
data = df.loc[~df['Unnamed: 0'].str.startswith('pp')]

# Binary target: 1 when the rating is at or above the median rating, otherwise 0.
y = data['RecommendHiring']
y1 = y.ge(y.median()).astype(int)

# Predictors are picked by position: the five columns at positions 1 through 5.
X = data.iloc[:, list(range(1, 6))]

# Hold out 20% of the rows; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y1,
    test_size=0.2,
    random_state=2,
)

# The full-data copy is standardised with its own statistics, independent of scaler_x.
X_scaled = StandardScaler().fit_transform(X)

# scaler_x learns mean/variance from the training rows only and is reused for the test rows.
scaler_x = StandardScaler().fit(X_train)
X_train_scaled = scaler_x.transform(X_train)
X_test_scaled = scaler_x.transform(X_test)

In [22]:
##Fitting random forest with hyperparameter tuning.
rf = RandomForestClassifier(random_state=2)

# Hyperparameter grid explored exhaustively by the search below.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4]
    #'max_features': ['sqrt', 'log2']
}

# 5-fold grid search on the training split; the refitted winner becomes best_rf.
# NOTE(review): the search still receives features scaled on the whole training
# split, so its inner folds share scaling statistics - confirm whether that matters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0)
grid_search.fit(X_train_scaled, y_train)
best_rf = grid_search.best_estimator_

# Accuracy of the tuned forest on the rows it was trained on and on the held-out rows.
y_train_pred = best_rf.predict(X_train_scaled)
training_accuracy = accuracy_score(y_train, y_train_pred)
y_test_pred = best_rf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_test_pred)

# Cross-validate the tuned configuration on the raw predictors with the scaler
# inside the pipeline, so each fold standardises using only its own training rows.
# Feeding pre-scaled full data here would let held-out rows shape the preprocessing.
# cross_val_score works on clones, so the fitted best_rf above is left untouched.
cv_pipeline = make_pipeline(StandardScaler(), best_rf)
cv_scores = cross_val_score(cv_pipeline, X, y1, cv=5)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean():.2f}")
print(f"SD of cross-validation accuracy: {cv_scores.std():.2f}")
print(f"Training set accuracy: {training_accuracy:.2f}")
print(f"Test set accuracy: {accuracy:.2f}")

Cross-validation scores: [0.57142857 0.57142857 0.64285714 0.85714286 0.53846154]
Mean cross-validation accuracy: 0.64
SD of cross-validation accuracy: 0.12
Training set accuracy: 0.69
Test set accuracy: 0.71


In [23]:
#Ranking the predictors by the importance scores of the tuned random forest.
importances = best_rf.feature_importances_

# Pair each training column name with its score, then order from most to least important.
unsorted_importances = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': importances,
    }
)
importance_df = unsorted_importances.sort_values(by='Importance', ascending=False)
importance_df

Unnamed: 0,Feature,Importance
3,Agreeableness,0.345568
2,Extraversion,0.251476
0,Openness to Experience,0.220073
1,Conscientiousness,0.162355
4,Neuroticism,0.020528
