# 6. Feature Selection by Random Forest Regression

The feature importance obtained through the random forest model can help us screen out the most useful features, thereby simplifying the model, reducing overfitting, and improving computational efficiency.

## Part a: Data import
First, load the data from CSV files, and get data as features and label.

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/processed/merged_data_Y2.csv')
df_y3 = pd.read_csv('../data/processed/merged_data_Y3.csv')
df

Unnamed: 0,MemberID,AgeAtFirstClaim,Male,Female,Unknown,ClaimsCount,Unique_ProviderID,Unique_Vendor,Unique_PCP,PayDelay,...,3- 4 months,4- 5 months,5- 6 months,6- 7 months,7- 8 months,8- 9 months,9-10 months,DrugCount,LabCount,DaysInHospital
0,210,35.0,0.0,0.0,1.0,6.0,3.0,3.0,1.0,308.0,...,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0
1,3197,5.0,0.0,1.0,0.0,5.0,4.0,3.0,1.0,148.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0
2,11951,15.0,0.0,1.0,0.0,11.0,8.0,8.0,1.0,608.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1
3,14661,5.0,0.0,0.0,1.0,12.0,4.0,4.0,1.0,637.0,...,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,5.0,0
4,14778,35.0,0.0,1.0,0.0,8.0,6.0,5.0,2.0,320.0,...,5.0,0.0,0.0,1.0,0.0,0.0,0.0,13.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51961,99971678,65.0,0.0,1.0,0.0,16.0,5.0,4.0,1.0,553.0,...,3.0,1.0,0.0,0.0,3.0,0.0,0.0,10.0,13.0,0
51962,99973127,80.0,0.0,0.0,1.0,14.0,6.0,4.0,1.0,1081.0,...,1.0,0.0,0.0,4.0,0.0,0.0,1.0,22.0,5.0,0
51963,99976647,15.0,1.0,0.0,0.0,4.0,3.0,3.0,1.0,115.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,0
51964,99977491,65.0,0.0,1.0,0.0,6.0,2.0,2.0,1.0,235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,0.0,0


In [2]:
# Remove columns not used for prediction
X = df.drop(columns=['MemberID', 'DaysInHospital'])

# Use DaysInHospital as a label
y = df['DaysInHospital']
X

Unnamed: 0,AgeAtFirstClaim,Male,Female,Unknown,ClaimsCount,Unique_ProviderID,Unique_Vendor,Unique_PCP,PayDelay,Anesthesiology,...,2- 3 months,3- 4 months,4- 5 months,5- 6 months,6- 7 months,7- 8 months,8- 9 months,9-10 months,DrugCount,LabCount
0,35.0,0.0,0.0,1.0,6.0,3.0,3.0,1.0,308.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,5.0,0.0,1.0,0.0,5.0,4.0,3.0,1.0,148.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0
2,15.0,0.0,1.0,0.0,11.0,8.0,8.0,1.0,608.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,5.0,0.0,0.0,1.0,12.0,4.0,4.0,1.0,637.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,5.0
4,35.0,0.0,1.0,0.0,8.0,6.0,5.0,2.0,320.0,0.0,...,1.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,13.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51961,65.0,0.0,1.0,0.0,16.0,5.0,4.0,1.0,553.0,0.0,...,0.0,3.0,1.0,0.0,0.0,3.0,0.0,0.0,10.0,13.0
51962,80.0,0.0,0.0,1.0,14.0,6.0,4.0,1.0,1081.0,0.0,...,0.0,1.0,0.0,0.0,4.0,0.0,0.0,1.0,22.0,5.0
51963,15.0,1.0,0.0,0.0,4.0,3.0,3.0,1.0,115.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0
51964,65.0,0.0,1.0,0.0,6.0,2.0,2.0,1.0,235.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,0.0


## Part b: Training and evaluating the model before feature selection

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Divide into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate model performance before feature selection
pre_selection_score = rf.score(X_test, y_test)
print(f"Coefficient of Determination (R²) before feature selection: {pre_selection_score}")

Coefficient of Determination (R²) before feature selection: 0.9159130267461997


## Part c: Feature selection based on feature importance

In [4]:
# Select features whose feature importance is higher than the average
selector = SelectFromModel(rf, threshold="mean")
selector.fit(X_train, y_train)

# Get selected features
selected_features = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_features]

# Print the selected feature names
print("Selected features:")
print(selected_feature_names)

# Extract selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

Selected features:
Index(['AgeAtFirstClaim', 'ClaimsCount', 'Unique_ProviderID', 'Unique_Vendor',
       'PayDelay', 'Diagnostic Imaging', 'Emergency', 'General Practice',
       'Internal', 'Surgery', 'Ambulance', 'Independent Lab',
       'Inpatient Hospital', 'Office', 'Urgent Care', 'MSC2a3', 'ROAMI',
       'TRAUMA', '1-2', 'EM', 'MED', 'PL', 'RAD', 'SDS', '0- 1 month',
       'DrugCount', 'LabCount'],
      dtype='object')


## Part d: Model after feature selection

In [5]:
# Retrain the model using the selected features
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selected.fit(X_train_selected, y_train)

# Evaluate model performance after feature selection
post_selection_score = rf_selected.score(X_test_selected, y_test)
print(f"Coefficient of Determination (R²) after feature selection: {post_selection_score}")

Coefficient of Determination (R²) after feature selection: 0.9182220511833751


An increase in R² after removing features usually means that unnecessary, irrelevant, or negatively impacting features have been removed. In this way, the model complexity is reduced, overfitting is avoided, and the generalization ability of the model is improved, making it perform better on the test set. 

## Part e: Save data after feature selection

In [6]:
# apply feature selection on the year 2 data
rf_data_y2 = df[['AgeAtFirstClaim', 'ClaimsCount', 'Unique_ProviderID', 'Unique_Vendor',
       'PayDelay', 'Diagnostic Imaging', 'Emergency', 'General Practice',
       'Internal', 'Surgery', 'Ambulance', 'Independent Lab',
       'Inpatient Hospital', 'Office', 'Urgent Care', 'MSC2a3', 'ROAMI',
       'TRAUMA', '1-2', 'EM', 'MED', 'PL', 'RAD', 'SDS', '0- 1 month',
       'DrugCount', 'LabCount', 'DaysInHospital']]

# store the data
rf_data_y2.to_csv('../data/train_test/rf_data_Y2.csv', index=False)
rf_data_y2

Unnamed: 0,AgeAtFirstClaim,ClaimsCount,Unique_ProviderID,Unique_Vendor,PayDelay,Diagnostic Imaging,Emergency,General Practice,Internal,Surgery,...,1-2,EM,MED,PL,RAD,SDS,0- 1 month,DrugCount,LabCount,DaysInHospital
0,35.0,6.0,3.0,3.0,308.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,4.0,1.0,0.0,0.0,2.0,0.0,1.0,0
1,5.0,5.0,4.0,3.0,148.0,1.0,0.0,0.0,1.0,0.0,...,0.0,2.0,0.0,1.0,1.0,0.0,1.0,3.0,2.0,0
2,15.0,11.0,8.0,8.0,608.0,3.0,3.0,0.0,1.0,1.0,...,0.0,3.0,0.0,1.0,4.0,1.0,7.0,0.0,3.0,1
3,5.0,12.0,4.0,4.0,637.0,1.0,0.0,4.0,0.0,0.0,...,0.0,3.0,3.0,3.0,1.0,0.0,3.0,0.0,5.0,0
4,35.0,8.0,6.0,5.0,320.0,1.0,1.0,3.0,2.0,0.0,...,0.0,6.0,0.0,1.0,1.0,0.0,1.0,13.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51961,65.0,16.0,5.0,4.0,553.0,0.0,0.0,8.0,1.0,0.0,...,0.0,7.0,3.0,5.0,0.0,0.0,8.0,10.0,13.0,0
51962,80.0,14.0,6.0,4.0,1081.0,1.0,0.0,8.0,3.0,0.0,...,10.0,5.0,2.0,3.0,1.0,0.0,3.0,22.0,5.0,0
51963,15.0,4.0,3.0,3.0,115.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,1.0,5.0,3.0,0
51964,65.0,6.0,2.0,2.0,235.0,0.0,0.0,0.0,6.0,0.0,...,3.0,4.0,0.0,0.0,0.0,0.0,5.0,27.0,0.0,0


In [7]:
# apply feature selection on the year 3 data
rf_data_y3 = df_y3[['AgeAtFirstClaim', 'ClaimsCount', 'Unique_ProviderID', 'Unique_Vendor',
       'PayDelay', 'Diagnostic Imaging', 'Emergency', 'General Practice',
       'Internal', 'Surgery', 'Ambulance', 'Independent Lab',
       'Inpatient Hospital', 'Office', 'Urgent Care', 'MSC2a3', 'ROAMI',
       'TRAUMA', '1-2', 'EM', 'MED', 'PL', 'RAD', 'SDS', '0- 1 month',
       'DrugCount', 'LabCount', 'DaysInHospital']]

# store the data
rf_data_y3.to_csv('../data/train_test/rf_data_Y3.csv', index=False)
rf_data_y3

Unnamed: 0,AgeAtFirstClaim,ClaimsCount,Unique_ProviderID,Unique_Vendor,PayDelay,Diagnostic Imaging,Emergency,General Practice,Internal,Surgery,...,1-2,EM,MED,PL,RAD,SDS,0- 1 month,DrugCount,LabCount,DaysInHospital
0,35.0,4.0,2.0,2.0,143.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,1.0,0.0,0.0,1.0,5.0,1.0,0
1,5.0,11.0,3.0,1.0,379.0,0.0,0.0,0.0,9.0,0.0,...,11.0,8.0,3.0,0.0,0.0,0.0,5.0,6.0,0.0,0
2,5.0,13.0,4.0,3.0,563.0,2.0,0.0,2.0,1.0,0.0,...,2.0,6.0,2.0,1.0,4.0,0.0,5.0,10.0,1.0,0
3,15.0,8.0,7.0,7.0,330.0,1.0,3.0,0.0,1.0,1.0,...,0.0,2.0,0.0,2.0,1.0,1.0,8.0,2.0,2.0,2
4,5.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49677,15.0,2.0,1.0,1.0,62.0,0.0,2.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0
49678,55.0,2.0,1.0,1.0,35.0,0.0,0.0,0.0,2.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,1.0,22.0,0.0,0
49679,0.0,18.0,6.0,5.0,561.0,2.0,0.0,0.0,14.0,2.0,...,18.0,9.0,4.0,0.0,3.0,2.0,3.0,35.0,0.0,0
49680,25.0,8.0,3.0,3.0,0.0,1.0,0.0,0.0,3.0,0.0,...,0.0,3.0,0.0,2.0,1.0,0.0,8.0,0.0,10.0,0
