In [11]:
#initial imports
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [3]:
# Load the pre-encoded loans dataset and preview the first rows
loans_csv = Path("../Resources/loans_data_encoded.csv")
dfLoans = pd.read_csv(loans_csv)
dfLoans.head()

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


### Preprocess the Data
Now, we're going to walk through the preprocessing steps for the loan applications' encoded data so that we can train the random forest model on the training set and evaluate it on the testing set.

1. First, we define the features set.
2. Next, we define the target set. Here, we're using the `ravel()` method, which, for a single column, returns the same one-dimensional array as the `values` attribute.
3. Now, we split into the training and testing sets.
4. Lastly, we can create the `StandardScaler` instance, fit the scaler with the training set, and scale the data.

In [9]:
# Features: every column except the target. `drop` already returns a new
# DataFrame, so `dfLoans` is left untouched without an explicit copy.
X = dfLoans.drop(columns="bad")

# Target: the "bad" column as a 1-D NumPy array.
# `Series.ravel()` is deprecated in recent pandas releases; `to_numpy()` is the
# supported way to obtain the underlying array.
y = dfLoans["bad"].to_numpy()
y[:5]

array([0, 0, 0, 0, 0])

In [13]:
# Split features and target into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)


In [15]:
# Fit the scaler on the training split only
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# Apply the fitted scaler to the training split, then to the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [22]:
# Build a 500-tree random forest with a fixed seed, then train it on the scaled training data
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [23]:
# Predict the class of every row in the scaled testing set
predictions = rf_model.predict(X=X_test_scaled)


In [24]:
# Row/column captions for the confusion-matrix table
actual_labels = ["actual 0", "actual 1"]
predicted_labels = ["predicted 0", "predicted 1"]

# Raw confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the matrix in a labelled DataFrame for display
cmDF = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

In [25]:
# Render the labelled confusion matrix as a rich table
display(cmDF)

Unnamed: 0,predicted 0,predicted 1
actual 0,50,34
actual 1,26,15


In [27]:
# Fraction of test rows whose predicted class matches the true class
accScore = accuracy_score(y_true=y_test, y_pred=predictions)
accScore

0.52

In [28]:
# classification_report returns preformatted text, so print it rather than displaying the repr
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.66      0.60      0.62        84
           1       0.31      0.37      0.33        41

    accuracy                           0.52       125
   macro avg       0.48      0.48      0.48       125
weighted avg       0.54      0.52      0.53       125



In [29]:
# Read the fitted forest's per-feature importance weights and show them
# (presumably one weight per column of X, in column order — the zip with X.columns below relies on this)
imp = rf_model.feature_importances_
imp

array([0.05479564, 0.083734  , 0.42871538, 0.32290918, 0.01986699,
       0.02382607, 0.0025937 , 0.0238055 , 0.01947929, 0.02027425])

We can `zip()` the names of the columns (`X.columns`) with the weights in `imp` and then sort the resulting pairs with the `sorted()` function.

In [30]:
# Pair each importance weight with its column name, then rank from most to least important
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.42871537959784756, 'age'),
 (0.322909180511852, 'month_num'),
 (0.08373400008559587, 'term'),
 (0.054795639837519644, 'amount'),
 (0.023826066130178958, 'education_High School or Below'),
 (0.023805501611235368, 'education_college'),
 (0.020274246776222464, 'gender_male'),
 (0.01986699331709372, 'education_Bachelor'),
 (0.019479294982300867, 'gender_female'),
 (0.0025936971501533874, 'education_Master or Above')]

One possible way to improve random forest models is to drop the least important features, as those tend to increase variance. However, since this model had such low sensitivity (recall), precision, and F1 score, dropping them in this case doesn't improve the model.