### Portfolio3-Random Forest-Part1-Feature Importance
#### Qiaoyu Wang

This week we explored ensemble models such as Random Forest, which luckily aligned with the questions I posed in Portfolio2 last week. I have been wondering how to handle feature importance and interactions between features in a natural way, as well as how we can use simple decision trees to handle larger datasets more effectively. I expect ensemble models to give me some insight into both questions.

I therefore divided this portfolio into two parts. In this part, I continue using the same dataset (https://www.kaggle.com/datasets/nelgiriyewithana/apple-quality/data) to delve deeper into the first question and gain a better understanding of feature importance.

#### 0 Import

In [1]:
#Import libraries 
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# Raise both pandas display limits to 1000 so wide/long frames are shown
# without being truncated.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 1000)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

#### 1 Load in the dataset

In [2]:
# Read the apple-quality CSV and discard every row that contains a missing value.
df = pd.read_csv("data/apple_quality.csv").dropna()
# Bare expression: makes the notebook render the frame as this cell's output.
df

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good
...,...,...,...,...,...,...,...,...,...
3995,3995.0,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784369,bad
3996,3996.0,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235285,good
3997,3997.0,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611391,bad
3998,3998.0,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229719806,good


In [3]:
# Convert the "Quality" strings into integer class codes and sanity-check them.
quality_encoder = LabelEncoder()
encoded_y = quality_encoder.fit_transform(df["Quality"])

# The encoded labels themselves.
print(encoded_y)
# Both lengths should agree: one code per row of the frame.
print(len(encoded_y), len(df["Quality"]))
# The distinct codes the encoder produced.
print(np.unique(encoded_y))

[1 1 0 ... 0 1 1]
4000 4000
[0 1]


#### 2 Train Random Forest Model

In [4]:
def get_dataset(features, data=None):
    """Build the feature matrix and binary target from the apple-quality data.

    Parameters
    ----------
    features : list of str
        Names of the columns to use as model inputs.
    data : pandas.DataFrame, optional
        Frame containing the feature columns and a "Quality" column.
        When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    X : numpy.ndarray
        Values of the requested feature columns, one row per sample.
    y : numpy.ndarray
        Float labels: 0.0 where Quality is "good", 1.0 where it is "bad".

    Raises
    ------
    ValueError
        If any row has a Quality value other than "good" or "bad".
    """
    frame = df if data is None else data

    # Feature matrix: just the requested columns.
    X = frame[features].to_numpy()

    # Encode the target in one step (good -> 0, bad -> 1). Float values keep
    # the label dtype the same as the mask-based assignment used previously.
    labels = frame["Quality"].map({"good": 0.0, "bad": 1.0})
    if labels.isna().any():
        # Fail early with a readable message instead of handing NaN labels
        # to the classifier.
        unknown = sorted(frame.loc[labels.isna(), "Quality"].astype(str).unique())
        raise ValueError(f"Unexpected Quality values: {unknown}")
    y = labels.to_numpy()

    return X, y

def train(dataset, rf=True):
    """Fit a classifier on ``dataset``, print its accuracy, and return it.

    Parameters
    ----------
    dataset : tuple
        ``(X, y)`` pair of feature matrix and target labels.
    rf : bool, default True
        If True, fit a 500-tree random forest on all of ``X``/``y`` and print
        its out-of-bag accuracy. If False, hold out 20% of the rows, fit a
        500-estimator AdaBoost model on the rest and print its accuracy on
        the held-out rows.

    Returns
    -------
    model
        The fitted estimator, so callers can inspect it (for example its
        ``feature_importances_``) without training a second time. In a
        notebook, assign the result or end the line with ``;`` if the
        estimator repr should not be echoed as the cell output.
    """
    X, y = dataset

    if rf:
        # oob_score=True scores each sample with the trees that did not see
        # it during bootstrapping, so no separate test split is needed here.
        model = RandomForestClassifier(
            oob_score=True, random_state=42, n_estimators=500, n_jobs=-1
        )
        model.fit(X, y)
        print("OOB accuracy", model.oob_score_)
        return model

    # AdaBoost is fitted without an out-of-bag estimate here, so it is scored
    # on an explicit hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = AdaBoostClassifier(n_estimators=500, random_state=42)
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print("accuracy", accuracy)
    return model

In [5]:
# Compare a two-feature baseline against the full seven-feature set.
size_and_weight = ["Size", "Weight"]
all_seven = ["Size", "Weight" ,"Sweetness" ,"Crunchiness" ,"Ripeness", "Juiciness", "Acidity"]

# `features` is left bound to the full list after the loop, which the later
# cells rely on when they enumerate feature combinations.
for features in (size_and_weight, all_seven):
    train(get_dataset(features))

OOB accuracy 0.58625
OOB accuracy 0.88925


#### 3 Explore Feature Importance

##### 3.1 For-loop of all possible feature sets

In [6]:
from itertools import combinations
import random

In [7]:
# Every unordered pair that can be drawn from the current `features` list.
all_feature_combinations = [*combinations(features, 2)]
# Visit the pairs in random order; the shuffle is unseeded, so the order
# differs from run to run.
random.shuffle(all_feature_combinations)

# Announce each pair (numbered from 1), then let train() print its OOB accuracy.
for position, feature_pair in enumerate(all_feature_combinations, start=1):
    print(f"Combination {position}: {feature_pair}")
    train(get_dataset([*feature_pair]))

Combination 1: ('Crunchiness', 'Juiciness')
OOB accuracy 0.62425
Combination 2: ('Weight', 'Acidity')
OOB accuracy 0.5905
Combination 3: ('Crunchiness', 'Ripeness')
OOB accuracy 0.596
Combination 4: ('Size', 'Juiciness')
OOB accuracy 0.6095
Combination 5: ('Sweetness', 'Ripeness')
OOB accuracy 0.63375
Combination 6: ('Weight', 'Juiciness')
OOB accuracy 0.5885
Combination 7: ('Size', 'Weight')
OOB accuracy 0.58625
Combination 8: ('Sweetness', 'Crunchiness')
OOB accuracy 0.58625
Combination 9: ('Ripeness', 'Acidity')
OOB accuracy 0.5695
Combination 10: ('Sweetness', 'Juiciness')
OOB accuracy 0.61175
Combination 11: ('Size', 'Ripeness')
OOB accuracy 0.63725
Combination 12: ('Sweetness', 'Acidity')
OOB accuracy 0.55225
Combination 13: ('Weight', 'Ripeness')
OOB accuracy 0.597
Combination 14: ('Crunchiness', 'Acidity')
OOB accuracy 0.5535
Combination 15: ('Size', 'Crunchiness')
OOB accuracy 0.5915
Combination 16: ('Size', 'Acidity')
OOB accuracy 0.597
Combination 17: ('Ripeness', 'Juiciness

In [8]:
# Every unordered triple that can be drawn from the current `features` list.
all_feature_combinations = [*combinations(features, 3)]
# Visit the triples in random order; the shuffle is unseeded, so the order
# differs from run to run.
random.shuffle(all_feature_combinations)

# Announce each triple (numbered from 1), then let train() print its OOB accuracy.
for position, feature_triple in enumerate(all_feature_combinations, start=1):
    print(f"Combination {position}: {feature_triple}")
    train(get_dataset([*feature_triple]))

Combination 1: ('Weight', 'Crunchiness', 'Acidity')
OOB accuracy 0.638
Combination 2: ('Size', 'Ripeness', 'Acidity')
OOB accuracy 0.71675
Combination 3: ('Size', 'Weight', 'Acidity')
OOB accuracy 0.66775
Combination 4: ('Sweetness', 'Crunchiness', 'Juiciness')
OOB accuracy 0.67825
Combination 5: ('Sweetness', 'Ripeness', 'Juiciness')
OOB accuracy 0.73175
Combination 6: ('Size', 'Weight', 'Sweetness')
OOB accuracy 0.71375
Combination 7: ('Weight', 'Sweetness', 'Juiciness')
OOB accuracy 0.673
Combination 8: ('Weight', 'Sweetness', 'Crunchiness')
OOB accuracy 0.65625
Combination 9: ('Size', 'Crunchiness', 'Juiciness')
OOB accuracy 0.69975
Combination 10: ('Sweetness', 'Ripeness', 'Acidity')
OOB accuracy 0.67075
Combination 11: ('Crunchiness', 'Juiciness', 'Acidity')
OOB accuracy 0.661
Combination 12: ('Size', 'Sweetness', 'Ripeness')
OOB accuracy 0.74875
Combination 13: ('Ripeness', 'Juiciness', 'Acidity')
OOB accuracy 0.702
Combination 14: ('Sweetness', 'Crunchiness', 'Ripeness')
OOB ac

##### 3.2 Gini Importance

In [9]:
features = ["Size", "Weight", "Sweetness", "Crunchiness", "Ripeness", "Juiciness", "Acidity"]
dataset = get_dataset(features)
X, y = dataset

# Fit the forest once and report its out-of-bag accuracy from that same model.
# Calling train(dataset) first would build an identical forest (same
# hyperparameters and seed) only to print this score and throw the model away.
model = RandomForestClassifier(oob_score=True, random_state=42, n_estimators=500, n_jobs=-1)
model.fit(X, y)
print("OOB accuracy", model.oob_score_)

# Impurity-based importances of the fitted forest, one value per feature column.
importances = model.feature_importances_
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

for column in indices:
    print(f"{features[column]} :{importances[column]}")

OOB accuracy 0.88925
Ripeness :0.16958378610872357
Size :0.1647901723267969
Juiciness :0.15912929288349842
Sweetness :0.1512183408774938
Acidity :0.13070760258812117
Weight :0.11513197367550825
Crunchiness :0.10943883153985788


##### 3.3 Permutation Importance

In [10]:
from sklearn.inspection import permutation_importance

In [11]:
features = ["Size", "Weight", "Sweetness", "Crunchiness", "Ripeness", "Juiciness", "Acidity"]
dataset = get_dataset(features)
X, y = dataset
# Hold out 20% of the rows so importance is measured on data the forest never saw.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(oob_score=True, random_state=42, n_estimators=500, n_jobs=-1)
model.fit(X_train, y_train)

# Seed the column shuffling as well: the split and the forest are already
# seeded with 42, and without this the reported ranking changes on every run.
perm_importance = permutation_importance(model, X_test, y_test, random_state=42)

# Mean importance per feature across the shuffling repeats, printed from
# largest to smallest.
mean_importance = perm_importance.importances_mean
for f in mean_importance.argsort()[::-1]:
    print(f"{features[f]}: {mean_importance[f]}")

Size: 0.14375000000000002
Ripeness: 0.1385
Juiciness: 0.12650000000000003
Sweetness: 0.09325000000000001
Acidity: 0.06350000000000004
Crunchiness: 0.05425000000000004
Weight: 0.04675000000000003
