In [1]:
from sklearn import metrics, datasets
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [4]:
iris = datasets.load_iris()

train_x, test_x ,train_y, test_y = train_test_split(iris.data,iris.target, test_size = 0.25, random_state = 2019)

#20 decision trees and each tree has maximum depth limit of 4
clf = RandomForestClassifier(n_estimators = 20 , max_depth = 4)

clf.fit(train_x, train_y)

pred = clf.predict(test_x)

#Measure the score
print(f'Accuracy : {metrics.accuracy_score(pred,test_y)} ')

Accuracy : 1.0 


In [5]:
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [6]:
print(f' Feature importance : {clf.feature_importances_} ')

 Feature importance : [0.086628   0.01644579 0.43033162 0.4665946 ] 


In [8]:
#Less tree will have large chance to reduce the accuracy, 
#and less max_depth will simplify the decision tree in random forest
#which might cause to under-fitting problem

clf.set_params(n_estimators = 10, max_depth = 3)

clf.fit(train_x, train_y)

pred = clf.predict(test_x)

#Measure the score
print(f'Accuracy : {metrics.accuracy_score(pred,test_y)} ')
print(f' Feature importance : {clf.feature_importances_} ')

Accuracy : 1.0 
 Feature importance : [0.10854615 0.009016   0.42635225 0.4560856 ] 


In [44]:
import pandas as pd
wine = datasets.load_wine()

#It is much easier to operate the data in dataframe format
wine_df = pd.DataFrame(wine.data, columns = wine.feature_names)
wine_df.head(5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [45]:
label_df = pd.DataFrame(wine.target)
print(label_df.head(5))
label_df.nunique()

   0
0  0
1  0
2  0
3  0
4  0


0    3
dtype: int64

**From the information of label_df, it is very clear that the wine is a classification problem**

In [46]:
#Split the train and test data for model validation
train_x, test_x, train_y, test_y = train_test_split(wine_df,label_df,test_size = 0.25 , random_state = 2019)

#build the random forest classifier for our model
#Random set some parameters to try the result first
rfc = RandomForestClassifier(n_estimators = 5,
                            max_depth = 3,
                            min_samples_split = 3,
                            min_samples_leaf = 2)

rfc.fit(train_x,train_y)
pred = rfc.predict(test_x)
print(f' Accuracy : {metrics.accuracy_score(pred,test_y)} ')

 Accuracy : 0.9555555555555556 


  # This is added back by InteractiveShellApp.init_path()


In [47]:
#Let check the feature importance
print(wine.feature_names)
rfc.feature_importances_

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


array([0.14367943, 0.        , 0.        , 0.07415576, 0.        ,
       0.11516144, 0.05501641, 0.01825323, 0.01612371, 0.09681286,
       0.20143174, 0.11804935, 0.16131607])

From the feature importance, we can see there are many features have 0 importance. I would like to drop these features and get the prediction again.

In [48]:
feature_importance_df = pd.DataFrame(rfc.feature_importances_, index = wine.feature_names , columns = ['Importances'])
feature_importance_df

Unnamed: 0,Importances
alcohol,0.143679
malic_acid,0.0
ash,0.0
alcalinity_of_ash,0.074156
magnesium,0.0
total_phenols,0.115161
flavanoids,0.055016
nonflavanoid_phenols,0.018253
proanthocyanins,0.016124
color_intensity,0.096813


In [49]:
low_importance_features = feature_importance_df[feature_importance_df.Importances == 0].index.values
wine_df_temp = wine_df.drop(low_importance_features , axis = 1)
wine_df_temp.head(5)

Unnamed: 0,alcohol,alcalinity_of_ash,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,15.6,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,11.2,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,18.6,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,16.8,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,21.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [50]:
train_x, test_x ,train_y, test_y = train_test_split(wine_df_temp,label_df,test_size = 0.25, random_state = 2019)
rfc.fit(train_x,train_y)
pred = rfc.predict(test_x)
print(f' Accuracy : {metrics.accuracy_score(pred,test_y)} ')

 Accuracy : 0.9333333333333333 


  


**The accuracy decrease! I guess the low importance features still have some help for the model. <br />
But what if we use this data and increase the complexity of model?**

In [52]:
rfc.set_params(n_estimators = 10,
               max_depth = 5,
               min_samples_split = 2,
               min_samples_leaf = 1)

rfc.fit(train_x,train_y)
pred = rfc.predict(test_x)
print(f' Accuracy : {metrics.accuracy_score(pred,test_y)} ')

 Accuracy : 0.9555555555555556 


  


**The accuracy is back! I think maybe when we lose some features, it might easily to cause some under-fitting problem, even theres features have little impact for the model But we can add some complexity to the model to avoid this problem as long as we have enough computation power.** 