In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from datetime import datetime

In [2]:
# Read the product dataset directly from its hosted CSV file.
DF = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/rusita-ai/pyData/master/product.csv',
)

# Print the column names, non-null counts and dtypes as a quick sanity check.
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15009 entries, 0 to 15008
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           15009 non-null  object 
 1   meanPriceEach  15009 non-null  float64
 2   totalOrder     15009 non-null  float64
 3   orderType_1    15009 non-null  float64
 4   orderType_2    15009 non-null  float64
 5   orderType_3    15009 non-null  float64
 6   totalBundle    15009 non-null  float64
 7   bundleType_1   15009 non-null  float64
 8   bundleType_2   15009 non-null  float64
 9   customerType   15009 non-null  object 
 10  FY             15009 non-null  int64  
 11  Country        15009 non-null  int64  
dtypes: float64(8), int64(2), object(2)
memory usage: 1.4+ MB


In [3]:
def randomForest(model_id, X, y, n_est = 100, mf = 'auto', md = None, ts = 0.3, rs = 2045, visualFeature = False):
  """Fit a RandomForestClassifier on a hold-out split and report its test scores.

  Parameters
  ----------
  model_id : label copied unchanged into the result (e.g. 'RF-1').
  X : feature table; must expose ``.columns`` when ``visualFeature`` is True.
  y : target labels aligned with ``X``.
  n_est : number of trees (``n_estimators``).
  mf : ``max_features`` setting. The legacy value ``'auto'`` is still accepted
       and is translated to ``'sqrt'`` before it reaches scikit-learn.
  md : ``max_depth`` of each tree (``None`` leaves it unlimited).
  ts : ``test_size`` passed to ``train_test_split``.
  rs : ``random_state`` used for both the split and the forest.
  visualFeature : when True, draw a bar chart of the feature importances.

  Returns
  -------
  dict with keys 'model_id', 'n_estimators', 'max_features', 'max_depth',
  'accuracy_score' and 'f1_score'. 'max_features' echoes ``mf`` exactly as the
  caller passed it; 'f1_score' is the per-class array produced by
  ``f1_score(..., average=None)``.
  """
  # split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = ts, random_state = rs)

  # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
  # classifiers it was an alias of 'sqrt', so map it explicitly to keep the
  # default call working on both old and new versions.
  max_features = 'sqrt' if isinstance(mf, str) and mf == 'auto' else mf

  # model
  Model_rf = RandomForestClassifier(n_estimators = n_est, max_features = max_features, max_depth = md, random_state = rs)
  Model_rf.fit(X_train, y_train)

  # predict
  y_hat = Model_rf.predict(X_test)

  if visualFeature:
    plt.figure(figsize = (9, 6))
    # seaborn >= 0.12 no longer accepts positional data vectors, so pass
    # the importances and the feature names by keyword.
    sns.barplot(x = Model_rf.feature_importances_, y = list(X.columns))
    plt.title('Feature Importance')
    plt.show()

  score_ac = accuracy_score(y_test, y_hat)
  # average = None keeps one F1 value per class instead of a single aggregate.
  score_f1 = f1_score(y_test, y_hat, average = None)

  return { 'model_id' : model_id, 'n_estimators' : n_est, 'max_features' : mf, 'max_depth' : md, 'accuracy_score' : score_ac, 'f1_score' : score_f1 }

In [4]:
# Predictor columns: every column used as model input, listed explicitly.
X = DF.loc[:, [
    'meanPriceEach',
    'totalOrder',
    'orderType_1',
    'orderType_2',
    'orderType_3',
    'totalBundle',
    'bundleType_1',
    'bundleType_2',
    'FY',
    'Country',
]]

# Target column the classifiers are trained to predict.
y = DF.loc[:, 'customerType']

In [5]:
# Empty results table; one row per trained model is added to it later.
DF_res = pd.DataFrame(
    columns=[
        'model_id',
        'n_estimators',
        'max_features',
        'max_depth',
        'accuracy_score',
        'f1_score',
    ],
)

In [6]:
# Running counter used to build model ids of the form 'RF-<index>'; the sweep
# loop reads it and increments it once per trained model.
index = 1

In [7]:
# Sweep the number of trees, keeping every other hyper-parameter at its default.
# Result dicts are gathered in a list and merged into DF_res once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration.
sweep_rows = []
for n in [50, 100, 150, 200, 250]:
  model_name = 'RF-' + str(index)
  index += 1  # keep the shared counter moving so later model ids stay unique
  res = randomForest(model_name, X, y, n_est=n)
  print(res)
  sweep_rows.append(res)

sweep_frame = pd.DataFrame(sweep_rows)
# Skip the concat while DF_res is still empty: concatenating an empty frame
# raises a dtype FutureWarning on recent pandas and would force object columns.
DF_res = sweep_frame if DF_res.empty else pd.concat([DF_res, sweep_frame], ignore_index=True)

{'model_id': 'RF-1', 'n_estimators': 50, 'max_features': 'auto', 'max_depth': None, 'accuracy_score': 0.9940039973351099, 'f1_score': array([0.99395838, 0.99404893])}
{'model_id': 'RF-2', 'n_estimators': 100, 'max_features': 'auto', 'max_depth': None, 'accuracy_score': 0.9942260715078837, 'f1_score': array([0.99418084, 0.9942706 ])}
{'model_id': 'RF-3', 'n_estimators': 150, 'max_features': 'auto', 'max_depth': None, 'accuracy_score': 0.9937819231623363, 'f1_score': array([0.99373321, 0.99382988])}
{'model_id': 'RF-4', 'n_estimators': 200, 'max_features': 'auto', 'max_depth': None, 'accuracy_score': 0.9940039973351099, 'f1_score': array([0.99395838, 0.99404893])}
{'model_id': 'RF-5', 'n_estimators': 250, 'max_features': 'auto', 'max_depth': None, 'accuracy_score': 0.9935598489895625, 'f1_score': array([0.99351085, 0.99360811])}


In [8]:
# Display the accumulated per-model results table.
DF_res

Unnamed: 0,model_id,n_estimators,max_features,max_depth,accuracy_score,f1_score
0,RF-1,50,auto,,0.994004,"[0.993958379950772, 0.9940489310116817]"
1,RF-2,100,auto,,0.994226,"[0.9941808415398388, 0.9942706037902159]"
2,RF-3,150,auto,,0.993782,"[0.9937332139659802, 0.993829881004848]"
3,RF-4,200,auto,,0.994004,"[0.993958379950772, 0.9940489310116817]"
4,RF-5,250,auto,,0.99356,"[0.993510852539718, 0.9936081110866211]"
