In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import time

import seaborn as sns
import tensorflow as tf
import pandas as pd
import xgboost as xgb

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [None]:
!pip install openpyxl # We need this to read the excel dataset

A good practice is to define the configuration variables here, right after importing the libraries. This way it will be easier to find where to modify the values for our model.

In [None]:
# Location of the Excel workbook that is loaded with pd.read_excel.
DATA_PATH = '/kaggle/input/date-fruit-datasets/Date_Fruit_Datasets/Date_Fruit_Datasets.xlsx'
# Seed passed to both train_test_split and the XGBoost classifier.
RANDOM_STATE = 42
# learning_rate of the XGBoost classifier.
LR = 0.01
# test_size passed to train_test_split (fraction of rows held out for testing).
TEST_SIZE = 0.33
# NOTE(review): not referenced by any cell visible in this notebook -- confirm whether it
# was meant to be passed as max_depth to the classifier or can be removed.
MAX_DEPTH = 0
# nthread passed to the XGBoost classifier.
NTHREAD = 2
# eval_metric passed to the XGBoost classifier.
EVAL_METRIC = 'mlogloss'
# booster passed to the XGBoost classifier.
BOOSTER = 'gbtree'
# verbosity passed to the XGBoost classifier.
VERBOSITY = 1

# Load the data

Load the dataset found in DATA_PATH using pandas. Then gather a bit of info about the dataset and its classes.

In [None]:
# Read the Excel workbook at DATA_PATH into a DataFrame.
df = pd.read_excel(DATA_PATH)
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Print a summary of the frame's columns (dtypes and non-null counts).
df.info()

In [None]:
# Distinct labels found in the target column 'Class'.
df['Class'].unique()

## Data preprocessing

After loading our dataset in the previous step, we need to label encode our target variable. This means that we are going to encode our unique labels (= classes) with values between 0 and n_classes-1. In this case we have 7 unique labels, which means that we will be getting values from 0 to 6.
<br>
The next step is to split the data into train and test sets. That way we can test our model with unseen data after it has finished training.

In [None]:
# Fit the encoder on the class names and keep the resulting integer codes as the
# target; `le` holds the fitted name <-> code mapping.
le = LabelEncoder()
y = le.fit_transform(df['Class'])  # Encoded labels

In [None]:
# Features are every column except the target. Dropping 'Class' by name (the target
# is also selected by name) keeps X correct even if the column order ever changes,
# whereas a positional slice would silently keep the target or drop a feature.
X = df.drop(columns='Class')

In [None]:
# Hold out a TEST_SIZE fraction of the rows for evaluation; the rows are shuffled
# first and RANDOM_STATE is the seed passed for that shuffle.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Report the shape of the feature matrix of each split
for caption, features in (
    ('Shape of training data :', X_train),
    ('Shape of testing data :', X_test),
):
    print(caption, features.shape)

# Extreme Gradient Boosting Classifier (XGBoost)
XGBoost is a boosted tree-based ensemble classifier, which means it works similarly to the RandomForest algorithm we have already used at some point of our learning path. What makes this algorithm interesting is that it will automatically reduce the feature set. It is known to be a slow algorithm; however, we are going to use a GPU, so this should not be a problem. Let's try it out and measure the time of execution.

## XGBoost algorithm definition

We define the algorithm choosing the best hyperparameters we have found after several tests. If you want to find out more about XGBoost, click [here](https://xgboost.readthedocs.io/en/latest/index.html).

In [None]:
### XGBoost classifier configuration
# Every value comes from the constants defined near the top of the notebook:
#   random_state  - seed for the model's random number generator
#   learning_rate - step size shrinkage applied after each boosting step; it makes
#                   the boosting process more conservative to prevent overfitting
#   booster       - which booster to use: gbtree, gblinear or dart
#   nthread       - number of threads (XGBoost uses all available ones if not set)
#   eval_metric   - evaluation metric(s) for validation data
#   verbosity     - printing of messages; 0 shows nothing
# NOTE(review): no GPU-related parameter is set here although the notebook text
# mentions training on a GPU -- confirm whether one was intended.
xgb_params = {
    'random_state': RANDOM_STATE,
    'learning_rate': LR,
    'booster': BOOSTER,
    'nthread': NTHREAD,
    'eval_metric': EVAL_METRIC,
    'verbosity': VERBOSITY,
}

model = xgb.XGBClassifier(**xgb_params)

# Train

We want to measure the time the algorithm needs to finish training. We do this by recording the time right before training starts and again once it is done; the difference between the two is how long the execution took.

In [None]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the model is training.
start = time.perf_counter() # Clock reading before training

# Fit the model with the training data
model.fit(X_train, y_train)

end = time.perf_counter() # Clock reading after training

# Report how much time the model needed to train
print(f'Training took {round(end-start,2)} seconds to be completed!')

# Getting the metrics

First we are going to check the performance of our model by getting the accuracy of the model. Afterwards we are computing the recall and f1-score metrics on the test set. Finally we will plot a confusion matrix to see what labels are failing the most.

In [None]:
# --- Training split: predicted labels and accuracy ---
predict_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, predict_train)
print('\nTarget on train data', predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)

# --- Test split: predicted labels and accuracy ---
predict_test = model.predict(X_test)
accuracy_test = accuracy_score(y_test, predict_test)
print('\nTarget on test data', predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)

In [None]:
# Macro-averaged recall and F1 on the test split, rounded to two decimals.
# The results get their own names: assigning them to `recall_score` / `f1_score`
# would overwrite the imported sklearn functions, so running this cell a second
# time would raise a TypeError (the names would hold numbers, not functions).
recall_test = round(recall_score(y_test, predict_test, average='macro'), 2)
f1_test = round(f1_score(y_test, predict_test, average='macro'), 2)
print(f'The accuracy in the test set was {round(accuracy_test, 2)}, the recall was {recall_test} and the f1 score was {f1_test}')

In [None]:
# Keep the matrix under its own name: assigning it to `confusion_matrix` would
# overwrite the imported sklearn function and break this cell when it is re-run.
# Passing every encoded label explicitly keeps the matrix aligned with the
# encoder's class order even if a class is missing from the test split.
class_ids = list(range(len(le.classes_)))
cm = confusion_matrix(y_test, predict_test, labels=class_ids)

# Rows are the actual classes and columns the predicted ones; the ticks show the
# original class names instead of their integer codes.
cm_plot = sns.heatmap(cm,
                      annot=True,
                      cmap='Blues',
                      fmt='d',
                      xticklabels=le.classes_,
                      yticklabels=le.classes_)
cm_plot.set_xlabel('Predicted Values')
cm_plot.set_ylabel('Actual Values')
# The trailing semicolon on the last statement hides the Text(...) repr in the output.
cm_plot.set_title('Confusion Matrix', size=16);