### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score
import xgboost as xgb
from tabulate import tabulate
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Silence every Python warning for the rest of the session so cell outputs stay
# uncluttered. NOTE(review): the filter is global, so warnings raised by the
# imported libraries are hidden as well — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Folders holding the pickled and CSV artefacts, relative to the notebook's
# working directory. File names are appended by plain string concatenation,
# so the trailing slash must stay.
root_pickle = '../pickle files/'
root_csv = '../csv files/'

### Creating a function to reduce the memory of dataframe

In [2]:
def _smallest_fitting_dtype(c_min, c_max, candidates, info):
    """Return the first dtype in ``candidates`` whose range contains [c_min, c_max].

    ``info`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when nothing fits,
    which also covers a NaN min/max (empty or all-NaN column), because every
    comparison with NaN is False.
    """
    for candidate in candidates:
        limits = info(candidate)
        # Inclusive bounds: a value sitting exactly on a limit is representable.
        if c_min >= limits.min and c_max <= limits.max:
            return candidate
    return None


def reduce_memory(data, allow_float16=True):
    """Downcast numeric DataFrame columns to the narrowest dtype covering their range.

    Parameters
    ----------
    data : pandas.DataFrame, numpy.ndarray or anything else
        A DataFrame is downcast column by column. A NumPy array only has its
        size reported. Any other object triggers an "unsupported" message.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns may be stored as
        float16. float16 keeps roughly three significant decimal digits and
        cannot represent integers above 2048 exactly, so pass False to stop
        at float32 when precision matters.

    Returns
    -------
    The same object that was passed in. A DataFrame is modified in place
    (columns are reassigned on ``data``) and also returned for convenience.

    Notes
    -----
    Only int16/int32/int64 and float32/float64 columns are considered; other
    dtypes are left as they are. Columns are never widened.
    """
    if isinstance(data, pd.DataFrame):
        start_mem_usg = data.memory_usage().sum() / 1024**2
        print("Memory usage of dataframe is:", start_mem_usg, "MB")

        int_candidates = (np.int8, np.int16, np.int32)
        float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

        for col in data.columns:
            col_type = data[col].dtype

            if col_type in ["int64", "int32", "int16"]:
                candidates, info = int_candidates, np.iinfo
            elif col_type in ["float64", "float32"]:
                candidates, info = float_candidates, np.finfo
            else:
                continue

            target = _smallest_fitting_dtype(data[col].min(), data[col].max(), candidates, info)
            # Skip when nothing fits; the itemsize check guarantees we only ever narrow.
            if target is not None and np.dtype(target).itemsize < col_type.itemsize:
                data[col] = data[col].astype(target)

        end_mem_usg = data.memory_usage().sum() / 1024**2
        print("Memory usage after optimization:", end_mem_usg, "MB")
        # Guard the ratio so a zero-sized frame does not produce a NaN / division error.
        saved_pct = 100 * (start_mem_usg - end_mem_usg) / start_mem_usg if start_mem_usg > 0 else 0.0
        print("Reduced by {:.1f}%".format(saved_pct))
        return data

    elif isinstance(data, np.ndarray):
        # Arrays are only measured, never converted.
        mem_usg = data.nbytes / 1024**2
        print("Memory usage of NumPy array is:", mem_usg, "MB")
        return data

    else:
        print("Unsupported data type. Only Pandas DataFrame and NumPy array are supported.")
        return data


### Creating a dataframe using Finaldata pickle file created during Data Preparation step

In [3]:
# Load the prepared feature table and downcast it in one step.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# were produced by this project.
finaldata_path = root_pickle + 'Finaldata.pkl'
df = reduce_memory(pd.read_pickle(finaldata_path))
df.head()

Memory usage of dataframe is: 1244.6382465362549 MB
Memory usage after optimization: 1244.6382465362549 MB
Reduced by 0.0%


Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504


### Dropping unnecessary columns

In [4]:
# Remove the user/product key columns from the frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the columns
# are already gone.
id_columns = ['user_id', 'product_id']
df = df.drop(columns=id_columns, errors='ignore')
df.head(5)

Unnamed: 0,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,1.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
1,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,1.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
2,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
3,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,1.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
4,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,0.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504


### We have already performed one hot encoding so there are no nominal variables in input features.

### Checking number of rows and columns for final dataframe

In [5]:
# Report (rows, columns) of the frame after the id columns were removed.
print(f"df : {df.shape}")

df : (8474661, 67)


### Checking missing values in dataframe

In [6]:
# Tabulate dtype and missing-value count for every column.
# Counts are taken one column at a time so no frame-sized boolean mask is built.
columns = df.columns
Table_data = [["Column Name","Data Type","Missing Values"]]
total_missing = 0

for cols in columns:
    data_type = df[cols].dtype
    missing_value = df[cols].isnull().sum()
    total_missing += missing_value
    data = [cols,data_type,missing_value]
    Table_data.append(data)

# tabulate is already imported in the first cell; no re-import needed here.
print(tabulate(Table_data, headers="firstrow"))

# Derive the conclusion from the counts instead of stating it unconditionally.
if total_missing == 0:
    print("\nConclusion: As per above data, there is no missing values in any column")
else:
    print("\nConclusion: As per above data, there are", total_missing, "missing values in total")

Column Name                        Data Type      Missing Values
---------------------------------  -----------  ----------------
total_product_orders_by_user       float16                     0
total_product_reorders_by_user     float16                     0
user_product_reorder_percentage    float16                     0
avg_add_to_cart_by_user            float16                     0
avg_days_since_last_bought         float16                     0
last_ordered_in                    float16                     0
is_reorder_3                       float16                     0
is_reorder_2                       float16                     0
is_reorder_1                       float16                     0
order_number                       float16                     0
order_dow                          float16                     0
order_hour_of_day                  float16                     0
days_since_prior_order             float16                     0
reordered                

### Checking for columns that contain only "0" values

In [7]:
# Find columns whose value is 0 in every row.
columns_with_only_zeros = df.columns[(df == 0).all()]
print("Columns having 0 value:\n", columns_with_only_zeros)

# A constant-zero column carries no information, so drop it.
# Drop exactly the columns detected above rather than a hard-coded name: the
# cell then stays correct if the data changes and can be re-run without a
# KeyError (the second run simply finds nothing to drop).
df = df.drop(columns=columns_with_only_zeros)

# Re-check: the result is expected to be empty now.
columns_with_only_zeros = df.columns[(df == 0).all()]
print("\nChecking if any column have 0 value:\n" , columns_with_only_zeros)

Columns having 0 value:
 Index(['is_organic'], dtype='object')

Checking if any column have 0 value:
 Index([], dtype='object')


### Checking value counts for output column
#### No need to perform one hot encoding on output variable as it is already a binary variable

In [8]:
# Class balance of the target column.
reorder_counts = df['reordered'].value_counts()
print("Checking reordered count\n", reorder_counts)

Checking reordered count
 reordered
0.0    7645837
1.0     828824
Name: count, dtype: int64


### Creating separate dataframe for input and output variables

In [9]:
# Name of the target column; every other column is treated as an input feature.
label = 'reordered'
x_cols = df.columns.drop(label)

X, y = df[x_cols], df[label]

### Creating separate dataframe for train and test using hold out evaluation

In [10]:
# Stratified 75/25 hold-out split (class proportions of y are preserved in both parts).
# random_state is fixed so that Restart & Run All reproduces the same split and
# therefore the same downstream metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, test_size = 0.25, random_state = 42
)

print("Shape for training set: ",X_train.shape)
print("\nShape for testing set: ",X_test.shape)

Shape for training set:  (6355995, 65)

Shape for testing set:  (2118666, 65)


### Getting the reorder value counts for training dataset

In [12]:
# Class counts of the target within the training split.
y_train.value_counts()

reordered
0.0    5734377
1.0     621618
Name: count, dtype: int64

### Getting the reorder value counts for testing dataset

In [13]:
# Class counts of the target within the test split.
y_test.value_counts()

reordered
0.0    1911460
1.0     207206
Name: count, dtype: int64

### Creating XGBoost DMatrix for efficient memory usage and faster speed

In [14]:
# Wrap each split (features plus labels) in an XGBoost DMatrix, train first, then test.
D_train, D_test = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

### Creating a watchlist to monitor the performance of the model during training

In [15]:
# Only the training DMatrix is listed, so the per-round log printed by xgb.train
# reports the training loss only; no held-out set is monitored.
watchlist= [(D_train, "train")]

### Using below parameters for XGBoost model
- objective: reg:logistic: Logistic regression that outputs a probability between 0 and 1. It is used here for the binary `reordered` target (XGBoost also offers `binary:logistic`, the objective conventionally used for binary classification).
- eval_metric: logloss: This is the evaluation metric used during training. Logarithmic loss is a common metric for binary classification problems.
- eta: Learning rate, controlling the step size during the optimization.
- max_depth: Maximum depth of a tree.
- min_child_weight: Minimum sum of instance weight (hessian) needed in a child.
- gamma: Minimum loss reduction required to make a further partition on a leaf node.
- subsample: Subsample ratio of the training instances.
- colsample_bytree: Subsample ratio of columns when constructing each tree.
- alpha: L1 regularization term on weights.
- scale_pos_weight: Controls the balance of positive and negative weights. It is useful for highly imbalanced classes.
- lambda: L2 regularization term on weights.


- params=xgb_params: Specifies the hyperparameters for the model.
- dtrain=D_train: Specifies the training data.
- num_boost_round=80: Number of boosting rounds (trees) to be run.
- evals=watchlist: Specifies the watchlist for evaluation during training.
- verbose_eval=10: it prints the evaluation metric every 10 rounds.

### Creating Models

#### 01. First XGBoost Model with different hyper parameters

In [16]:
# Hyperparameters for model 1 (fixed by hand; no search is performed in this cell)
xgb_params = {
    "objective"        :"reg:logistic",
    "eval_metric"      :"logloss",
    "eta"              :0.1,
    "max_depth"        :6,
    "min_child_weight" :10,
    "gamma"            :0.70,
    "subsample"        :0.76,
    "colsample_bytree" :0.95,
    "alpha"            :2e-05,
    "scale_pos_weight" :10,
    "lambda"           :10
}

# Model creation
model = xgb.train(params=xgb_params, dtrain=D_train, num_boost_round = 80, evals = watchlist, verbose_eval = 10)

# Making predictions on test dataset
probability = model.predict(D_test)
# Vectorised 0.5 threshold: same labels as a per-element Python loop, without
# iterating over millions of predictions in the interpreter.
predictions = (probability > 0.5).astype(int)

# Summary table shared by all models; (re)started here so re-running this cell
# resets it instead of accumulating rows.
Table_data = [["Algorithm type","Accuracy","ROC-AUC-Score"]]

# Calculating different evaluation metrics
# The target is binary, so no "macro"/"micro" averaging is needed.
# Accuracy uses the thresholded labels; ROC-AUC uses the raw probabilities.
accuracy_XGBoost_1 = accuracy_score(y_test, predictions)
rocauc_XGBoost_1 = roc_auc_score(y_test, probability)

data_XGBoost_1 = ['XGBoost 1', str(accuracy_XGBoost_1), str(rocauc_XGBoost_1)]
Table_data.append(data_XGBoost_1)

print("\nXGBoost Model 1 Evaluation Metrics:\n")
print("Accuracy Score : ",accuracy_XGBoost_1)
print("ROC-AUC Score : ",rocauc_XGBoost_1)

[0]	train-logloss:0.69785
[10]	train-logloss:0.57498
[20]	train-logloss:0.54776
[30]	train-logloss:0.53828
[40]	train-logloss:0.53421
[50]	train-logloss:0.53163
[60]	train-logloss:0.52993
[70]	train-logloss:0.52867
[79]	train-logloss:0.52764

XGBoost Model 1 Evaluation Metrics:

Accuracy Score :  0.747832362439384
ROC-AUC Score :  0.8300111792880859


#### 02. Second XGBoost Model with different hyper parameters

In [17]:
# Hyperparameters for model 2 (fixed by hand; no search is performed in this cell)
xgb_params = {
    "objective"        :"reg:logistic",
    "eval_metric"      :"logloss",
    "eta"              :0.2,
    "max_depth"        :5,
    "min_child_weight" :10,
    "gamma"            :0.75,
    "subsample"        :0.8,
    "colsample_bytree" :0.90,
    "alpha"            :2e-05,
    "scale_pos_weight" :5,
    "lambda"           :10
}

# Model creation
model = xgb.train(params=xgb_params, dtrain=D_train, num_boost_round = 80, evals = watchlist, verbose_eval = 10)

# Making predictions on test dataset
probability = model.predict(D_test)
# Vectorised 0.5 threshold: same labels as a per-element Python loop, without
# iterating over millions of predictions in the interpreter.
predictions = (probability > 0.5).astype(int)

# Calculating different evaluation metrics
# The target is binary, so no "macro"/"micro" averaging is needed.
# Accuracy uses the thresholded labels; ROC-AUC uses the raw probabilities.
accuracy_XGBoost_2 = accuracy_score(y_test, predictions)
rocauc_XGBoost_2 = roc_auc_score(y_test, probability)

# For printing evaluation metric of all model
data_XGBoost_2 = ['XGBoost 2', str(accuracy_XGBoost_2), str(rocauc_XGBoost_2)]
# Remove any row left by a previous run of this cell so re-running does not
# duplicate the 'XGBoost 2' entry in the summary table.
Table_data = [row for row in Table_data if row[0] != 'XGBoost 2']
Table_data.append(data_XGBoost_2)

print("\nXGBoost Model 2 Evaluation Metrics:\n")
print("Accuracy Score : ",accuracy_XGBoost_2)
print("ROC-AUC Score : ",rocauc_XGBoost_2)

[0]	train-logloss:0.46523
[10]	train-logloss:0.38470
[20]	train-logloss:0.37683
[30]	train-logloss:0.37385
[40]	train-logloss:0.37216
[50]	train-logloss:0.37093
[60]	train-logloss:0.37013
[70]	train-logloss:0.36952
[79]	train-logloss:0.36889

XGBoost Model 2 Evaluation Metrics:

Accuracy Score :  0.8512719796324668
ROC-AUC Score :  0.831215722595705


### Overall evaluation metric for our models

In [18]:
# Side-by-side summary of the metrics collected for every trained model.
summary_table = tabulate(Table_data, headers="firstrow")
print("Classification model evaluation metric:\n", summary_table, sep="\n")

Classification model evaluation metric:

Algorithm type      Accuracy    ROC-AUC-Score
----------------  ----------  ---------------
XGBoost 1           0.747832         0.830011
XGBoost 2           0.851272         0.831216
