### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score
import xgboost as xgb
from tabulate import tabulate
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings; consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Relative directory prefixes for pickle and CSV files
# (only root_pickle is used in the cells visible here).
root_pickle = '../pickle files/'
root_csv = '../csv files/'

### Creating a function for reducing memory

In [2]:
def reduce_mem_usage(train_data, allow_float16=True):
    """Downcast numeric columns of a DataFrame to the smallest dtype that holds their values.

    The frame is modified in place and also returned, and a one-line summary of
    the memory usage before and after is printed.

    Signed integers, unsigned integers and floats are each narrowed within their
    own family, so an unsigned or boolean column is never turned into a float.
    Categorical columns are converted to ordered categoricals. Every other dtype
    (object, bool, datetime, pandas extension dtypes, ...) is left untouched.
    A column is never widened: a candidate dtype is only used when it is
    strictly smaller than the current one.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame whose columns are downcast in place.
    allow_float16 : bool, default True
        When True, float columns may be narrowed to ``float16``, which keeps
        only about three significant decimal digits. Pass False to stop at
        ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = train_data.memory_usage().sum() / 1024**2

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64) if allow_float16 else (np.float32, np.float64)

    for col in train_data.columns:
        col_type = train_data[col].dtype

        if isinstance(col_type, pd.CategoricalDtype):
            train_data[col] = train_data[col].cat.as_ordered()
            continue

        # Only plain NumPy integer/float columns have a min/max that can be
        # compared against iinfo/finfo limits; skip everything else.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        c_min = train_data[col].min()
        c_max = train_data[col].max()

        for candidate in candidates:
            # Candidates are ordered by size; once they stop being smaller
            # than the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the limit still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # which leaves the column as it is.
            if bounds.min <= c_min and c_max <= bounds.max:
                train_data[col] = train_data[col].astype(candidate)
                break

    end_mem = train_data.memory_usage().sum() / 1024**2
    # Guard the percentage against a frame that reports zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Memory usage of dataframe is {start_mem:.2f} MB --> {end_mem:.2f} MB (Decreased by {decrease:.1f}%)')

    return train_data

### Reading our final data file

In [3]:
# Load the prepared feature table and downcast it straight away.
# NOTE(review): unpickling executes code embedded in the file, so only load
# pickles produced by a trusted pipeline.
df = reduce_mem_usage(pd.read_pickle(root_pickle + 'Finaldata.pkl'))
df.head()

Memory usage of dataframe is 1244.64 MB --> 1244.64 MB (Decreased by 0.0%)


Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504


### Dropping unnecessary columns

In [4]:
# Remove the two identifier columns from the frame.
# Assigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['user_id', 'product_id'], errors='ignore')
df.head(5)

Unnamed: 0,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,1.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
1,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,1.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
2,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
3,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,1.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504
4,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,0.0,11.0,...,41.0,0.694824,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504


### Checking the shape of final dataframe

In [5]:
# Report the (rows, columns) shape of the current frame.
print("df :", df.shape) 

df : (8474661, 67)


### Removing columns that contain only 0 values

In [6]:
# Find the columns in which every row equals 0; they carry no information.
columns_with_only_zeros = df.columns[(df == 0).all()]
print("Columns having 0 value:\n", columns_with_only_zeros)

# Drop exactly the columns detected above rather than a hard-coded name, so the
# cell stays correct if the input data changes and can be re-run safely
# (an empty selection drops nothing).
df = df.drop(columns=columns_with_only_zeros)

# Verify that no all-zero column is left.
columns_with_only_zeros = df.columns[(df == 0).all()]
print("\nChecking if any column have 0 value:\n" , columns_with_only_zeros)

Columns having 0 value:
 Index(['is_organic'], dtype='object')

Checking if any column have 0 value:
 Index([], dtype='object')


### Performing Normalization for better PCA

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Rescale every column of df to the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole frame, i.e. on the
# 'reordered' column as well and on rows that are only later assigned to the
# test split. Fitting on the training features alone would avoid that leakage.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(df)

# Wrap the scaled array in a DataFrame that carries the original column names.
df_scaled = pd.DataFrame(min_max_scaler.transform(df), columns=df.columns)

### Separating input and output variables

In [8]:
# Name of the target column; every other column is used as an input feature.
label = 'reordered'
x_cols = df_scaled.columns.drop(label)

X, y = df_scaled[x_cols], df_scaled[label]

In [9]:
# Downcast the feature frame; reduce_mem_usage prints the before/after size.
X = reduce_mem_usage(X)

Memory usage of dataframe is 4202.67 MB --> 1050.67 MB (Decreased by 75.0%)


### Creating train and test datasets

In [10]:
# Stratified 75/25 split. The fixed random_state makes the split, and every
# metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

print("Shape for training set: ",X_train.shape)
print("\nShape for testing set: ",X_test.shape)

Shape for training set:  (6355995, 65)

Shape for testing set:  (2118666, 65)


In [11]:
# Run the downcast on each split.
# NOTE(review): the recorded output reports a 0.0% decrease for both frames,
# presumably because X was already downcast before the split; this cell
# looks redundant. Confirm before removing.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Memory usage of dataframe is 836.49 MB --> 836.49 MB (Decreased by 0.0%)
Memory usage of dataframe is 278.83 MB --> 278.83 MB (Decreased by 0.0%)


In [15]:
# Remove the names of the full-size frames now that the train/test splits exist.
# NOTE(review): running this cell a second time raises NameError because the
# names are already gone.
del X,y,df,df_scaled

### Performing PCA for dimensionality reduction

In [17]:
# Project the features onto 20 principal components. The projection is fitted
# on the training split only and then applied unchanged to the test split.
# random_state pins the result when a randomized SVD solver is selected.
pca = PCA(n_components=20, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

### Creating XGBoost DMatrix objects for memory-efficient, faster training

In [18]:
# Wrap the PCA-reduced arrays and their labels in XGBoost's DMatrix container.
D_train, D_test = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train_pca, y_train), (X_test_pca, y_test))
)

# Evaluation sets reported while boosting: the training matrix only.
watchlist = [(D_train, "train")]

### Creating XGBoost model for reduced features after PCA

In [19]:
# Booster parameters (fixed values; no search is performed in this cell).
xgb_params = {
    # Binary classification objective; predict() returns probabilities.
    "objective"        :"binary:logistic",
    "eval_metric"      :"logloss",
    "eta"              :0.2,
    "max_depth"        :5,
    "min_child_weight" :10,
    "gamma"            :0.75,
    "subsample"        :0.8,
    "colsample_bytree" :0.90,
    "alpha"            :2e-05,
    "scale_pos_weight" :5,
    "lambda"           :10
}

# Train for 80 boosting rounds, logging the watchlist metric every 10 rounds.
model = xgb.train(params=xgb_params, dtrain=D_train, num_boost_round=80, evals=watchlist, verbose_eval=10)

# Predicted probabilities for the test set, thresholded at 0.5 in one
# vectorized step instead of a Python-level loop over every row.
probability = model.predict(D_test)
predictions = (probability > 0.5).astype(int)

# The target is binary, so no "macro"/"micro" averaging is needed.
# Accuracy uses the thresholded labels; ROC-AUC uses the raw probabilities.
accuracy_XGBoost_2 = accuracy_score(y_test, predictions)
rocauc_XGBoost_2 = roc_auc_score(y_test, probability)

# Row for a comparison table of all models. Table_data is not defined in the
# cells visible here, so the append stays disabled.
data_XGBoost_2 = ['XGBoost 2', str(accuracy_XGBoost_2), str(rocauc_XGBoost_2)]
# Table_data.append(data_XGBoost_2) 

print("\nXGBoost Model 2 Evaluation Metrics:\n")
print("Accuracy Score : ",accuracy_XGBoost_2)
print("ROC-AUC Score : ",rocauc_XGBoost_2)

[0]	train-logloss:0.47426
[10]	train-logloss:0.40248
[20]	train-logloss:0.39512
[30]	train-logloss:0.39270
[40]	train-logloss:0.39145
[50]	train-logloss:0.39059
[60]	train-logloss:0.39001
[70]	train-logloss:0.38946
[79]	train-logloss:0.38913

XGBoost Model 2 Evaluation Metrics:

Accuracy Score :  0.8501505192418248
ROC-AUC Score :  0.8004388321717673
