In [7]:
# Import our dependencies
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import plotly.express as px
import hvplot.pandas

import warnings
warnings.filterwarnings('ignore')

In [8]:
file_path = "../instacart-data/order_products__train.csv"
orders_train_df = pd.read_csv(file_path)
orders_train_df.head()

file_path = "../instacart-data/order_products__prior.csv"
orders_prior_df = pd.read_csv(file_path)
orders_prior_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [9]:
#orders_df = orders_prior_df.merge(orders_train_df, how="outer")
orders_df = orders_train_df
orders_df.count()

order_id             1384617
product_id           1384617
add_to_cart_order    1384617
reordered            1384617
dtype: int64

In [11]:
X = orders_df.drop("reordered", axis=1)
#X = X.drop("add_to_cart_order", axis=1)
y = pd.DataFrame(orders_df["reordered"])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled

array([[-0.0947968 ,  0.50470409,  0.16737691],
       [-0.55660613, -1.65592449,  0.8399563 ],
       [ 1.38277954,  1.35028752, -0.37068661],
       ...,
       [-0.60540596, -0.27947895, -0.50520249],
       [ 1.47349359,  0.12206696,  1.91608334],
       [-1.71929323, -0.46070575, -0.90875012]])

In [13]:
def setup_nn_model(input_features:int, layer1:int, layer2:int, add_layers:int, 
                   actv_fun1:str, actv_func2:str, output_func:str,)->float:

    # Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
    number_input_features = input_features
    hidden_nodes_layer1 = layer1
    hidden_nodes_layer2 = layer2

    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features ,activation=actv_fun1))

    # Second hidden layer
    nn.add(
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=actv_func2 )
    )
    if (add_layers == 1):
        nn.add(
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=actv_func2 )
    )
        
    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation=output_func))
    
    return nn


In [14]:
# Define a Training Function for the Neural Network
def train_nn(model:float, num_epochs:int):
    # Import checkpoint dependencies
    import os
    from tensorflow.keras.callbacks import ModelCheckpoint

    # Define the checkpoint path and filenames
    os.makedirs("checkpoints/",exist_ok=True)
    checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

    # Create a callback that saves the model's weights every 5 epochs
    cp_callback = ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_weights_only=True,
        save_freq=50000)
    
    # Compile the model
    nn.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
    
    # Train the model
    fit_model = nn.fit(X_train_scaled, y_train, epochs=num_epochs, callbacks=[cp_callback])
    
    # Evaluate the model using the test data
    model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
    
    # Export our model to HDF5 file
    nn.save("ProdReordering.h5")

In [None]:
# Attempt 1 - 
nn = setup_nn_model(len(X_train_scaled[0]), 9, 6, 0, "relu", "relu", "sigmoid")

# Check the structure of the model
nn.summary()

#Train the NN
train_nn(nn, 50)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 9)                 36        
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 60        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 7         
Total params: 103
Trainable params: 103
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 00002: saving model to checkpoints\weights.02.hdf5
Epoch 3/50
Epoch 4/50
 2620/32452 [=>............................] - ETA: 49s - loss: 0.6619 - accuracy: 0.6035
Epoch 00004: saving model to checkpoints\weights.04.hdf5
Epoch 5/50
Epoch 00005: saving model to checkpoints\weights.05.hdf5
Epoch 6/50
Epoch 7/50
 5268/32452 [===>..........................] - ETA

Epoch 38/50
Epoch 39/50
Epoch 00039: saving model to checkpoints\weights.39.hdf5
Epoch 40/50
Epoch 41/50
 1897/32452 [>.............................] - ETA: 45s - loss: 0.6585 - accuracy: 0.6068
Epoch 00041: saving model to checkpoints\weights.41.hdf5
Epoch 42/50
Epoch 00042: saving model to checkpoints\weights.42.hdf5
Epoch 43/50
Epoch 44/50
 4555/32452 [===>..........................] - ETA: 32s - loss: 0.6597 - accuracy: 0.6074
Epoch 00044: saving model to checkpoints\weights.44.hdf5
Epoch 45/50
Epoch 00045: saving model to checkpoints\weights.45.hdf5
Epoch 46/50
Epoch 47/50
 7196/32452 [=====>........................] - ETA: 38s - loss: 0.6590 - accuracy: 0.6063
Epoch 00047: saving model to checkpoints\weights.47.hdf5
Epoch 48/50
Epoch 00048: saving model to checkpoints\weights.48.hdf5
Epoch 49/50
Epoch 50/50
Epoch 00050: saving model to checkpoints\weights.50.hdf5

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [16]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

  return f(*args, **kwargs)


 Logistic regression model accuracy: 0.598


In [None]:
# Use XGBoost

In [None]:
from sklearn.svm import SVC

# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

  return f(*args, **kwargs)
