In [None]:
# Import our dependencies
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import plotly.express as px
import hvplot.pandas

import warnings
# Install a blanket 'ignore' filter: every warning raised after this point is hidden.
# NOTE(review): this also hides warnings from the libraries imported above
# (e.g. deprecation or data-conversion warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# using pandas to execute SQL queries
# Import dependencies
from sqlalchemy import create_engine
from config import db_password
import psycopg2

# Create a connection to the RDS instance.
# The password is taken from `db_password` (imported from the config module above)
# instead of being written into the notebook.
# NOTE(review): the password that used to be hardcoded here should be rotated.
connection = psycopg2.connect(
    host='instacart-db.crrysho2rjsv.us-east-2.rds.amazonaws.com',
    port=5432,
    user='postgres',
    password=db_password,
    database='instacart',
)
cursor = connection.cursor()

In [None]:
# Load the source tables from the database

# Prior Products Orders table
sql = "select * from order_products_prior"
orders_prior_df = pd.read_sql(sql=sql, con=connection)

# Prior Orders table (this frame is the cell's displayed output)
sql = "select * from order_prior"
allorders_df = pd.read_sql(sql=sql, con=connection)
allorders_df

In [None]:
# Load the products table from its CSV file and preview the first rows
file_path = "../instacart-data/products.csv"
products_df = pd.read_csv(filepath_or_buffer=file_path)
products_df.head(n=5)

In [None]:
# Sample the Orders Prior Dataset for ML
#data = orders_prior_df.sample(2250000)
#data

In [None]:
# Inner-join the prior order/product rows with the orders table on order_id
# so that the order-level columns become available on every row
orders_df = pd.merge(orders_prior_df, allorders_df, how="inner", on="order_id")
orders_df

In [None]:
# Remove rows containing missing values, then remove exact duplicate rows
orders_df = orders_df.dropna().drop_duplicates()
orders_df

In [None]:
# Left-join the product columns onto every order row via product_id
orders_df = pd.merge(orders_df, products_df, how="left", on="product_id")
orders_df

In [None]:
# Number of orders per product, read from the orders_by_product table
sql = "select product_id, num_of_orders from orders_by_product"
orders_prod_df = pd.read_sql(sql=sql, con=connection)
orders_prod_df

In [None]:
# Summary statistics of the orders-per-product frame
orders_prod_df.describe()

In [None]:
# Find the ten products with the highest number of orders.
# .assign() returns a new frame, so the string cast of product_id is never
# written into a slice of the sorted frame (avoids chained assignment).
topten_ords_df = (
    orders_prod_df
    .sort_values(by="num_of_orders", ascending=False)
    .head(10)
    .assign(product_id=lambda top: top["product_id"].astype(str))
)

In [None]:
# Number of reorders per product, read from the reorders_by_product table
sql = "select product_id, num_of_reorders from reorders_by_product"
reordprod_df = pd.read_sql(sql=sql, con=connection)
reordprod_df

In [None]:
# Summary statistics of the reorders-per-product frame
reordprod_df.describe()

In [None]:
# Find top ten Reordered Products
# top_reords_df keeps every product, sorted by reorder count (highest first)
top_reords_df = reordprod_df.sort_values(by="num_of_reorders", ascending=False)
# .assign() returns a new frame, so the string cast of product_id is never
# written into a slice of top_reords_df (avoids chained assignment).
topten_reords_df = top_reords_df.head(10).assign(
    product_id=lambda top: top["product_id"].astype(str)
)
#px.bar(topten_reords_df, x='product_id', y='num_of_reorders', hover_data=["product_name"])

In [None]:
# Left-join the per-product reorder counts onto the orders rows
orders_df = pd.merge(orders_df, reordprod_df, how="left", on="product_id")
orders_df

In [None]:
# Left-join the per-product order counts onto the orders rows
orders_df = pd.merge(orders_df, orders_prod_df, how="left", on="product_id")
orders_df

In [None]:
# Fill the Num_Orders and Num_Reorders column with 0 if empty
orders_df["num_of_orders"] = orders_df["num_of_orders"].fillna(0)
orders_df["num_of_reorders"] = orders_df["num_of_reorders"].fillna(0)
orders_df.dropna()
orders_df

In [None]:
orders_df.dtypes

In [None]:
# Load the departments table from its CSV file and preview the first rows
file_path = "../instacart-data/departments.csv"
dept_df = pd.read_csv(filepath_or_buffer=file_path)
dept_df.head(n=5)

In [None]:
# Load the aisles table from its CSV file and preview the first rows
file_path = "../instacart-data/aisles.csv"
aisles_df = pd.read_csv(filepath_or_buffer=file_path)
aisles_df.head(n=5)

In [None]:
# Attach the department and aisle tables (left joins on their id columns)
orders_df = (
    orders_df
    .merge(dept_df, how="left", on="department_id")
    .merge(aisles_df, how="left", on="aisle_id")
)
orders_df

In [None]:
# Persist the consolidated frame (without the index) for ProductsClustering
orders_df.to_csv(
    path_or_buf="OrdersProducts_Consolidated.csv",
    index=None,
)

In [None]:
# Attempt 1 - remove identifier/label columns that may not contribute to ML
orders_df = orders_df.drop(
    columns=[
        "product_id",
        "aisle_id",
        "department_id",
        "user_id",
        "order_number",
        "eval_set",
        "order_id",
        "product_name",
    ]
)
orders_df

In [None]:
# Keep only the rows whose num_of_reorders is greater than 1
orders_df = orders_df[orders_df["num_of_reorders"] > 1]
orders_df

In [None]:
# Keep only the rows whose num_of_orders is greater than 5
orders_df = orders_df[orders_df["num_of_orders"] > 5]
orders_df

In [None]:
# One-hot encode the listed columns into indicator columns
final_df = pd.get_dummies(
    data=orders_df,
    columns=["department", "aisle", "order_dow", "order_hour_of_day"],
)
final_df

In [None]:
# Column dtypes of the encoded frame
final_df.dtypes

In [None]:
# Remove exact duplicate rows from the encoded frame
final_df = final_df.drop_duplicates()
#final_df.to_csv("Ordersanalysis.csv")
#final_df.drop(columns="days_since_prior_order", inplace=True)
final_df

In [None]:
# Split the Final Dataset into "Target" and "Features"
X = final_df.drop("reordered", axis=1)
y = pd.DataFrame(final_df["reordered"])

# Split the preprocessed data into a training and testing dataset.
# A fixed random_state makes the shuffle, and therefore every reported
# metric below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit a StandardScaler on the training features only; fit() returns the
# scaler itself, so both names refer to the same fitted instance
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to the train and test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

X_train_scaled

In [None]:
def setup_nn_model(input_features:int, layer1:int, layer2:int, add_layers:int, 
                   actv_fun1:str, actv_func2:str, output_func:str,)->"tf.keras.Sequential":
    """Build (but do not compile) a fully connected Sequential network.

    Layout: first hidden layer -> second hidden layer -> `add_layers` extra
    hidden layers (same size and activation as the second) -> one-unit output.

    Args:
        input_features: Number of input features given to the first hidden layer.
        layer1: Number of units in the first hidden layer.
        layer2: Number of units in the second and in every extra hidden layer.
        add_layers: Number of extra hidden layers to append; must be >= 0.
        actv_fun1: Activation of the first hidden layer.
        actv_func2: Activation of the second and extra hidden layers.
        output_func: Activation of the output layer.

    Returns:
        The assembled, uncompiled Sequential model.

    Raises:
        ValueError: If `add_layers` is negative (the previous countdown loop
            never terminated for such values).
    """
    if add_layers < 0:
        raise ValueError(f"add_layers must be >= 0, got {add_layers}")

    nn = tf.keras.models.Sequential()

    # First hidden layer (also declares the input width)
    nn.add(tf.keras.layers.Dense(units=layer1, input_dim=input_features, activation=actv_fun1))

    # Second hidden layer plus the requested number of identical extra layers
    for _ in range(1 + add_layers):
        nn.add(tf.keras.layers.Dense(units=layer2, activation=actv_func2))

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation=output_func))

    return nn


In [None]:
# Define a Training Function for the Neural Network
def train_nn(nn:"tf.keras.Model", num_epochs:int):
    """Compile, train, evaluate and save the given network.

    Uses the notebook-level X_train_scaled, y_train, X_test_scaled and y_test.
    Weight checkpoints are written under checkpoints/ and the trained model is
    saved to ProdReordering.h5. Test loss and accuracy are printed.

    Args:
        nn: The model to train (as built by setup_nn_model); compiled here.
        num_epochs: Number of epochs passed to fit().
    """
    # Import checkpoint dependencies
    import os
    from tensorflow.keras.callbacks import ModelCheckpoint

    # Define the checkpoint path and filenames
    os.makedirs("checkpoints/",exist_ok=True)
    checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

    # Create a callback that saves the model's weights.
    # save_freq is an integer, so it is counted in batches: weights are
    # written every 50000 batches, not at a fixed number of epochs.
    cp_callback = ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_weights_only=True,
        save_freq=50000)
    
    # Compile the model
    nn.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
    
    # Train the model
    nn.fit(X_train_scaled, y_train, epochs=num_epochs, callbacks=[cp_callback])
    
    # Evaluate the model using the test data
    model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
    
    # Export our model to HDF5 file
    nn.save("ProdReordering.h5")

In [None]:
# Display the training target
y_train

In [None]:
# Attempt 1

# Layer widths are derived from the number of scaled input columns
inputs = X_train_scaled.shape[1]
layer1_nodes, layer2_nodes = 3 * inputs, 2 * inputs

# Attempt 1 - relu hidden layers (one extra hidden layer), sigmoid output
nn = setup_nn_model(
    input_features=inputs,
    layer1=layer1_nodes,
    layer2=layer2_nodes,
    add_layers=1,
    actv_fun1="relu",
    actv_func2="relu",
    output_func="sigmoid",
)

# Check the structure of the model
nn.summary()

# Train the NN for 2 epochs
train_nn(nn, num_epochs=2)

# More Attempts here to follow -

In [None]:
# Attempt 2 - Drop columns
# NOTE(review): placeholder — the self-assignment below leaves final_df
# unchanged; no columns are dropped here yet.
final_df = final_df 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model.
# y_train is a one-column DataFrame; the classifier expects a 1-D target,
# so flatten it instead of passing a column vector.
log_classifier.fit(X_train_scaled, y_train.values.ravel())

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [None]:
# TODO: try XGBoost (not implemented yet)

In [None]:
from sklearn.svm import SVC

# Create the SVM model
svm = SVC(kernel='linear')

# Train the model on the SCALED training features so that training and
# prediction use the same feature representation (the model was previously
# fitted on unscaled X_train but asked to predict on X_test_scaled).
# The one-column target frame is flattened to the 1-D shape the classifier expects.
svm.fit(X_train_scaled, y_train.values.ravel())

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")