In [37]:
# Import our dependencies
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import plotly.express as px
import hvplot.pandas

import warnings
warnings.filterwarnings('ignore')

In [38]:
# Read the Datasets

# Prior-order line items (one CSV row per product placed in an order).
# Note: only the last expression of a cell is rendered, so a mid-cell
# `.head()` call would be silently discarded; the frame is previewed
# later when it is sampled.
file_path = "../instacart-data/order_products__prior.csv"
orders_prior_df = pd.read_csv(file_path)

# Read the cleansed Orders dataset
file_path = "../instacart-data/Orders.csv"
allorders_df = pd.read_csv(file_path)
allorders_df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [39]:
# Load the product catalogue and preview its first rows
file_path = "../instacart-data/products.csv"
products_df = pd.read_csv(filepath_or_buffer=file_path)
products_df.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [40]:
# Sample the Orders Prior Dataset for ML.
# A fixed seed keeps the sample (and everything derived from it) reproducible
# across kernel restarts; the size is capped so a smaller input file does not
# make `sample` raise.
data = orders_prior_df.sample(
    n=min(2250000, len(orders_prior_df)),
    random_state=42,
)
data

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
21290271,2245686,21616,18,1
4082397,430788,43768,3,1
9828244,1037779,5450,8,1
20770890,2191039,22594,12,0
18240937,1924176,15290,6,1
...,...,...,...,...
8867731,936165,13733,12,1
26211806,2764239,48454,9,0
30215012,3187218,13914,9,1
21021208,2217388,11760,4,1


In [41]:
# Attach the order-level columns from the main Orders dataset to each sampled
# line item; the inner join keeps only order_ids present in both frames.
orders_df = data.merge(right=allorders_df, on="order_id", how="inner")
orders_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2245686,21616,18,1,91987,prior,70,2,10,6.0
1,2245686,30776,26,1,91987,prior,70,2,10,6.0
2,430788,43768,3,1,200964,prior,87,5,9,5.0
3,430788,10978,2,0,200964,prior,87,5,9,5.0
4,430788,22395,1,0,200964,prior,87,5,9,5.0
...,...,...,...,...,...,...,...,...,...,...
2249995,3212304,36961,1,1,28855,prior,59,1,7,4.0
2249996,2618483,8209,2,1,20069,prior,22,6,8,8.0
2249997,936165,13733,12,1,17679,prior,57,6,12,0.0
2249998,2764239,48454,9,0,102654,prior,25,1,14,9.0


In [42]:
# Discard rows with any missing value, then exact duplicate rows
orders_df = orders_df.dropna().drop_duplicates()
orders_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2245686,21616,18,1,91987,prior,70,2,10,6.0
1,2245686,30776,26,1,91987,prior,70,2,10,6.0
2,430788,43768,3,1,200964,prior,87,5,9,5.0
3,430788,10978,2,0,200964,prior,87,5,9,5.0
4,430788,22395,1,0,200964,prior,87,5,9,5.0
...,...,...,...,...,...,...,...,...,...,...
2249995,3212304,36961,1,1,28855,prior,59,1,7,4.0
2249996,2618483,8209,2,1,20069,prior,22,6,8,8.0
2249997,936165,13733,12,1,17679,prior,57,6,12,0.0
2249998,2764239,48454,9,0,102654,prior,25,1,14,9.0


In [43]:
# Bring in the product attributes for each line item (left join on product_id)
orders_df = orders_df.merge(right=products_df, on="product_id", how="left")
orders_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
0,2245686,21616,18,1,91987,prior,70,2,10,6.0,Organic Baby Arugula,123,4
1,2245686,30776,26,1,91987,prior,70,2,10,6.0,Organic Raw Kombucha Gingerade,31,7
2,430788,43768,3,1,200964,prior,87,5,9,5.0,Organic Bell Pepper,83,4
3,430788,10978,2,0,200964,prior,87,5,9,5.0,Organic Tomato Puree,81,15
4,430788,22395,1,0,200964,prior,87,5,9,5.0,Tomato Sauce,81,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105367,3212304,36961,1,1,28855,prior,59,1,7,4.0,Pink Cripps Apple,24,4
2105368,2618483,8209,2,1,20069,prior,22,6,8,8.0,Organic Weightless Cranberry Herbal Tea,94,7
2105369,936165,13733,12,1,17679,prior,57,6,12,0.0,Red Potatoes,83,4
2105370,2764239,48454,9,0,102654,prior,25,1,14,9.0,Manwich Original Sloppy Joe Sauce,59,15


In [44]:
# using pandas to execute SQL queries
# Import dependencies
from sqlalchemy import create_engine
from config import db_password
import psycopg2

# Create a connection to the RDS instance.
# The password comes from the local `config` module so that no credential is
# stored in (or leaked through) the notebook itself.
connection = psycopg2.connect(
    host='instacart-db.crrysho2rjsv.us-east-2.rds.amazonaws.com',
    port=5432,
    user='postgres',
    password=db_password,
    database='instacart'
    )
cursor = connection.cursor()

In [45]:
# Fetch the number of orders per product from the orders_by_product table
sql = "select product_id, num_of_orders from orders_by_product"
orders_prod_df = pd.read_sql(sql=sql, con=connection)
orders_prod_df

Unnamed: 0,product_id,num_of_orders
0,24852,18726
1,13176,15480
2,21137,10894
3,21903,9784
4,47626,8135
...,...,...
39118,46000,1
39119,45995,1
39120,46205,1
39121,45993,1


In [46]:
# Summary statistics of the per-product order counts
orders_prod_df.describe()

Unnamed: 0,product_id,num_of_orders
count,39123.0,39123.0
mean,24832.580324,35.391381
std,14347.936821,222.533412
min,1.0,1.0
25%,12453.0,2.0
50%,24836.0,5.0
75%,37262.5,18.0
max,49688.0,18726.0


In [47]:
# Find the ten most-ordered products.
# `.copy()` detaches the slice from orders_prod_df so the dtype change below
# neither raises SettingWithCopyWarning nor depends on view/copy semantics.
topten_ords_df = (
    orders_prod_df
    .sort_values(by="num_of_orders", ascending=False)
    .head(10)
    .copy()
)
# product_id is cast to str so it is treated as a label rather than a number
topten_ords_df['product_id'] = topten_ords_df['product_id'].astype(str)

In [48]:
# Fetch the number of reorders per product from the reorders_by_product table
sql = "select product_id, num_of_reorders from reorders_by_product"
reordprod_df = pd.read_sql(sql=sql, con=connection)
reordprod_df

Unnamed: 0,product_id,num_of_reorders
0,24852,16557
1,13176,13362
2,21137,8603
3,21903,8055
4,47766,6226
...,...,...
39118,10871,0
39119,15256,0
39120,1792,0
39121,33246,0


In [49]:
# Summary statistics of the per-product reorder counts
reordprod_df.describe()

Unnamed: 0,product_id,num_of_reorders
count,39123.0,39123.0
mean,24832.580324,21.185083
std,14347.936821,172.122957
min,1.0,0.0
25%,12453.0,1.0
50%,24836.0,2.0
75%,37262.5,8.0
max,49688.0,16557.0


In [50]:
# Find top ten Reordered Products
top_reords_df = reordprod_df.sort_values(by="num_of_reorders", ascending=False)
# `.copy()` detaches the ten-row slice so the assignment below does not
# trigger SettingWithCopyWarning.
topten_reords_df = top_reords_df.head(10).copy()
# product_id is cast to str so it is treated as a label rather than a number
topten_reords_df['product_id'] = topten_reords_df['product_id'].astype(str)

In [51]:
# Add each product's reorder count to the line items (left join on product_id)
orders_df = orders_df.merge(right=reordprod_df, on="product_id", how="left")
orders_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,num_of_reorders
0,2245686,21616,18,1,91987,prior,70,2,10,6.0,Organic Baby Arugula,123,4,2071.0
1,2245686,30776,26,1,91987,prior,70,2,10,6.0,Organic Raw Kombucha Gingerade,31,7,644.0
2,430788,43768,3,1,200964,prior,87,5,9,5.0,Organic Bell Pepper,83,4,552.0
3,430788,10978,2,0,200964,prior,87,5,9,5.0,Organic Tomato Puree,81,15,16.0
4,430788,22395,1,0,200964,prior,87,5,9,5.0,Tomato Sauce,81,15,268.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105367,3212304,36961,1,1,28855,prior,59,1,7,4.0,Pink Cripps Apple,24,4,34.0
2105368,2618483,8209,2,1,20069,prior,22,6,8,8.0,Organic Weightless Cranberry Herbal Tea,94,7,2.0
2105369,936165,13733,12,1,17679,prior,57,6,12,0.0,Red Potatoes,83,4,240.0
2105370,2764239,48454,9,0,102654,prior,25,1,14,9.0,Manwich Original Sloppy Joe Sauce,59,15,17.0


In [52]:
# Add each product's order count to the line items (left join on product_id)
orders_df = orders_df.merge(right=orders_prod_df, on="product_id", how="left")
orders_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,num_of_reorders,num_of_orders
0,2245686,21616,18,1,91987,prior,70,2,10,6.0,Organic Baby Arugula,123,4,2071.0,2923.0
1,2245686,30776,26,1,91987,prior,70,2,10,6.0,Organic Raw Kombucha Gingerade,31,7,644.0,855.0
2,430788,43768,3,1,200964,prior,87,5,9,5.0,Organic Bell Pepper,83,4,552.0,943.0
3,430788,10978,2,0,200964,prior,87,5,9,5.0,Organic Tomato Puree,81,15,16.0,54.0
4,430788,22395,1,0,200964,prior,87,5,9,5.0,Tomato Sauce,81,15,268.0,542.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105367,3212304,36961,1,1,28855,prior,59,1,7,4.0,Pink Cripps Apple,24,4,34.0,57.0
2105368,2618483,8209,2,1,20069,prior,22,6,8,8.0,Organic Weightless Cranberry Herbal Tea,94,7,2.0,3.0
2105369,936165,13733,12,1,17679,prior,57,6,12,0.0,Red Potatoes,83,4,240.0,525.0
2105370,2764239,48454,9,0,102654,prior,25,1,14,9.0,Manwich Original Sloppy Joe Sauce,59,15,17.0,37.0


In [55]:
# Fill the num_of_orders and num_of_reorders columns with 0 where the left
# joins found no matching product.
orders_df = orders_df.fillna({"num_of_orders": 0, "num_of_reorders": 0})
# dropna() returns a new frame; the result must be assigned, otherwise the
# call is a no-op and rows with missing values stay in the dataset.
orders_df = orders_df.dropna()
orders_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,num_of_reorders,num_of_orders
0,2245686,21616,18,1,91987,prior,70,2,10,6.0,Organic Baby Arugula,123,4,2071.0,2923.0
1,2245686,30776,26,1,91987,prior,70,2,10,6.0,Organic Raw Kombucha Gingerade,31,7,644.0,855.0
2,430788,43768,3,1,200964,prior,87,5,9,5.0,Organic Bell Pepper,83,4,552.0,943.0
3,430788,10978,2,0,200964,prior,87,5,9,5.0,Organic Tomato Puree,81,15,16.0,54.0
4,430788,22395,1,0,200964,prior,87,5,9,5.0,Tomato Sauce,81,15,268.0,542.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105367,3212304,36961,1,1,28855,prior,59,1,7,4.0,Pink Cripps Apple,24,4,34.0,57.0
2105368,2618483,8209,2,1,20069,prior,22,6,8,8.0,Organic Weightless Cranberry Herbal Tea,94,7,2.0,3.0
2105369,936165,13733,12,1,17679,prior,57,6,12,0.0,Red Potatoes,83,4,240.0,525.0
2105370,2764239,48454,9,0,102654,prior,25,1,14,9.0,Manwich Original Sloppy Joe Sauce,59,15,17.0,37.0


In [56]:
# Inspect column dtypes after the merges
orders_df.dtypes

order_id                    int64
product_id                  int64
add_to_cart_order           int64
reordered                   int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
product_name               object
aisle_id                    int64
department_id               int64
num_of_reorders           float64
num_of_orders             float64
dtype: object

In [57]:
# Load the department lookup table and preview it
file_path = "../instacart-data/departments.csv"
dept_df = pd.read_csv(filepath_or_buffer=file_path)
dept_df.head(n=5)

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [58]:
# Load the aisle lookup table and preview it
file_path = "../instacart-data/aisles.csv"
aisles_df = pd.read_csv(filepath_or_buffer=file_path)
aisles_df.head(n=5)

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [59]:
# Resolve department_id and aisle_id to their names via two left joins
orders_df = (
    orders_df
    .merge(right=dept_df, on="department_id", how="left")
    .merge(right=aisles_df, on="aisle_id", how="left")
)
orders_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,num_of_reorders,num_of_orders,department,aisle
0,2245686,21616,18,1,91987,prior,70,2,10,6.0,Organic Baby Arugula,123,4,2071.0,2923.0,produce,packaged vegetables fruits
1,2245686,30776,26,1,91987,prior,70,2,10,6.0,Organic Raw Kombucha Gingerade,31,7,644.0,855.0,beverages,refrigerated
2,430788,43768,3,1,200964,prior,87,5,9,5.0,Organic Bell Pepper,83,4,552.0,943.0,produce,fresh vegetables
3,430788,10978,2,0,200964,prior,87,5,9,5.0,Organic Tomato Puree,81,15,16.0,54.0,canned goods,canned jarred vegetables
4,430788,22395,1,0,200964,prior,87,5,9,5.0,Tomato Sauce,81,15,268.0,542.0,canned goods,canned jarred vegetables
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105367,3212304,36961,1,1,28855,prior,59,1,7,4.0,Pink Cripps Apple,24,4,34.0,57.0,produce,fresh fruits
2105368,2618483,8209,2,1,20069,prior,22,6,8,8.0,Organic Weightless Cranberry Herbal Tea,94,7,2.0,3.0,beverages,tea
2105369,936165,13733,12,1,17679,prior,57,6,12,0.0,Red Potatoes,83,4,240.0,525.0,produce,fresh vegetables
2105370,2764239,48454,9,0,102654,prior,25,1,14,9.0,Manwich Original Sloppy Joe Sauce,59,15,17.0,37.0,canned goods,canned meals beans


In [60]:
# Persist the consolidated frame (without the index) for ProductsClustering
orders_df.to_csv(path_or_buf="OrdersProducts_Consolidated.csv", index=False)

In [23]:
# Attempt 1 - Drop the columns that may not contribute to ML.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
orders_df = orders_df.drop(
    columns=["product_id", "aisle_id", "department_id", 'user_id',
             "order_number", "eval_set", "order_id", "product_name"],
    errors="ignore",
)
orders_df

Unnamed: 0,add_to_cart_order,reordered,order_dow,order_hour_of_day,days_since_prior_order,num_of_reorders,num_of_orders,department,aisle
0,4,1,3,10,4.0,16557.0,18726.0,produce,fresh fruits
1,33,1,6,14,7.0,397.0,686.0,produce,fresh vegetables
2,34,1,6,14,7.0,2308.0,3526.0,produce,fresh herbs
3,9,0,6,14,7.0,55.0,120.0,household,dish detergents
4,13,1,6,14,7.0,352.0,486.0,dairy eggs,yogurt
...,...,...,...,...,...,...,...,...,...
2105914,16,0,0,15,3.0,11.0,105.0,pantry,spices seasonings
2105915,19,0,5,10,5.0,1391.0,1880.0,produce,fresh fruits
2105916,17,1,5,19,4.0,160.0,260.0,deli,lunch meat
2105917,1,0,4,17,7.0,14.0,19.0,alcohol,red wines


In [24]:
# Keep only rows whose product has more than one reorder
orders_df = orders_df[orders_df["num_of_reorders"] > 1]
orders_df

Unnamed: 0,add_to_cart_order,reordered,order_dow,order_hour_of_day,days_since_prior_order,num_of_reorders,num_of_orders,department,aisle
0,4,1,3,10,4.0,16557.0,18726.0,produce,fresh fruits
1,33,1,6,14,7.0,397.0,686.0,produce,fresh vegetables
2,34,1,6,14,7.0,2308.0,3526.0,produce,fresh herbs
3,9,0,6,14,7.0,55.0,120.0,household,dish detergents
4,13,1,6,14,7.0,352.0,486.0,dairy eggs,yogurt
...,...,...,...,...,...,...,...,...,...
2105914,16,0,0,15,3.0,11.0,105.0,pantry,spices seasonings
2105915,19,0,5,10,5.0,1391.0,1880.0,produce,fresh fruits
2105916,17,1,5,19,4.0,160.0,260.0,deli,lunch meat
2105917,1,0,4,17,7.0,14.0,19.0,alcohol,red wines


In [25]:
# Keep only rows whose product has more than five orders
orders_df = orders_df[orders_df["num_of_orders"] > 5]
orders_df

Unnamed: 0,add_to_cart_order,reordered,order_dow,order_hour_of_day,days_since_prior_order,num_of_reorders,num_of_orders,department,aisle
0,4,1,3,10,4.0,16557.0,18726.0,produce,fresh fruits
1,33,1,6,14,7.0,397.0,686.0,produce,fresh vegetables
2,34,1,6,14,7.0,2308.0,3526.0,produce,fresh herbs
3,9,0,6,14,7.0,55.0,120.0,household,dish detergents
4,13,1,6,14,7.0,352.0,486.0,dairy eggs,yogurt
...,...,...,...,...,...,...,...,...,...
2105914,16,0,0,15,3.0,11.0,105.0,pantry,spices seasonings
2105915,19,0,5,10,5.0,1391.0,1880.0,produce,fresh fruits
2105916,17,1,5,19,4.0,160.0,260.0,deli,lunch meat
2105917,1,0,4,17,7.0,14.0,19.0,alcohol,red wines


In [26]:
# One-hot encode the categorical columns (including day-of-week and hour,
# which are integer-coded categories rather than magnitudes)
final_df = pd.get_dummies(
    data=orders_df,
    columns=["department", "aisle", "order_dow", "order_hour_of_day"],
)
final_df

Unnamed: 0,add_to_cart_order,reordered,days_since_prior_order,num_of_reorders,num_of_orders,department_alcohol,department_babies,department_bakery,department_beverages,department_breakfast,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
0,4,1,4.0,16557.0,18726.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33,1,7.0,397.0,686.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,34,1,7.0,2308.0,3526.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,9,0,7.0,55.0,120.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,13,1,7.0,352.0,486.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105914,16,0,3.0,11.0,105.0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2105915,19,0,5.0,1391.0,1880.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2105916,17,1,4.0,160.0,260.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2105917,1,0,7.0,14.0,19.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [27]:
# Inspect column dtypes of the encoded frame
final_df.dtypes

add_to_cart_order           int64
reordered                   int64
days_since_prior_order    float64
num_of_reorders           float64
num_of_orders             float64
                           ...   
order_hour_of_day_19        uint8
order_hour_of_day_20        uint8
order_hour_of_day_21        uint8
order_hour_of_day_22        uint8
order_hour_of_day_23        uint8
Length: 191, dtype: object

In [28]:
# Discard exact duplicate rows from the encoded frame
final_df = final_df.drop_duplicates()
final_df

Unnamed: 0,add_to_cart_order,reordered,days_since_prior_order,num_of_reorders,num_of_orders,department_alcohol,department_babies,department_bakery,department_beverages,department_breakfast,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
0,4,1,4.0,16557.0,18726.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33,1,7.0,397.0,686.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,34,1,7.0,2308.0,3526.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,9,0,7.0,55.0,120.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,13,1,7.0,352.0,486.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105914,16,0,3.0,11.0,105.0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2105915,19,0,5.0,1391.0,1880.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2105916,17,1,4.0,160.0,260.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2105917,1,0,7.0,14.0,19.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [29]:
# Split the Final Dataset into "Target" and "Features"
X = final_df.drop("reordered", axis=1)
y = pd.DataFrame(final_df["reordered"])

# Split the preprocessed data into a training and testing dataset.
# random_state makes the split reproducible; stratify keeps the proportion of
# reordered / not-reordered rows the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [30]:
# Fit a StandardScaler on the training features only, then apply the same
# transformation to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

X_train_scaled

array([[-0.62316578, -0.58817872, -0.38332575, ..., -0.16045502,
        -0.14262753, -0.11340948],
       [ 0.21574073, -0.58817872, -0.41323131, ..., -0.16045502,
        -0.14262753, -0.11340948],
       [ 0.21574073, -0.58817872,  2.16719066, ..., -0.16045502,
        -0.14262753, -0.11340948],
       ...,
       [ 1.61391824,  0.32251867, -0.36410076, ..., -0.16045502,
        -0.14262753, -0.11340948],
       [-0.76298354, -0.70201589,  0.5633986 , ..., -0.16045502,
        -0.14262753, -0.11340948],
       [-0.90280129, -0.47434155, -0.40810464, ..., -0.16045502,
        -0.14262753, -0.11340948]])

In [31]:
def setup_nn_model(input_features:int, layer1:int, layer2:int, add_layers:int, 
                   actv_fun1:str, actv_func2:str, output_func:str,)->"tf.keras.Model":
    """Build (but do not compile) a fully connected Sequential network.

    Args:
        input_features: Number of input features fed to the first layer.
        layer1: Number of units in the first hidden layer.
        layer2: Number of units in the second hidden layer and in every
            additional hidden layer.
        add_layers: How many extra hidden layers (same size and activation
            as the second one) to append. Values <= 0 add none.
        actv_fun1: Activation of the first hidden layer.
        actv_func2: Activation of the second and additional hidden layers.
        output_func: Activation of the single-unit output layer.

    Returns:
        The assembled Sequential model.
    """
    nn = tf.keras.models.Sequential()

    # First hidden layer (also declares the input width)
    nn.add(tf.keras.layers.Dense(units=layer1, input_dim=input_features, activation=actv_fun1))

    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=layer2, activation=actv_func2))

    # Optional extra hidden layers. A bounded range replaces the former
    # `while add_layers != 0` countdown, which never terminated when a
    # negative count was passed.
    for _ in range(max(add_layers, 0)):
        nn.add(tf.keras.layers.Dense(units=layer2, activation=actv_func2))

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation=output_func))

    return nn


In [32]:
# Define a Training Function for the Neural Network
def train_nn(nn: "tf.keras.Model", num_epochs: int) -> None:
    """Compile, train, evaluate and save a Keras model.

    Trains on the notebook-level ``X_train_scaled`` / ``y_train`` and
    evaluates on ``X_test_scaled`` / ``y_test``; those names must already
    exist when this function is called.

    Side effects: writes weight checkpoints under ``checkpoints/``, prints
    the test loss and accuracy, and saves the model to ``ProdReordering.h5``.

    Args:
        nn: The uncompiled model to train.
        num_epochs: Number of training epochs.
    """
    # Import checkpoint dependencies
    import os
    from tensorflow.keras.callbacks import ModelCheckpoint

    # Define the checkpoint path and filenames
    os.makedirs("checkpoints/",exist_ok=True)
    checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

    # Create a callback that saves the model's weights. An integer save_freq
    # is counted in batches (not epochs), so weights are written once every
    # 50000 training batches.
    cp_callback = ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_weights_only=True,
        save_freq=50000)
    
    # Compile the model
    nn.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
    
    # Train the model (the returned History object is not needed here)
    nn.fit(X_train_scaled, y_train, epochs=num_epochs, callbacks=[cp_callback])
    
    # Evaluate the model using the test data
    model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
    
    # Export our model to HDF5 file
    nn.save("ProdReordering.h5")

In [33]:
# Inspect the training target
y_train

Unnamed: 0,reordered
1636519,0
940333,1
1181337,0
4619,1
1085931,1
...,...
728264,1
728814,1
687486,0
2104137,1


In [34]:
# Attempt 1: size the hidden layers as multiples of the input width
inputs = X_train_scaled.shape[1]
layer1_nodes = inputs * 3
layer2_nodes = inputs * 2

# Build the network: relu hidden layers (one extra), sigmoid output
nn = setup_nn_model(
    input_features=inputs,
    layer1=layer1_nodes,
    layer2=layer2_nodes,
    add_layers=1,
    actv_fun1="relu",
    actv_func2="relu",
    output_func="sigmoid",
)

# Check the structure of the model
nn.summary()

# Train the NN for two epochs
train_nn(nn, num_epochs=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 570)               108870    
_________________________________________________________________
dense_1 (Dense)              (None, 380)               216980    
_________________________________________________________________
dense_2 (Dense)              (None, 380)               144780    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 381       
Total params: 471,011
Trainable params: 471,011
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
 4256/45744 [=>............................] - ETA: 4:13 - loss: 0.6144 - accuracy: 0.6661
Epoch 00002: saving model to checkpoints\weights.02.hdf5
15249/15249 - 42s - loss: 0.6122 - accuracy: 0.6685
Loss: 0.6122277975082397, Ac

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [36]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model.
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target, so
# it is flattened here instead of relying on an implicit (warned) conversion.
log_classifier.fit(X_train_scaled, y_train.values.ravel())

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.667


In [None]:
# Use XGBoost

In [None]:
from sklearn.svm import SVC

# Create the SVM model
# NOTE(review): kernel SVC training time grows steeply with the number of
# rows; confirm it is feasible on this training set before running.
svm = SVC(kernel='linear')

# Train the model on the SCALED features. It was previously fit on the raw
# X_train while predictions used X_test_scaled, so the model was evaluated
# in a different feature space than it was trained in.
svm.fit(X_train_scaled, y_train.values.ravel())

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")