In [None]:
# Import our dependencies

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import plotly.express as px
import hvplot.pandas


In [2]:
# Folder holding the cleansed Instacart CSV extracts, relative to this notebook.
DATA_DIR = "../instacart-data"


def load_clean_csv(file_name):
    """Read one CSV from DATA_DIR and return it without null-containing rows.

    Parameters
    ----------
    file_name : str
        File name inside DATA_DIR, e.g. "products.csv".

    Returns
    -------
    pandas.DataFrame
        The file contents with every row that has at least one null removed.
        The original row index labels are kept (no reset).
    """
    return pd.read_csv(f"{DATA_DIR}/{file_name}").dropna()


# Orders (one row per order).
# NOTE(review): dropping nulls here presumably removes orders that have no
# days_since_prior_order value — confirm that this is intended.
orders_df = load_clean_csv("Orders.csv")

# Order line items (order_id / product_id pairs) from the train split.
orders_train_df = load_clean_csv("order_products__train.csv")

# Product, department and aisle lookup tables.
products_df = load_clean_csv("products.csv")
dept_df = load_clean_csv("departments.csv")
aisles_df = load_clean_csv("aisles.csv")

In [3]:
# Bind a second name to the cleaned orders frame (an alias, not a copy) and
# display it. The name is rebound to a filtered frame in the following cell.
orders_temp_df = orders_df
orders_temp_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [4]:
# Keep only the orders that have a days_since_prior_order value.
has_prior_gap = orders_df["days_since_prior_order"].notna()
orders_temp_df = orders_df[has_prior_gap]
orders_temp_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [5]:
# Left-join the train line items onto the orders by order_id. Orders without a
# matching line item are kept, with nulls in the columns coming from the right.
total_df = pd.merge(
    orders_temp_df,
    orders_train_df,
    on="order_id",
    how="left",
)

In [6]:
# Enrich every line item with its product, department and aisle attributes.
# The order matters: department_id and aisle_id only exist after the product join.
total_df = (
    total_df
    .merge(products_df, on="product_id", how="left")
    .merge(dept_df, on="department_id", how="left")
    .merge(aisles_df, on="aisle_id", how="left")
)

In [7]:
# Single-column frame of product names, keeping total_df's row labels and
# leaving out the rows where no product was matched (null name).
productnames = total_df["product_name"]
productnames_df = productnames.to_frame().dropna()
productnames_df

Unnamed: 0,product_name
9,Soda
10,Organic String Cheese
11,0% Greek Strained Yogurt
12,XL Pick-A-Size Paper Towel Rolls
13,Milk Chocolate Almonds
...,...
4468277,Fabric Softener Sheets
4468278,Dark Chocolate Mint Snacking Chocolate
4468279,Phish Food Frozen Yogurt
4468280,French Baguette Bread


In [None]:
# Create a new DataFrame that holds only the department names.
# The merged frame carries the department name in a column called "department"
# (the label the cleaning cell drops from total_df), so read that column and
# expose it as "department_name". Rows with no department (null) are left out;
# total_df's row labels are kept.
deptnames = total_df["department"].rename("department_name")
deptnames_df = deptnames.to_frame().dropna()
deptnames_df

In [9]:
# Clean the dataset: discard rows containing any null, then remove the text
# columns so that only numeric columns remain.
total_df = (
    total_df
    .dropna()
    .drop(columns=["eval_set", "product_name", "department", "aisle"])
)

In [11]:
# Display the merged frame after null rows and text columns were removed.
total_df

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id
9,1187899,1,11,4,8,14.0,196.0,1.0,1.0,77.0,7.0
10,1187899,1,11,4,8,14.0,25133.0,2.0,1.0,21.0,16.0
11,1187899,1,11,4,8,14.0,38928.0,3.0,1.0,120.0,16.0
12,1187899,1,11,4,8,14.0,26405.0,4.0,1.0,54.0,17.0
13,1187899,1,11,4,8,14.0,39657.0,5.0,1.0,45.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...
4468277,272231,206209,14,6,14,30.0,40603.0,4.0,0.0,75.0,17.0
4468278,272231,206209,14,6,14,30.0,15655.0,5.0,0.0,45.0,19.0
4468279,272231,206209,14,6,14,30.0,42606.0,6.0,0.0,37.0,1.0
4468280,272231,206209,14,6,14,30.0,37966.0,7.0,0.0,112.0,3.0


In [None]:
# Calculate the number of ordered line items per product_id.
# A vectorised value_counts() replaces the nested loops. product_id is cast to
# int64 because the left merges turned it into a float column.
# NOTE(review): assumes products_df.product_id is an integer column — confirm.
orders_per_product = (
    total_df["product_id"]
    .dropna()
    .astype("int64")
    .value_counts()
)

# One row per catalogue product; products that never appear in total_df get 0.
product_orders = products_df[["product_id"]].copy()
product_orders["orders"] = (
    product_orders["product_id"]
    .map(orders_per_product)
    .fillna(0)
    .astype("int64")
)
product_orders
    

In [17]:
# Use get_dummies() to create variables for text features.
# NOTE(review): disabled experiment. As written it would not run at this point:
# "department", "aisle" and "product_name" are dropped from total_df by the
# cleaning cell, and the result of X.drop(...) is never assigned to anything.
#X = pd.get_dummies(total_df, columns=["department", "aisle"])
#X.drop("product_name", axis=1)

In [12]:
# Build the modelling frame from total_df, dropping any row that contains a null.
# (No columns are removed here; the text columns were already dropped from
# total_df by the cleaning cell.)
final_df = total_df.dropna()
final_df

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id
9,1187899,1,11,4,8,14.0,196.0,1.0,1.0,77.0,7.0
10,1187899,1,11,4,8,14.0,25133.0,2.0,1.0,21.0,16.0
11,1187899,1,11,4,8,14.0,38928.0,3.0,1.0,120.0,16.0
12,1187899,1,11,4,8,14.0,26405.0,4.0,1.0,54.0,17.0
13,1187899,1,11,4,8,14.0,39657.0,5.0,1.0,45.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...
4468277,272231,206209,14,6,14,30.0,40603.0,4.0,0.0,75.0,17.0
4468278,272231,206209,14,6,14,30.0,15655.0,5.0,0.0,45.0,19.0
4468279,272231,206209,14,6,14,30.0,42606.0,6.0,0.0,37.0,1.0
4468280,272231,206209,14,6,14,30.0,37966.0,7.0,0.0,112.0,3.0


In [13]:
# Drop the order-level columns so only the product-level features remain.
# The result is reassigned instead of using inplace=True, so the frame shown
# by the previous cell is not altered behind the reader's back.
# errors="ignore" lets the cell be re-run after the columns are already gone.
final_df = final_df.drop(
    columns=[
        "order_id",
        "user_id",
        "order_number",
        "order_dow",
        "order_hour_of_day",
        "days_since_prior_order",
    ],
    errors="ignore",
)
final_df

Unnamed: 0,product_id,add_to_cart_order,reordered,aisle_id,department_id
9,196.0,1.0,1.0,77.0,7.0
10,25133.0,2.0,1.0,21.0,16.0
11,38928.0,3.0,1.0,120.0,16.0
12,26405.0,4.0,1.0,54.0,17.0
13,39657.0,5.0,1.0,45.0,19.0
...,...,...,...,...,...
4468277,40603.0,4.0,0.0,75.0,17.0
4468278,15655.0,5.0,0.0,45.0,19.0
4468279,42606.0,6.0,0.0,37.0,1.0
4468280,37966.0,7.0,0.0,112.0,3.0


In [14]:
# Elbow curve for Products dataset
k = list(range(1, 11))

# Fit one K-means model per candidate cluster count and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(final_df).inertia_
    for n_clusters in k
]

# Tabulate k against inertia and draw the Elbow Curve with hvPlot.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve - Products", xticks=k)

In [None]:
# Create an instance of K-means for Product clustering
# NOTE(review): n_clusters=2 is presumably read off the elbow curve — confirm.
# random_state=0 pins the random centroid initialisation so runs are repeatable.
kmeans_pmodel = KMeans(n_clusters=2, random_state=0)
kmeans_pmodel

In [None]:
# Train the product-clustering model on final_df, then assign each of those
# same rows to its nearest centroid (fit() returns the fitted estimator).
predictions = kmeans_pmodel.fit(final_df).predict(final_df)
predictions

In [None]:
# Create a new DataFrame including predicted clusters and Products features.
# Work on a copy: a plain assignment would alias final_df, so the text and
# label columns added below would leak into the K-means input frame.
clustered_df = final_df.copy()

# Add "product_name". Column assignment aligns on the row index, which
# productnames_df shares with final_df (both derive from total_df).
clustered_df["product_name"] = productnames_df["product_name"]

# Add "department_name" by looking each department_id up in dept_df
# (productnames_df has no department column). department_id is cast to int64
# because the left merges left it as a float column.
department_lookup = dept_df.set_index("department_id")["department"]
clustered_df["department_name"] = (
    clustered_df["department_id"].astype("int64").map(department_lookup)
)

# Add "Class", holding the predicted cluster label of every row.
clustered_df["Class"] = predictions

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

In [None]:
# Boolean row masks for the two clusters. K-means labels are integers, so they
# are compared with 0 / 1; comparing with the strings '0' / '1' never matches.
Class_0 = clustered_df["Class"] == 0
Class_1 = clustered_df["Class"] == 1

# Rows per cluster as (class 0, class 1). len() of a mask would only return
# the total row count, so the True values are summed instead.
int(Class_0.sum()), int(Class_1.sum())

In [None]:
# Create a hvplot.scatter plot of the clusters.
# x and y were left as empty strings, which are not columns of clustered_df.
# NOTE(review): product_id / add_to_cart_order are a reviewer's pick from the
# clustered feature columns — swap in whichever pair you want to inspect.
# A fixed-seed sample keeps the interactive plot small enough to render.
scatter_sample = clustered_df.sample(
    n=min(len(clustered_df), 20_000),
    random_state=0,
)
scatter_sample.hvplot.scatter(
    x="product_id",
    y="add_to_cart_order",
    by="Class",
    hover_cols=["product_name"],
)

In [None]:
# Creating a 3D-Scatter of the clustered features, coloured by cluster.
# No PCA step exists in the cells above, so the raw feature columns are used.
# x, y and z were left as empty strings, which are not columns of clustered_df.
# NOTE(review): the three axes below are a reviewer's pick — adjust as needed.
# A fixed-seed sample keeps the figure small enough to render.
scatter3d_sample = clustered_df.sample(
    n=min(len(clustered_df), 20_000),
    random_state=0,
)
fig = px.scatter_3d(
    scatter3d_sample,
    x="product_id",
    y="aisle_id",
    z="add_to_cart_order",
    #hover_name = "product_name",
    #hover_data =  ["department_name"],
    color="Class",
    symbol="Class",
    width=800,
)
# Pin the legend to the top-left corner of the figure.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [35]:
# Export the clustered dataset to ProductClusters.csv in the working directory.
# The DataFrame index is written too, as the first column.
# NOTE(review): the TypeError recorded under this cell mentions an 'axis'
# argument that this call does not pass, so that output looks stale — re-run.
clustered_df.to_csv("ProductClusters.csv")

TypeError: to_csv() got an unexpected keyword argument 'axis'