## Machine Learning

### Data Loading & Preprocessing

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the cleaned retail pricing CSV into a dataframe
sales_csv_path = "retail_price_cleaned(colab).csv"
sales_df = pd.read_csv(sales_csv_path)

# Preview the first five rows
sales_df.head(n=5)

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
0,bed1,bed_bath_table,01-05-2017,1,45.95,15.1,45.95,4.0,57,5,...,89.9,3.9,15.011897,215.0,4.4,8.76,45.95,4.0,15.1,45.9
1,bed1,bed_bath_table,01-06-2017,3,137.85,12.933333,45.95,4.0,61,6,...,89.9,3.9,14.769216,209.0,4.4,21.322,45.95,4.0,12.933333,45.95
2,bed1,bed_bath_table,01-07-2017,6,275.7,14.84,45.95,4.0,123,7,...,89.9,3.9,13.993833,205.0,4.4,22.195932,45.95,4.0,14.84,45.95
3,bed1,bed_bath_table,01-08-2017,4,183.8,14.2875,45.95,4.0,90,8,...,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.2875,45.95
4,bed1,bed_bath_table,01-09-2017,2,91.9,15.1,45.95,4.0,54,9,...,89.9,3.9,18.776522,163.39871,4.4,24.324687,45.95,4.0,15.1,45.95


In [None]:
# Print each column's non-null count and dtype to check for missing values
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   product_id             676 non-null    object 
 1   product_category_name  676 non-null    object 
 2   month_year             676 non-null    object 
 3   qty_sold               676 non-null    int64  
 4   total_price            676 non-null    float64
 5   freight_price          676 non-null    float64
 6   unit_price             676 non-null    float64
 7   product_rating         676 non-null    float64
 8   no_customers           676 non-null    int64  
 9   month                  676 non-null    int64  
 10  year                   676 non-null    int64  
 11  seasonality            676 non-null    float64
 12  volume                 676 non-null    int64  
 13  comp1_price            676 non-null    float64
 14  comp1_prod_rating      676 non-null    float64
 15  comp1_

In [None]:
## drop month_year column because we already have "month" and "year" columns separately
## errors="ignore" keeps this cell safe to re-run after the column has already been removed
sales_df = sales_df.drop(columns=["month_year"], errors="ignore")

sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   product_id             676 non-null    object 
 1   product_category_name  676 non-null    object 
 2   qty_sold               676 non-null    int64  
 3   total_price            676 non-null    float64
 4   freight_price          676 non-null    float64
 5   unit_price             676 non-null    float64
 6   product_rating         676 non-null    float64
 7   no_customers           676 non-null    int64  
 8   month                  676 non-null    int64  
 9   year                   676 non-null    int64  
 10  seasonality            676 non-null    float64
 11  volume                 676 non-null    int64  
 12  comp1_price            676 non-null    float64
 13  comp1_prod_rating      676 non-null    float64
 14  comp1_freight_price    676 non-null    float64
 15  comp2_

#### Taking the product ID and category columns into account

In [None]:
## transform "product_id", "product_category_name", "month", "year" with get_dummies
## dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 defaults to bool,
## and bool columns mixed with float columns make `DataFrame.values` an object
## array, which the model cannot be trained on
id_dummies = pd.get_dummies(sales_df["product_id"], dtype=int)
cat_dummies = pd.get_dummies(sales_df["product_category_name"], dtype=int)
month_dummies = pd.get_dummies(sales_df["month"], dtype=int)
year_dummies = pd.get_dummies(sales_df["year"], dtype=int)

year_dummies.tail()

Unnamed: 0,2017,2018
671,1,0
672,1,0
673,1,0
674,1,0
675,1,0


In [None]:
## attach each dummy frame to the original dataframe, matching rows on the index
encoded_df = sales_df
for dummy_frame in (id_dummies, cat_dummies, month_dummies, year_dummies):
    encoded_df = encoded_df.merge(dummy_frame, left_index=True, right_index=True)

## the raw columns are now represented by their dummy encodings, so drop them
encoded_source_cols = ["product_id", "product_category_name", "month", "year"]
sales_df = encoded_df.drop(columns=encoded_source_cols)

sales_df.head()

Unnamed: 0,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,seasonality,volume,comp1_price,comp1_prod_rating,...,5,6,7,8,9,10,11,12,2017,2018
0,1,45.95,15.1,45.95,4.0,57,10.267394,3800,89.9,3.9,...,1,0,0,0,0,0,0,0,1,0
1,3,137.85,12.933333,45.95,4.0,61,6.503115,3800,89.9,3.9,...,0,1,0,0,0,0,0,0,1,0
2,6,275.7,14.84,45.95,4.0,123,12.071651,3800,89.9,3.9,...,0,0,1,0,0,0,0,0,1,0
3,4,183.8,14.2875,45.95,4.0,90,9.293873,3800,89.9,3.9,...,0,0,0,1,0,0,0,0,1,0
4,2,91.9,15.1,45.95,4.0,54,5.555556,3800,89.9,3.9,...,0,0,0,0,1,0,0,0,1,0


In [None]:
## apply standard scaler to the non-categorical columns
## NOTE(review): the scaler is fit on every row before the train/test split below,
## and "lag_price" (visible in the data preview) is not in this list - confirm both are intended
numeric_columns = [
    "qty_sold", "total_price", "freight_price", "unit_price", "product_rating",
    "no_customers", "seasonality", "volume",
    "comp1_price", "comp1_prod_rating", "comp1_freight_price",
    "comp2_price", "comp2_prod_rating", "comp2_freight_price",
    "comp3_price", "comp3_prod_rating", "comp3_freight_price",
]
scaler = StandardScaler()
sales_scaled = scaler.fit_transform(sales_df[numeric_columns])

sales_scaled

array([[-0.87451838, -0.81039907, -0.55410682, ..., -0.80134715,
        -0.00888387, -0.51816308],
       [-0.7449175 , -0.75630413, -0.76917431, ..., -0.80134715,
        -0.00888387, -0.91002479],
       [-0.55051617, -0.67516173, -0.57991492, ..., -0.80134715,
        -0.00888387, -0.56518649],
       ...,
       [ 2.88390721,  6.28201202,  0.15024902, ..., -0.80134715,
        -0.00888387, -0.56518649],
       [ 2.43030412,  5.26957145, -0.12600162, ..., -0.80134715,
        -0.00888387, -0.66511122],
       [ 1.1342953 ,  2.23658204,  0.36155335, ..., -0.80134715,
        -0.00888387, -0.51816308]])

In [None]:
# create a dataframe with the scaled data
# The column labels are passed as ONE flat list: a nested list ([[...]]) makes pandas
# build a MultiIndex, so selecting a column would return a DataFrame instead of a Series.
# Reusing sales_df's index keeps the scaled rows aligned with the rows they came from.
sales_df_scaled = pd.DataFrame(
    sales_scaled,
    index=sales_df.index,
    columns=[
        "qty_sold", "total_price", "freight_price", "unit_price", "product_rating",
        "no_customers", "seasonality", "volume",
        "comp1_price", "comp1_prod_rating", "comp1_freight_price",
        "comp2_price", "comp2_prod_rating", "comp2_freight_price",
        "comp3_price", "comp3_prod_rating", "comp3_freight_price",
    ],
)
sales_df_scaled.head()

Unnamed: 0,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,seasonality,volume,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price
0,-0.874518,-0.810399,-0.554107,-0.795344,-0.368786,-0.38749,-0.367202,-0.748922,0.21813,-2.134452,-0.381476,2.468819,1.335416,-1.536064,-0.801347,-0.008884,-0.518163
1,-0.744917,-0.756304,-0.769174,-0.795344,-0.368786,-0.322984,-0.682959,-0.748922,0.21813,-2.134452,-0.407294,2.347471,1.335416,0.42081,-0.801347,-0.008884,-0.910025
2,-0.550516,-0.675162,-0.579915,-0.795344,-0.368786,0.676861,-0.215856,-0.748922,0.21813,-2.134452,-0.489785,2.266573,1.335416,0.556949,-0.801347,-0.008884,-0.565186
3,-0.680117,-0.729257,-0.634757,-0.795344,-0.368786,0.144685,-0.448863,-0.748922,0.21813,-2.134452,-0.419259,2.155536,1.335416,0.123413,-0.801347,-0.008884,-0.665111
4,-0.809718,-0.783352,-0.554107,-0.795344,-0.368786,-0.435869,-0.762442,-0.748922,0.21813,-2.134452,0.019034,1.425202,1.335416,0.88856,-0.801347,-0.008884,-0.518163


In [None]:
# replace the original data with the columns of scaled data
# (one assignment per column, in the same order the scaler received them)
scaled_column_names = (
    "qty_sold", "total_price", "freight_price", "unit_price", "product_rating",
    "no_customers", "seasonality", "volume",
    "comp1_price", "comp1_prod_rating", "comp1_freight_price",
    "comp2_price", "comp2_prod_rating", "comp2_freight_price",
    "comp3_price", "comp3_prod_rating", "comp3_freight_price",
)
for column_name in scaled_column_names:
    sales_df[column_name] = sales_df_scaled[column_name]

sales_df.head()

Unnamed: 0,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,seasonality,volume,comp1_price,comp1_prod_rating,...,5,6,7,8,9,10,11,12,2017,2018
0,-0.874518,-0.810399,-0.554107,-0.795344,-0.368786,-0.38749,-0.367202,-0.748922,0.21813,-2.134452,...,1,0,0,0,0,0,0,0,1,0
1,-0.744917,-0.756304,-0.769174,-0.795344,-0.368786,-0.322984,-0.682959,-0.748922,0.21813,-2.134452,...,0,1,0,0,0,0,0,0,1,0
2,-0.550516,-0.675162,-0.579915,-0.795344,-0.368786,0.676861,-0.215856,-0.748922,0.21813,-2.134452,...,0,0,1,0,0,0,0,0,1,0
3,-0.680117,-0.729257,-0.634757,-0.795344,-0.368786,0.144685,-0.448863,-0.748922,0.21813,-2.134452,...,0,0,0,1,0,0,0,0,1,0
4,-0.809718,-0.783352,-0.554107,-0.795344,-0.368786,-0.435869,-0.762442,-0.748922,0.21813,-2.134452,...,0,0,0,0,1,0,0,0,1,0


In [None]:
# Split preprocessed data into features and target arrays
# Target: the (standardized) unit price; features: every other column.
y = sales_df["unit_price"].values
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which emits a
# FutureWarning on pandas 1.x and is rejected with a TypeError on pandas 2.x
X = sales_df.drop(columns=["unit_price"]).values

# Split the preprocessed data into a training and testing dataset
# (fixed random_state so the split is reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  X = sales_df.drop(["unit_price"], 1).values


### Neural Networks

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train.shape[1]  # one input per feature column
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu")
)

# Output layer
# The target is the standardized unit price, which takes negative values. A ReLU
# output can never go below zero, so a linear (identity) output is used instead.
nn.add(
    tf.keras.layers.Dense(units=1, activation="linear")
)

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 80)                2560      
                                                                 
 dense_4 (Dense)             (None, 30)                2430      
                                                                 
 dense_5 (Dense)             (None, 1)                 31        
                                                                 
Total params: 5,021
Trainable params: 5,021
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile the model
# The target (standardized unit price) is continuous, so this is a regression problem:
# binary cross-entropy and accuracy only make sense for 0/1 labels (they produce a
# negative loss and 0.0 accuracy here). Mean squared error is the training loss and
# mean absolute error is tracked as a readable metric, so evaluation reports [mse, mae].
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

In [None]:
# Train the model on the training split and keep the returned training history
training_epochs = 100
fit_model = nn.fit(x=X_train, y=y_train, epochs=training_epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Evaluate the model using the test data
# return_dict=True keys every score by the name it was compiled under, so the printed
# report cannot mislabel a metric (a hard-coded "Accuracy" label is only correct when
# accuracy is the compiled metric).
test_scores = nn.evaluate(X_test, y_test, verbose=2, return_dict=True)

# Keep the original variable names defined for any later cells
model_loss = test_scores["loss"]
model_accuracy = test_scores.get("accuracy")  # None when accuracy is not a compiled metric

print(", ".join(f"{name}: {score}" for name, score in test_scores.items()))

6/6 - 0s - loss: -1.3061e+00 - accuracy: 0.0000e+00 - 178ms/epoch - 30ms/step
Loss: -1.3060539960861206, Accuracy: 0.0
