<a href="https://colab.research.google.com/github/VishnuYelde/Whatnextweb/blob/main/ML_techniques_on_E-commerce_and_SCM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import plotly.express as px


In [3]:
df = pd.read_csv('/content/drive/MyDrive/IIT-BHU/Train.csv')

# Prune the dataset to the first 7000 rows
df = df.iloc[:7000, :]

In [4]:
print(df.head())
print(df.describe())
print(df.info())


   ID Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0   1               D           Flight                    4                2   
1   2               F           Flight                    4                5   
2   3               A           Flight                    2                2   
3   4               B           Flight                    3                3   
4   5               C           Flight                    2                2   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                  177                3                low      F   
1                  216                2                low      M   
2                  183                4                low      M   
3                  176                4             medium      M   
4                  184                3             medium      F   

   Discount_offered  Weight_in_gms  Reached.on.Time_Y.N  
0                44           1233            

In [5]:
df.drop(columns=['ID', 'Customer_care_calls'], inplace=True)

# Encode categorical columns
label_encoders = {}
categorical_columns = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [6]:
# Calculate the correlation matrix
correlation_matrix = df.corr()

correlation_with_target = correlation_matrix['Reached.on.Time_Y.N'].sort_values(ascending=False)
correlation_with_target

Reached.on.Time_Y.N    1.000000
Discount_offered       0.437405
Gender                 0.007310
Warehouse_block        0.002404
Mode_of_Shipment      -0.000275
Customer_rating       -0.000348
Product_importance    -0.004781
Prior_purchases       -0.045935
Cost_of_the_Product   -0.093484
Weight_in_gms         -0.397880
Name: Reached.on.Time_Y.N, dtype: float64

In [7]:
features = [
    'Warehouse_block', 'Mode_of_Shipment', 'Customer_rating',
    'Cost_of_the_Product', 'Prior_purchases', 'Product_importance',
    'Discount_offered', 'Weight_in_gms', 'Gender'
]
target = 'Reached.on.Time_Y.N'

In [8]:
# Split the data
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Train and Save Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import keras

In [10]:
# 1. K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
joblib.dump(knn, 'knn_model.pkl')

['knn_model.pkl']

In [11]:
# 2. Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
joblib.dump(dt, 'dt_model.pkl')

['dt_model.pkl']

In [12]:
# 3. Support Vector Machines
svm = SVC(probability=True)
svm.fit(X_train, y_train)
joblib.dump(svm, 'svm_model.pkl')

['svm_model.pkl']

In [13]:
# 4. Random Forest Classification
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
joblib.dump(rf, 'rf_model.pkl')

['rf_model.pkl']

In [14]:
# 5. Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
joblib.dump(gb, 'gb_model.pkl')

['gb_model.pkl']

In [32]:
# 6. Recurrent Neural Network (RNN)
X_train_rnn = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_rnn = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

rnn = Sequential()
rnn.add(SimpleRNN(50, activation='relu', input_shape=(1, X_train.shape[1])))
rnn.add(Dense(1, activation='sigmoid'))
rnn.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

rnn.fit(X_train_rnn, y_train, epochs=100, batch_size=64, verbose=0)
rnn.save('rnn_model.h5')


You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.



In [33]:
# Evaluate and Compare Model Performance
models = {
    'K-Nearest Neighbors': joblib.load('knn_model.pkl'),
    'Decision Tree': joblib.load('dt_model.pkl'),
    'Support Vector Machines': joblib.load('svm_model.pkl'),
    'Random Forest': joblib.load('rf_model.pkl'),
    'Gradient Boosting': joblib.load('gb_model.pkl'),
    'Recurrent Neural Network': keras.models.load_model('rnn_model.h5')
}

In [17]:
def get_user_input():
    warehouse_block = input("Enter Warehouse Block (A/B/C/D/F): ")
    mode_of_shipment = input("Enter Mode of Shipment (Flight/Ship/Road): ")
    customer_rating = int(input("Enter Customer Rating (1-5): "))
    cost_of_the_product = int(input("Enter Cost of the Product: "))
    prior_purchases = int(input("Enter Prior Purchases: "))
    product_importance = input("Enter Product Importance (low/medium/high): ")
    discount_offered = int(input("Enter Discount Offered: "))
    weight_in_gms = int(input("Enter Weight in grams: "))
    gender = input("Enter Gender (Male/Female): ")

    user_data = pd.DataFrame({
        'Warehouse_block': [warehouse_block],
        'Mode_of_Shipment': [mode_of_shipment],
        'Customer_rating': [customer_rating],
        'Cost_of_the_Product': [cost_of_the_product],
        'Prior_purchases': [prior_purchases],
        'Product_importance': [product_importance],
        'Discount_offered': [discount_offered],
        'Weight_in_gms': [weight_in_gms],
        'Gender': [gender]
    })

    for column in categorical_columns:
        le = label_encoders[column]
        user_data[column] = le.transform(user_data[column])

    return user_data

In [34]:
def predict_classification(models, user_input):
    user_input_rnn = user_input.values.reshape((user_input.shape[0], 1, user_input.shape[1]))
    predictions = {}
    for name, model in models.items():
        if name == 'Recurrent Neural Network':
            predictions[name] = int(model.predict(user_input_rnn)[0][0] > 0.5)
        else:
            predictions[name] = model.predict(user_input)[0]
    return predictions

In [19]:
# Get user input
user_input = get_user_input()

predictions = predict_classification(models, user_input)

for model_name, prediction in predictions.items():
    result = 'Yes' if prediction == 1 else 'No'
    print(f"{model_name} predicts the shipment will be on time: {result}")

Enter Warehouse Block (A/B/C/D/F): B
Enter Mode of Shipment (Flight/Ship/Road): Ship
Enter Customer Rating (1-5): 1
Enter Cost of the Product: 267
Enter Prior Purchases: 5
Enter Product Importance (low/medium/high): medium
Enter Discount Offered: 10
Enter Weight in grams: 1505
Enter Gender (Male/Female): M
K-Nearest Neighbors predicts the shipment will be on time: No
Decision Tree predicts the shipment will be on time: No
Support Vector Machines predicts the shipment will be on time: Yes
Random Forest predicts the shipment will be on time: No
Gradient Boosting predicts the shipment will be on time: No
Recurrent Neural Network predicts the shipment will be on time: Yes


In [35]:
# Evaluate and Compare Model Performance
results = {}
for name, model in models.items():
    if name == 'Recurrent Neural Network':
        y_pred = (model.predict(X_test_rnn) > 0.5).astype(int)
    else:
        y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'classification_report': report,
        'confusion_matrix': confusion
    }



In [36]:
# Print results
for name, result in results.items():
    print(f"Model: {name}")
    print(f"Accuracy: {result['accuracy']}")
    print(f"Precision: {result['precision']}")
    print(f"Recall: {result['recall']}")
    print(f"F1-Score: {result['f1']}")
    print("Classification Report:")
    print(result['classification_report'])
    print("Confusion Matrix:")
    print(result['confusion_matrix'])
    print("\n")

Model: K-Nearest Neighbors
Accuracy: 0.73
Precision: 0.8170075349838536
Recall: 0.7849017580144778
F1-Score: 0.800632911392405
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.61      0.58       433
           1       0.82      0.78      0.80       967

    accuracy                           0.73      1400
   macro avg       0.69      0.70      0.69      1400
weighted avg       0.74      0.73      0.73      1400

Confusion Matrix:
[[263 170]
 [208 759]]


Model: Decision Tree
Accuracy: 0.7342857142857143
Precision: 0.8134878819810326
Recall: 0.7983453981385729
F1-Score: 0.8058455114822546
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.59      0.58       433
           1       0.81      0.80      0.81       967

    accuracy                           0.73      1400
   macro avg       0.69      0.69      0.69      1400
weighted avg       0.74      0.73      0.74      

In [37]:
# Visualize the Performance
performance_data = {
    'Model': list(results.keys()),
    'Accuracy': [result['accuracy'] for result in results.values()],
    'Precision': [result['precision'] for result in results.values()],
    'Recall': [result['recall'] for result in results.values()],
    'F1-Score': [result['f1'] for result in results.values()]
}

performance_df = pd.DataFrame(performance_data)
# Visualize the accuracy
fig = px.bar(performance_df, x='Model', y='Accuracy', title='Model Accuracy Comparison', labels={'Accuracy':'Accuracy', 'Model':'Model'})
fig.show()

# Visualize precision
fig_precision = px.bar(performance_df, x='Model', y='Precision', title='Model Precision Comparison', labels={'Precision':'Precision', 'Model':'Model'})
fig_precision.show()

# Visualize recall
fig_recall = px.bar(performance_df, x='Model', y='Recall', title='Model Recall Comparison', labels={'Recall':'Recall', 'Model':'Model'})
fig_recall.show()

# Visualize F1-score
fig_f1 = px.bar(performance_df, x='Model', y='F1-Score', title='Model F1-Score Comparison', labels={'F1-Score':'F1-Score', 'Model':'Model'})
fig_f1.show()