In [1]:
# Import packages and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the data from a CSV file
data = pd.read_csv('Train.csv')
data


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,A,Ship,4,1,252,5,medium,F,1,1538,1
10995,10996,B,Ship,4,1,232,5,medium,F,6,1247,0
10996,10997,C,Ship,5,4,242,5,low,F,4,1155,0
10997,10998,F,Ship,5,2,223,6,medium,M,2,1210,0


In [3]:
unique_counts = data.nunique()
print(unique_counts)


ID                     10999
Warehouse_block            5
Mode_of_Shipment           3
Customer_care_calls        6
Customer_rating            5
Cost_of_the_Product      215
Prior_purchases            8
Product_importance         3
Gender                     2
Discount_offered          65
Weight_in_gms           4034
Reached.on.Time_Y.N        2
dtype: int64


## **Data Preprocessing**

In [4]:
# Categorical(Nominal): Categories with no specific order
# Perform one-hot encoding on categorical columns 
encoded = pd.get_dummies(data[['Warehouse_block', 'Mode_of_Shipment', 'Gender']], prefix=['Warehouse_block', 'Mode_of_Shipment', 'Gender'])

# Convert the encoded columns to integers
encoded = encoded.astype(int)

# Drop the original categorical columns and concatenate the new encoded columns to the data
data = pd.concat([data.drop(['Warehouse_block', 'Mode_of_Shipment', 'Gender'], axis=1), encoded], axis=1)
data


Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Gender_F,Gender_M
0,1,4,2,177,3,low,44,1233,1,0,0,0,1,0,1,0,0,1,0
1,2,4,5,216,2,low,59,3088,1,0,0,0,0,1,1,0,0,0,1
2,3,2,2,183,4,low,48,3374,1,1,0,0,0,0,1,0,0,0,1
3,4,3,3,176,4,medium,10,1177,1,0,1,0,0,0,1,0,0,0,1
4,5,2,2,184,3,medium,46,2484,1,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,4,1,252,5,medium,1,1538,1,1,0,0,0,0,0,0,1,1,0
10995,10996,4,1,232,5,medium,6,1247,0,0,1,0,0,0,0,0,1,1,0
10996,10997,5,4,242,5,low,4,1155,0,0,0,1,0,0,0,0,1,1,0
10997,10998,5,2,223,6,medium,2,1210,0,0,0,0,0,1,0,0,1,0,1


In [5]:
# Categorical(Ordinal): Categories with a defined order
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['Product_importance'] = label_encoder.fit_transform(data['Product_importance'])
data


Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Gender_F,Gender_M
0,1,4,2,177,3,1,44,1233,1,0,0,0,1,0,1,0,0,1,0
1,2,4,5,216,2,1,59,3088,1,0,0,0,0,1,1,0,0,0,1
2,3,2,2,183,4,1,48,3374,1,1,0,0,0,0,1,0,0,0,1
3,4,3,3,176,4,2,10,1177,1,0,1,0,0,0,1,0,0,0,1
4,5,2,2,184,3,2,46,2484,1,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,4,1,252,5,2,1,1538,1,1,0,0,0,0,0,0,1,1,0
10995,10996,4,1,232,5,2,6,1247,0,0,1,0,0,0,0,0,1,1,0
10996,10997,5,4,242,5,1,4,1155,0,0,0,1,0,0,0,0,1,1,0
10997,10998,5,2,223,6,2,2,1210,0,0,0,0,0,1,0,0,1,0,1


In [6]:
# Continuous Data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

data[['Cost_of_the_Product', 
      'Discount_offered', 
      'Weight_in_gms']] = scaler.fit_transform(data[['Cost_of_the_Product', 'Discount_offered', 'Weight_in_gms']])
data


Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Gender_F,Gender_M
0,1,4,2,-0.690722,3,1,1.889983,-1.468240,1,0,0,0,1,0,1,0,0,1,0
1,2,4,5,0.120746,2,1,2.815636,-0.333893,1,0,0,0,0,1,1,0,0,0,1
2,3,2,2,-0.565881,4,1,2.136824,-0.159002,1,1,0,0,0,0,1,0,0,0,1
3,4,3,3,-0.711529,4,2,-0.208162,-1.502484,1,0,1,0,0,0,1,0,0,0,1
4,5,2,2,-0.545074,3,2,2.013404,-0.703244,1,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,4,1,0.869792,5,2,-0.763553,-1.281730,1,1,0,0,0,0,0,0,1,1,0
10995,10996,4,1,0.453655,5,2,-0.455002,-1.459679,0,0,1,0,0,0,0,0,1,1,0
10996,10997,5,4,0.661724,5,1,-0.578423,-1.515937,0,0,0,1,0,0,0,0,1,1,0
10997,10998,5,2,0.266394,6,2,-0.701843,-1.482304,0,0,0,0,0,1,0,0,1,0,1


In [7]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

X = data.drop(columns=['Reached.on.Time_Y.N', 'ID'])
y = data['Reached.on.Time_Y.N']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


## **Train Base Models and Generate Predictions**

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', SVC(probability=True, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier())
]
# Train and evaluate each base learner
for name, model in base_learners:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(f"{name} Confusion Matrix:")
    print(cm)
    print(classification_report(y_test, y_pred))
    print("="*50)
    

rf Confusion Matrix:
[[ 916  396]
 [ 717 1271]]
              precision    recall  f1-score   support

           0       0.56      0.70      0.62      1312
           1       0.76      0.64      0.70      1988

    accuracy                           0.66      3300
   macro avg       0.66      0.67      0.66      3300
weighted avg       0.68      0.66      0.67      3300

svc Confusion Matrix:
[[1143  169]
 [ 952 1036]]
              precision    recall  f1-score   support

           0       0.55      0.87      0.67      1312
           1       0.86      0.52      0.65      1988

    accuracy                           0.66      3300
   macro avg       0.70      0.70      0.66      3300
weighted avg       0.73      0.66      0.66      3300

dt Confusion Matrix:
[[ 725  587]
 [ 574 1414]]
              precision    recall  f1-score   support

           0       0.56      0.55      0.56      1312
           1       0.71      0.71      0.71      1988

    accuracy                         

found 0 physical cores < 1
  File "C:\Users\hankc\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


## **Train Meta-Model and Final Prediction**

In [9]:
from sklearn.ensemble import StackingClassifier
# Define meta-learner (Logistic Regression)
meta_learner = LogisticRegression()

# Create the StackingClassifier
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

# Train the stacking model
stacking_model.fit(X_train, y_train)

# Evaluate the stacking model on the validation set
y_stacking_pred = stacking_model.predict(X_test)
stacking_cm = confusion_matrix(y_test, y_stacking_pred)
print("Stacking Model Confusion Matrix:")
print(stacking_cm)
print(classification_report(y_test, y_stacking_pred))


Stacking Model Confusion Matrix:
[[ 815  497]
 [ 675 1313]]
              precision    recall  f1-score   support

           0       0.55      0.62      0.58      1312
           1       0.73      0.66      0.69      1988

    accuracy                           0.64      3300
   macro avg       0.64      0.64      0.64      3300
weighted avg       0.65      0.64      0.65      3300

