In [5]:
# Step 1: Import necessary library
import pandas as pd

# Step 2: Load the dataset
df = pd.read_csv('dataset.csv')

# Step 3: Display first 5 rows to understand data
print(df.head())

# Step 4: Drop the 'ID' column (it's just an identifier, not useful for prediction)
df = df.drop(columns=['ID'])

# Step 5: Flip the target labels
# Original meaning: 1 = Delayed, 0 = On time
# Business requirement: 1 = On time, 0 = Delayed
#
# Validate before flipping: a plain "anything that is not 1 becomes 1" rule
# would silently label missing or malformed values as "on time".
raw_target = df['Reached.on.Time_Y.N']
invalid_mask = ~raw_target.isin([0, 1])
if invalid_mask.any():
    raise ValueError(
        "Reached.on.Time_Y.N must contain only 0/1; found: "
        f"{sorted(raw_target[invalid_mask].astype(str).unique())}"
    )

# Vectorized flip: (value != 1) is True for 0 and False for 1, i.e. 0 -> 1, 1 -> 0.
df['Reached_on_Time_Y_N'] = raw_target.ne(1).astype('int64')

# Step 6: Drop old target column
df = df.drop(columns=['Reached.on.Time_Y.N'])

# Step 7: Show cleaned data
print(df.head())

   ID Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0   1               D           Flight                    4                2   
1   2               F           Flight                    4                5   
2   3               A           Flight                    2                2   
3   4               B           Flight                    3                3   
4   5               C           Flight                    2                2   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                  177                3                low      F   
1                  216                2                low      M   
2                  183                4                low      M   
3                  176                4             medium      M   
4                  184                3             medium      F   

   Discount_offered  Weight_in_gms  Reached.on.Time_Y.N  
0                44           1233            

In [7]:
# Step 1: Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Step 2: Define which columns are categorical
cat_cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']

# Step 3: Guard against re-running this cell on already-encoded data.
# The loop below overwrites the text columns with integer codes, so a second
# run would fit the encoders on those codes and replace le_dict with useless
# int -> int mappings (which are later saved for reuse). Check every column
# up front so nothing is mutated when the guard trips.
already_encoded = [name for name in cat_cols if pd.api.types.is_numeric_dtype(df[name])]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; "
        "re-run the data loading cell before encoding again."
    )

# Step 4: Create an empty dictionary to store encoders (so we can reuse them later)
le_dict = {}

# Step 5: Loop over each categorical column and apply encoding
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Convert text to numbers
    le_dict[col] = le  # Save the encoder for later use (like in API)

# Step 6: Show updated data
print(df.head())


   Warehouse_block  Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0                3                 0                    4                2   
1                4                 0                    4                5   
2                0                 0                    2                2   
3                1                 0                    3                3   
4                2                 0                    2                2   

   Cost_of_the_Product  Prior_purchases  Product_importance  Gender  \
0                  177                3                   1       0   
1                  216                2                   1       1   
2                  183                4                   1       1   
3                  176                4                   2       1   
4                  184                3                   2       0   

   Discount_offered  Weight_in_gms  Reached_on_Time_Y_N  
0                44           1233            

In [8]:
# Target (output column): the value the model has to predict
y = df['Reached_on_Time_Y_N']

# Features (input columns): everything in df except the target column
X = df.drop('Reached_on_Time_Y_N', axis=1)

# Report the dimensions of both pieces
for label, part in (("Features shape:", X), ("Target shape:", y)):
    print(label, part.shape)

Features shape: (10999, 10)
Target shape: (10999,)


In [9]:
# Splitting helper from scikit-learn
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each feature matrix
for label, features in (("Training features shape:", X_train), ("Test features shape:", X_test)):
    print(label, features.shape)

Training features shape: (8799, 10)
Test features shape: (2200, 10)


In [10]:
# Estimator and evaluation helper
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Build a seeded 100-tree random forest and train it on the training split
# (fit() returns the fitted estimator, so construction and training chain).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.65      0.70      1305
           1       0.57      0.68      0.62       895

    accuracy                           0.66      2200
   macro avg       0.66      0.67      0.66      2200
weighted avg       0.68      0.66      0.67      2200



In [11]:
# joblib handles serialization of the fitted objects
import joblib

# Write the trained model and the per-column label encoders to disk,
# one file per artifact.
for artifact, filename in ((clf, 'model.pkl'), (le_dict, 'label_encoders.pkl')):
    joblib.dump(artifact, filename)

print("Model and encoders saved!")


Model and encoders saved!
