# Predictive Modeling

In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Read the cleaned Amazon sales report from disk into the working DataFrame `df`.
# The file is decoded as ISO-8859-1; low_memory=False has pandas read the whole
# file before inferring column dtypes instead of inferring them chunk by chunk.
csv_path = 'cleaned_Amazon_Sale_Report.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-1', low_memory=False)

# Preview the first rows as this cell's output.
df.head()


Unnamed: 0,index,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Style,SKU,Category,...,Courier Status,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,promotion-ids,B2B
0,0,405-8078784-5731545,2022-04-30,Cancelled,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,...,Shipped,0,INR,647.62,MUMBAI,MAHARASHTRA,400081,IN,Unknown,False
1,1,171-9198151-1101146,2022-04-30,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,...,Shipped,1,INR,406.0,BENGALURU,KARNATAKA,560085,IN,Amazon PLCC Free-Financing Universal Merchant ...,False
2,2,404-0687676-7273146,2022-04-30,Shipped,Amazon,Amazon.in,Expedited,JNE3371,JNE3371-KR-XL,kurta,...,Shipped,1,INR,329.0,NAVI MUMBAI,MAHARASHTRA,410210,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,True
3,3,403-9615377-8133951,2022-04-30,Cancelled,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,...,Shipped,0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008,IN,Unknown,False
4,4,407-1069790-7240320,2022-04-30,Shipped,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,...,Shipped,1,INR,574.0,CHENNAI,TAMIL NADU,600073,IN,Unknown,False


In [2]:
# Columns that should be stored with the pandas 'category' dtype.
label_like_columns = (
    'Order ID',
    'Status',
    'Fulfilment',
    'ship-service-level',
    'Style',
    'SKU',
    'Category',
    'Size',
    'ASIN',
    'Courier Status',
    'currency',
    'ship-city',
    'ship-state',
    'ship-country',
    'promotion-ids',
    'B2B',
)

# Keep only the names that are actually present in `df`, so a column missing
# from the CSV is skipped instead of raising a KeyError below.
categorical_cols = [name for name in label_like_columns if name in df.columns]

# Re-type each surviving column in place as 'category'.
for name in categorical_cols:
    df[name] = df[name].astype('category')

In [3]:
# Categorical predictors handed to the one-hot branch of the preprocessor.
# 'B2B' is not part of this list.
categorical_features = [
    # how the order is fulfilled and shipped
    'Fulfilment',
    'ship-service-level',
    # what was ordered
    'Style',
    'SKU',
    'Category',
    'Size',
    'ASIN',
    # courier state and currency
    'Courier Status',
    'currency',
    # destination
    'ship-city',
    'ship-state',
    'ship-postal-code',
    'ship-country',
    # promotions applied to the order
    'promotion-ids',
]

# Numeric predictors handed to the scaling branch of the preprocessor.
numerical_features = [
    'Qty',
    'Amount',
]

In [4]:
# 'Status' is the label to predict; every other column stays in the feature frame.
target_column = 'Status'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Categorical branch: fill gaps with the literal string 'missing', then one-hot
# encode. handle_unknown='ignore' lets transform accept categories that were not
# seen during fit instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numerical branch: fill gaps with the column mean, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Route each feature list to its branch. Columns named in neither list are not
# passed through (ColumnTransformer's default `remainder` is 'drop').
column_branches = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [6]:
# Candidate classifiers, keyed by the display name the evaluation cells look up.
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Build one preprocessing + classifier pipeline per model.
# Each pipeline receives its own unfitted copy of `preprocessor`: Pipeline.fit
# fits its steps in place, so sharing a single preprocessor object would mean
# that fitting one pipeline silently re-fits the transformer used by the others.
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

In [7]:
# Train the Logistic Regression pipeline and score it on the held-out split.
model_name = 'Logistic Regression'
pipeline = pipelines[model_name]

# Header identifying which model the following numbers belong to.
print(f'Model: {model_name}', '------------------------', sep='\n')

# Pipeline.fit returns the fitted pipeline, so fitting on the training split and
# predicting the test split can be chained. `y_pred` stays at notebook scope for
# the report cell that follows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Share of test rows whose predicted status equals the true status.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')


Model: Logistic Regression
------------------------
Accuracy: 0.9702


In [8]:
# Per-class precision / recall / F1 for the predictions currently held in
# `y_pred` (set by the preceding Logistic Regression cell). zero_division=0
# reports 0 for an undefined metric instead of emitting a warning.
report = classification_report(y_test, y_pred, zero_division=0)
print('Classification Report:', report, sep='\n')

# Raw confusion counts: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', cm, sep='\n')


Classification Report:
                               precision    recall  f1-score   support

                    Cancelled       0.99      1.00      0.99      2190
                      Pending       0.69      0.60      0.64       125
Pending - Waiting for Pick Up       0.68      0.63      0.65        67
                      Shipped       1.00      1.00      1.00     15452
 Shipped - Delivered to Buyer       0.90      0.99      0.95      5743
    Shipped - Lost in Transit       0.00      0.00      0.00         2
   Shipped - Out for Delivery       0.00      0.00      0.00         9
          Shipped - Picked Up       0.91      0.18      0.31       217
  Shipped - Rejected by Buyer       0.00      0.00      0.00         1
 Shipped - Returned to Seller       0.21      0.02      0.04       390
Shipped - Returning to Seller       0.00      0.00      0.00        34

                     accuracy                           0.97     24230
                    macro avg       0.49      0.40  

In [11]:
# Train the Decision Tree pipeline and score it on the held-out split.
model_name = 'Decision Tree'
pipeline = pipelines[model_name]

# Header identifying which model the following numbers belong to.
print(f'Model: {model_name}', '------------------------', sep='\n')

# Pipeline.fit returns the fitted pipeline, so fitting on the training split and
# predicting the test split can be chained. `y_pred` stays at notebook scope for
# the report cell that follows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Share of test rows whose predicted status equals the true status.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')


Model: Decision Tree
------------------------
Accuracy: 0.9676


In [12]:
# Per-class precision / recall / F1 for the predictions currently held in
# `y_pred` (set by the preceding Decision Tree cell). zero_division=0 reports 0
# for an undefined metric instead of emitting a warning.
report = classification_report(y_test, y_pred, zero_division=0)
print('Classification Report:', report, sep='\n')

# Raw confusion counts: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', cm, sep='\n')


Classification Report:
                               precision    recall  f1-score   support

                    Cancelled       0.99      0.99      0.99      2190
                      Pending       0.67      0.65      0.66       125
Pending - Waiting for Pick Up       0.73      0.64      0.68        67
                      Shipped       1.00      1.00      1.00     15452
 Shipped - Delivered to Buyer       0.92      0.96      0.94      5743
    Shipped - Lost in Transit       0.00      0.00      0.00         2
   Shipped - Out for Delivery       0.29      0.22      0.25         9
          Shipped - Picked Up       0.82      0.58      0.68       217
  Shipped - Rejected by Buyer       0.00      0.00      0.00         1
 Shipped - Returned to Seller       0.16      0.08      0.11       390
Shipped - Returning to Seller       0.14      0.09      0.11        34

                     accuracy                           0.97     24230
                    macro avg       0.52      0.47  

In [14]:
# Train the Random Forest pipeline and score it on the held-out split.
model_name = 'Random Forest'
pipeline = pipelines[model_name]

# Header identifying which model the following numbers belong to.
print(f'Model: {model_name}', '------------------------', sep='\n')

# Pipeline.fit returns the fitted pipeline, so fitting on the training split and
# predicting the test split can be chained. `y_pred` stays at notebook scope for
# the report cell that follows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Share of test rows whose predicted status equals the true status.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')


Model: Random Forest
------------------------
Accuracy: 0.9723


In [15]:
# Per-class precision / recall / F1 for the predictions currently held in
# `y_pred` (set by the preceding Random Forest cell). zero_division=0 reports 0
# for an undefined metric instead of emitting a warning.
report = classification_report(y_test, y_pred, zero_division=0)
print('Classification Report:', report, sep='\n')

# Raw confusion counts: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', cm, sep='\n')


Classification Report:
                               precision    recall  f1-score   support

                    Cancelled       0.99      1.00      0.99      2190
                      Pending       0.75      0.61      0.67       125
Pending - Waiting for Pick Up       0.71      0.73      0.72        67
                      Shipped       1.00      1.00      1.00     15452
 Shipped - Delivered to Buyer       0.91      1.00      0.95      5743
    Shipped - Lost in Transit       0.00      0.00      0.00         2
   Shipped - Out for Delivery       0.00      0.00      0.00         9
          Shipped - Picked Up       0.92      0.30      0.45       217
  Shipped - Rejected by Buyer       0.00      0.00      0.00         1
 Shipped - Returned to Seller       0.25      0.01      0.01       390
Shipped - Returning to Seller       0.50      0.03      0.06        34

                     accuracy                           0.97     24230
                    macro avg       0.55      0.42  