In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the processed Amazon orders CSV from the Kaggle input directory.
# The file is decoded as ISO-8859-1; low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-type inference.
DATA_PATH = '/kaggle/input/amazon-processed-data/Amazon_Processed_data.csv'
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1', low_memory=False)

# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,index,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Style,SKU,Category,...,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,promotion-ids,B2B,fulfilled-by
0,0.0,405-8078784-5731545,2022-04-30,Cancelled,Merchant,Amazon.in,Standard,SET389,SET389-KR-NP-S,Set,...,1.0,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,,False,Easy Ship
1,1.0,171-9198151-1101146,2022-04-30,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,kurta,...,1.0,INR,406.0,BENGALURU,KARNATAKA,560085.0,IN,Amazon PLCC Free-Financing Universal Merchant ...,False,Easy Ship
2,2.0,404-0687676-7273146,2022-04-30,Shipped,Amazon,Amazon.in,Expedited,JNE3371,JNE3371-KR-XL,kurta,...,1.0,INR,329.0,NAVI MUMBAI,MAHARASHTRA,410210.0,IN,IN Core Free Shipping 2015/04/08 23-48-5-108,True,Unknown
3,3.0,403-9615377-8133951,2022-04-30,Cancelled,Merchant,Amazon.in,Standard,J0341,J0341-DR-L,Western Dress,...,1.0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,,False,Easy Ship
4,4.0,407-1069790-7240320,2022-04-30,Shipped,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,...,1.0,INR,574.0,CHENNAI,TAMIL NADU,600073.0,IN,,False,Unknown


In [3]:
# Columns to store as pandas 'category' dtype. Any name that is not present
# in the loaded frame is filtered out here, so a misspelled or missing column
# is skipped silently instead of raising a KeyError.
categorical_cols = [
    name
    for name in (
        'Order ID', 'Status', 'Fulfilment', 'Sales Channel',
        'ship-service-level', 'Style', 'SKU', 'Category',
        'Size', 'ASIN', 'Courier Status', 'currency',
        'ship-city', 'ship-state', 'ship-country',
        'promotion-ids', 'B2B', 'fulfilled-by',
    )
    if name in df.columns
]

# Cast all of the surviving columns to 'category' with one astype call
# (a {column: dtype} mapping) rather than assigning column by column.
df = df.astype(dict.fromkeys(categorical_cols, 'category'))

In [4]:
# Columns that will be one-hot encoded by the preprocessing step.
# 'Order ID', 'Status' (the target) and 'B2B' are not part of this list.
# NOTE(review): 'ship-postal-code' is listed here but is not among the columns
# cast to 'category' earlier, and it shows up as a float (e.g. 400081.0) in the
# data preview -- confirm that one-hot encoding it as-is is intended.
categorical_features = [
    'Fulfilment',
    'Sales Channel',
    'ship-service-level',
    'Style',
    'SKU',
    'Category',
    'Size',
    'ASIN',
    'Courier Status',
    'currency',
    'ship-city',
    'ship-state',
    'ship-postal-code',
    'ship-country',
    'promotion-ids',
    'fulfilled-by',
]

# The order status is the prediction target; X keeps every remaining column.
y = df['Status']
X = df.drop(columns='Status')

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Preprocessing for the model. All classes used here (Pipeline,
# ColumnTransformer, SimpleImputer, OneHotEncoder, StandardScaler) come from
# the notebook's top import cell rather than being re-imported in this cell.

# Categorical branch: fill missing entries with the literal string 'missing',
# then one-hot encode. handle_unknown='ignore' lets the encoder transform
# categories it did not see during fit (they are encoded as all zeros)
# instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch (assumes 'Qty' and 'Amount' are the only numeric inputs):
# fill missing values with the column mean, then standardise.
numerical_features = ['Qty', 'Amount']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its branch. Columns that appear in neither
# list are dropped, because ColumnTransformer's `remainder` defaults to 'drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [6]:
# Pipeline and LogisticRegression are already imported in the notebook's top
# import cell, so they are not re-imported here.

# Logistic-regression classifier. max_iter is raised well above the library
# default to give the solver more iterations to converge on the one-hot
# encoded feature matrix.
logreg = LogisticRegression(max_iter=10000)

# Chain preprocessing and the classifier so that imputation, scaling and
# encoding are fitted on the training data only and then re-applied unchanged
# at prediction time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logreg),
])

# Fit on the training split.
pipeline.fit(X_train, y_train)

# Predict order status for the held-out test split.
y_pred = pipeline.predict(X_test)

In [7]:
# Predict order status for the held-out test rows with the fitted pipeline.
# NOTE(review): the previous cell already ends with this exact call, so this
# cell recomputes the same predictions and can likely be removed.
y_pred = pipeline.predict(X_test)

In [8]:
# Per-class precision, recall, F1 and support on the held-out test set.
# classification_report is imported in the notebook's top import cell.
# zero_division=0 makes a metric whose denominator is zero (for example the
# precision of a class that is never predicted) report 0 without a warning.
report = classification_report(y_test, y_pred, zero_division=0)
print('Classification Report:')
print(report)

Classification Report:
                               precision    recall  f1-score   support

                    Cancelled       0.99      1.00      0.99      3631
                      Pending       0.77      0.63      0.69       132
Pending - Waiting for Pick Up       0.70      0.68      0.69        63
                      Shipped       1.00      1.00      1.00     15541
            Shipped - Damaged       0.00      0.00      0.00         1
 Shipped - Delivered to Buyer       0.91      0.99      0.95      5788
   Shipped - Out for Delivery       0.00      0.00      0.00         4
          Shipped - Picked Up       0.94      0.22      0.36       201
  Shipped - Rejected by Buyer       0.00      0.00      0.00         2
 Shipped - Returned to Seller       0.21      0.03      0.05       401
Shipped - Returning to Seller       0.00      0.00      0.00        30
                     Shipping       0.00      0.00      0.00         1

                     accuracy                       

In [9]:
# Overall fraction of test rows whose predicted status matches the true one.
# accuracy_score is already imported in the notebook's top import cell, so it
# is not re-imported here.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9722


In [10]:
# Confusion matrix for the test set: rows are the true status, columns the
# predicted status, with classes in sorted label order.
# confusion_matrix is imported in the notebook's top import cell.
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

Confusion Matrix:
[[ 3626     5     0     0     0     0     0     0     0     0     0     0]
 [   30    83    18     1     0     0     0     0     0     0     0     0]
 [    0    20    43     0     0     0     0     0     0     0     0     0]
 [   22     0     0 15519     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     1     0     0     0     0     0     0]
 [    0     0     0     0     0  5750     0     1     0    37     0     0]
 [    0     0     0     0     0     3     0     1     0     0     0     0]
 [    0     0     0     0     0   153     0    45     0     3     0     0]
 [    0     0     0     0     0     2     0     0     0     0     0     0]
 [    0     0     0     0     0   389     0     0     0    11     1     0]
 [    0     0     0     0     0    28     0     1     0     1     0     0]
 [    0     0     0     1     0     0     0     0     0     0     0     0]]


In [11]:
# Cross-validate the model on the training split, then refit and score it on
# the held-out test split. Pipeline, LogisticRegression, KFold,
# cross_val_score and accuracy_score all come from the notebook's top import
# cell rather than being re-imported in this cell.

# Build a fresh, unfitted pipeline around the existing `preprocessor`.
# This rebinds `pipeline`, replacing the one fitted in the earlier cell.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=10000)),
])

# 5-fold cross-validation with shuffling; the fixed seed keeps the fold
# assignment reproducible.
# NOTE(review): KFold does not stratify by class. The classification report
# shows classes with only a handful of test rows, so a stratified splitter may
# give more representative folds -- confirm before changing, since the
# reported scores would shift.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score fits a clone of the pipeline on each training fold and
# scores it on the matching validation fold.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='accuracy')

print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Refit on the full training split and predict the test split.
# This overwrites `y_pred` from the earlier cells.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Accuracy on the held-out test set.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {test_accuracy:.4f}')

Cross-validation scores: [0.97160302 0.97237837 0.97232991 0.97145765 0.9728145 ]
Mean CV accuracy: 0.9721166892808684
Test set accuracy: 0.9722
