In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**STEP 1: Import Required Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

**STEP 2: Load Dataset**

In [3]:
data = pd.read_csv('df_with_stockcode_rfm.csv')

In [4]:
#Check first few rows to understand the data
print(data.shape)
print(data.head())

(12052, 1557)
   UnitPrice  QuantityAbsolute  TotalAmount      Year     Month       Day  \
0  -0.787370         -0.009966    -0.468966  0.269665  1.306861 -0.678858   
1  -0.265540          1.650565     1.793874  0.269665 -1.001465  0.480271   
2  -0.787370          2.204075     0.963211  0.269665 -0.135843 -0.910684   
3   0.281438         -0.840232    -0.762562  0.269665 -0.424384 -1.606161   
4  -1.309200          0.543544    -0.824146  0.269665  0.441239 -0.215206   

   DayOfWeek      Hour StockCode  Quantity  ... wreath  writing  yellow  \
0   0.840102 -1.216770    85049A         4  ...    0.0      0.0     0.0   
1  -0.817183 -1.624390    85099B        10  ...    0.0      0.0     0.0   
2   0.287673 -1.216770     21993        12  ...    0.0      0.0     0.0   
3  -0.264755  0.821327     22699        -1  ...    0.0      0.0     0.0   
4  -0.817183 -0.401531     23002         6  ...    0.0      0.0     0.0   

   yellowblue  youre  yuletide  zinc  Recency  Frequency  Monetary  
0  

In [5]:
#Check target column BEFORE splitting
print("Unique values in IsCancelled column:")
print(data['IsCancelled'].unique())

print("\nNumber of NaN values in IsCancelled column:")
print(data['IsCancelled'].isna().sum())

Unique values in IsCancelled column:
[False  True]

Number of NaN values in IsCancelled column:
0


In [6]:
# Convert to string, strip spaces, and uppercase
data['IsCancelled'] = data['IsCancelled'].astype(str).str.strip().str.upper()

# Map to 1/0 (TRUE = 1, FALSE = 0)
data['IsCancelled'] = data['IsCancelled'].map({'TRUE': 1, 'FALSE': 0})

# Drop rows where target is NaN (invalid values)
data = data.dropna(subset=['IsCancelled'])

print("Rows remaining after cleaning target:", data.shape[0])

Rows remaining after cleaning target: 12052


In [7]:
X = data.drop(columns=['IsCancelled'])
y = data['IsCancelled']

X = pd.get_dummies(X, drop_first=True)

In [8]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (12052, 10457)
y shape: (12052,)


**STEP 3: Split train and test**

In [160]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [161]:
#Handle NaNs in features
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

**STEP 4: Standardize**

In [183]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**STEP 5:Build MLP**

In [184]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.activations import swish

model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))  # Input layer

# First hidden layer with tanh activation
model.add(Dense(8, activation='tanh'))
model.add(Dropout(0.5))  # Dropout after first hidden layer

# Second hidden layer with tanh activation
model.add(Dense(4, activation='tanh'))
model.add(Dropout(0.5))  # Dropout after second hidden layer

# Output layer (sigmoid for binary classification)
model.add(Dense(1, activation='sigmoid'))

**STEP 6:Compile the model**

In [185]:
#optimizer = Adam(learning_rate=0.0005)
model.compile(
    #optimizer = optimizer,
    optimizer='adam', # Adaptive optimizer
    loss='binary_crossentropy',    # Suitable for binary classification
    metrics=['accuracy']           # Track accuracy during training
)

**STEP 7: Train (fit) the model**

In [186]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),  # Optional, to track test performance
    epochs=30,                          # Number of passes over the data
    batch_size=32,                      # Size of each mini-batch
    verbose=1
)

Epoch 1/30
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.5335 - loss: 0.8000 - val_accuracy: 0.7350 - val_loss: 0.5603
Epoch 2/30
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6784 - loss: 0.6037 - val_accuracy: 0.8250 - val_loss: 0.4277
Epoch 3/30
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7646 - loss: 0.4850 - val_accuracy: 0.8602 - val_loss: 0.3387
Epoch 4/30
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8482 - loss: 0.3623 - val_accuracy: 0.8851 - val_loss: 0.2912
Epoch 5/30
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8986 - loss: 0.2716 - val_accuracy: 0.8951 - val_loss: 0.2715
Epoch 6/30
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9168 - loss: 0.2238 - val_accuracy: 0.9063 - val_loss: 0.2587
Epoch 7/30
[1m302/302[0m 

In [187]:
# Probabilities
y_train_prob = model.predict(X_train)
y_test_prob = model.predict(X_test)

# Convert probabilities to 0/1 labels
y_train_pred = (y_train_prob > 0.5).astype(int)
y_test_pred = (y_test_prob > 0.5).astype(int)

[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


**STEP 8: Evaluate the Model**

In [189]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, confusion_matrix, classification_report

# F1 Scores
f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

# ROC AUC Scores
roc_train = roc_auc_score(y_train, y_train_pred)
roc_test = roc_auc_score(y_test, y_test_pred)

print("\n--- Performance Comparison (Train vs. Test) ---")
print(f"F1 Score (Training): {f1_train:.4f}")
print(f"F1 Score (Test): {f1_test:.4f}")
print(f"ROC AUC Score (Training): {roc_train:.4f}")
print(f"ROC AUC Score (Test): {roc_test:.4f}")

# Accuracy, Confusion Matrix, Classification Report
print("\nAccuracy (Test):", accuracy_score(y_test, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


--- Performance Comparison (Train vs. Test) ---
F1 Score (Training): 0.9997
F1 Score (Test): 0.9246
ROC AUC Score (Training): 0.9997
ROC AUC Score (Test): 0.9278

Accuracy (Test): 0.9274160099543758

Confusion Matrix:
 [[1163  101]
 [  74 1073]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      1264
           1       0.91      0.94      0.92      1147

    accuracy                           0.93      2411
   macro avg       0.93      0.93      0.93      2411
weighted avg       0.93      0.93      0.93      2411

