In [42]:
import pandas as pd

# Load the dataset and treat "NULL" as missing
dataset = pd.read_csv("Cargo.csv", na_values=["NULL"])

# Print the length of the dataset
print("Dataset length:", len(dataset))

# Show the first few rows of the dataset
print(dataset.head())

# Display data types of the columns
print(dataset.dtypes)

# Display the shape of the dataset
print(dataset.shape)

Dataset length: 1000
              id product underwriter broker_id company_id  agent_id  \
0  n4c52a64d0464    boat        BHSI       NaN        NaN       NaN   
1  c3de6b17abe0d    boat        BHSI       NaN        NaN       NaN   
2  n3575ef11c846    boat        BHSI       NaN        NaN       NaN   
3  n99c684a6d889    boat        BHSI       NaN        NaN       NaN   
4  h25703bafabdf    boat        BHSI       NaN        NaN       NaN   

  policy_number  cover_version invoice_number transaction_type  ...  \
0           NaN              1            NaN              new  ...   
1           NaN              1            NaN              new  ...   
2           NaN              1            NaN              new  ...   
3           NaN              1            NaN              new  ...   
4           NaN              1            NaN              new  ...   

  Unnamed: 268 Unnamed: 269  Unnamed: 270 Unnamed: 271  Unnamed: 272  \
0          NaN          NaN           NaN          Na

In [43]:
# Drop all 'Unnamed:' columns
dataset = dataset.loc[:, ~dataset.columns.str.startswith('Unnamed')]

# Drop columns with more than 70% missing data
missing_threshold = 0.70
mostly_missing_cols = dataset.columns[dataset.isnull().mean() > missing_threshold].tolist()
dataset.drop(mostly_missing_cols, axis=1, inplace=True)

# Drop specific named columns if they exist
named_columns_to_drop = [
    'broker_id', 'company_id', 'agent_id', 'status', 'fsl', 'bound_timestamp', 'cbs',
    'product', 'underwriter', 'cover_version', 'agent_comm', 'agent_gst', 'rater',
    'old', 'wording', 'ahm_comm', 'refer_reason', 'purchase_price', 'discount_code',
    'id', 'group_id', 'cid', 'quote_id', 'calculation', 'ip', 'url',
    'transaction_type', 'direct', 'allow_renew', 'accounting', 'funding', 'premium', 'premium_gst', 'annual_premium_gst', 'fee',
    'comm', 'comm_gst', 'comm_premium', 'discount_gst', 'renew_sent',
    'fsl_ratio', 'gst_ratio', 'stamp_ratio', 'referral_timestamp', 'liability'
]

columns_present = [col for col in named_columns_to_drop if col in dataset.columns]
dataset.drop(columns=columns_present, axis=1, inplace=True)

dataset.to_excel("Cargo_cleaned_final.xlsx", index=False)

dataset = pd.read_excel("Cargo_cleaned_final.xlsx")

# Confirm cleanup
print("Cleaned dataset shape:", dataset.shape)






Cleaned dataset shape: (1000, 25)


In [44]:
import pandas as pd

# Step 1: Load the Excel file
dataset = pd.read_excel("Cargo_cleaned_final.xlsx")

# Step 2: Drop malformed 'category' rows
malformed_value = 'LegalName:{"EffectiveFrom":null'
dataset = dataset[dataset['category'] != malformed_value]

# Step 3: Drop rows with missing values
dataset.dropna(inplace=True)

# Step 4: Drop duplicate rows
dataset.drop_duplicates(inplace=True)

# Step 5: Save the cleaned dataset
dataset.to_excel("Cargo_cleaned_final_cleaned.xlsx", index=False)

# Step 6: Confirm shape
print("✅ Final cleaned dataset shape:", dataset.shape)


✅ Final cleaned dataset shape: (767, 25)


In [45]:
# ✅ Use this instead:
dataset = pd.read_excel("Cargo_cleaned_final_cleaned.xlsx")

# Step 2: Replace 'dec' with 'no' in 'allow' (declined = bad credit)
dataset['allow'] = dataset['allow'].replace('dec', 'no')

# Step 3: Binary mapping
binary_map = {'yes': 1, 'no': 0}
dataset['allow'] = dataset['allow'].map(binary_map)
dataset['pro_built'] = dataset['pro_built'].map(binary_map)
dataset['water_skiers'] = dataset['water_skiers'].map(binary_map)

# Step 4: Handle quote_timestamp (extract features)
if 'quote_timestamp' in dataset.columns:
    dataset['quote_timestamp'] = pd.to_datetime(dataset['quote_timestamp'], errors='coerce')
    dataset['quote_hour'] = dataset['quote_timestamp'].dt.hour
    dataset['quote_dayofweek'] = dataset['quote_timestamp'].dt.dayofweek
    dataset['quote_month'] = dataset['quote_timestamp'].dt.month
    dataset.drop('quote_timestamp', axis=1, inplace=True)

# Step 5: Convert power_speed to categories
if 'power_speed' in dataset.columns:
    dataset['power_speed'] = dataset['power_speed'].replace('RecordLastUpdatedDate:null}', pd.NA)
    speed_bins = {
        'Up to 10 knots': 'Low',
        'Up to 20 knots': 'Low',
        'Up to 30 knots': 'Medium',
        'Up to 40 knots': 'Medium',
        'Up to 50 knots': 'High',
        'Up to 61 knots': 'High'
    }
    dataset['power_speed_category'] = dataset['power_speed'].map(speed_bins)
    dataset.drop('power_speed', axis=1, inplace=True)
    dataset = pd.get_dummies(dataset, columns=['power_speed_category'], drop_first=True)

# Step 6: One-hot encode selected nominal columns
desired_nominal_columns = [
    'boat_type', 'hull_material', 'storage_method',
    'storage_state', 'underwriter_id', 'cover_type'
]
nominal_columns_present = [col for col in desired_nominal_columns if col in dataset.columns]
dataset = pd.get_dummies(dataset, columns=nominal_columns_present, drop_first=True)

# Step 7: Convert any remaining boolean columns to 0/1
bool_cols = dataset.select_dtypes(include='bool').columns
dataset[bool_cols] = dataset[bool_cols].astype(int)

# Step 8: Save the final version
dataset.to_excel("Cargo_final_timestamp_speed_encoded.xlsx", index=False)

# Step 9: Confirm shape
print("Final dataset shape:", dataset.shape)



Final dataset shape: (767, 53)


In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# (plus any other imports you already have: pandas as pd, etc.)

# Step 1: Drop rows with any missing values (NaNs)
dataset_clean = dataset.dropna()

# Step 2: Ensure only numeric features are used
X = dataset_clean.drop('allow', axis=1)
y = dataset_clean['allow']

# Filter only numeric columns
X_numeric = X.select_dtypes(include=['number'])

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Normalize
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# Step 5: Confirm shape
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape: ", X_test_scaled.shape)




X_train_scaled shape: (580, 51)
X_test_scaled shape:  (146, 51)


In [47]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

kernels = ['linear', 'rbf', 'sigmoid', 'poly']

for kernel in kernels:
    print(f"\n SVM with kernel = '{kernel}'")
    
    # Initialize and train
    model = SVC(kernel=kernel, gamma='scale', random_state=42)
    model.fit(X_train_scaled, y_train)
    
    # Predict
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate
    print(" Accuracy:", accuracy_score(y_test, y_pred))
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(" Classification Report:\n", classification_report(y_test, y_pred))




 SVM with kernel = 'linear'
 Accuracy: 0.7876712328767124
 Confusion Matrix:
 [[ 15  27]
 [  4 100]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.36      0.49        42
           1       0.79      0.96      0.87       104

    accuracy                           0.79       146
   macro avg       0.79      0.66      0.68       146
weighted avg       0.79      0.79      0.76       146


 SVM with kernel = 'rbf'
 Accuracy: 0.7671232876712328
 Confusion Matrix:
 [[ 12  30]
 [  4 100]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.29      0.41        42
           1       0.77      0.96      0.85       104

    accuracy                           0.77       146
   macro avg       0.76      0.62      0.63       146
weighted avg       0.76      0.77      0.73       146


 SVM with kernel = 'sigmoid'
 Accuracy: 0.7397260273972602
 Confusion Matrix:
 [[  8  34]
 [  

In [48]:
from sklearn.model_selection import GridSearchCV

# Step 1: Define the hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 50],
    'gamma': ['scale', 0.01, 0.1, 1],
    'kernel': ['rbf']
}

# Step 2: Set up the grid search
grid = GridSearchCV(SVC(), param_grid, cv=10, scoring='accuracy')
grid.fit(X_train_scaled, y_train)

# Step 3: Display the best parameters and cross-validation score
print(" Best Parameters:", grid.best_params_)
print(" Best CV Accuracy:", grid.best_score_)

# Step 4: Evaluate the best model on the test set
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)

print(" Test Accuracy:", accuracy_score(y_test, y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))


 Best Parameters: {'C': 50, 'gamma': 0.1, 'kernel': 'rbf'}
 Best CV Accuracy: 0.8327586206896551
 Test Accuracy: 0.863013698630137

 Confusion Matrix:
 [[31 11]
 [ 9 95]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.74      0.76        42
           1       0.90      0.91      0.90       104

    accuracy                           0.86       146
   macro avg       0.84      0.83      0.83       146
weighted avg       0.86      0.86      0.86       146



In [49]:
from sklearn.tree import DecisionTreeClassifier

# Step 1: Train the Decision Tree model
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_scaled, y_train)

# Step 2: Predict on the test set
y_pred_tree = tree_model.predict(X_test_scaled)

# Step 3: Evaluate model
print(" Accuracy:", accuracy_score(y_test, y_pred_tree))
cm = confusion_matrix(y_test, y_pred_tree)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred_tree))

# Step 4: Misclassification count (FP + FN)
tn, fp, fn, tp = cm.ravel()
misclassified = fp + fn
print(f"\n Total Misclassifications (FP + FN): {misclassified}")



 Accuracy: 0.8561643835616438

 Confusion Matrix:
 [[34  8]
 [13 91]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.81      0.76        42
           1       0.92      0.88      0.90       104

    accuracy                           0.86       146
   macro avg       0.82      0.84      0.83       146
weighted avg       0.86      0.86      0.86       146


 Total Misclassifications (FP + FN): 21


In [50]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Step 2: Predict on the test set
y_pred_rf = rf_model.predict(X_test_scaled)

# Step 3: Evaluate performance
print(" Accuracy:", accuracy_score(y_test, y_pred_rf))
cm = confusion_matrix(y_test, y_pred_rf)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred_rf))

# Step 4: Count total misclassifications (FP + FN)
tn, fp, fn, tp = cm.ravel()
misclassified = fp + fn
print(f"\n Total Misclassifications (FP + FN): {misclassified}")



 Accuracy: 0.910958904109589

 Confusion Matrix:
 [[34  8]
 [ 5 99]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.81      0.84        42
           1       0.93      0.95      0.94       104

    accuracy                           0.91       146
   macro avg       0.90      0.88      0.89       146
weighted avg       0.91      0.91      0.91       146


 Total Misclassifications (FP + FN): 13


In [51]:
# Ensure xgboost is installed
!pip install xgboost

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Initialize and train XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)  # Removed deprecated use_label_encoder
xgb_model.fit(X_train_scaled, y_train)

# Step 2: Predict
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Step 3: Evaluate
print(" Accuracy:", accuracy_score(y_test, y_pred_xgb))
cm = confusion_matrix(y_test, y_pred_xgb)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred_xgb))

# Step 4: Misclassification count (FP + FN)
tn, fp, fn, tp = cm.ravel()
misclassified = fp + fn
print(f"\n Total Misclassifications (FP + FN): {misclassified}")


 Accuracy: 0.9178082191780822

 Confusion Matrix:
 [[35  7]
 [ 5 99]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.85        42
           1       0.93      0.95      0.94       104

    accuracy                           0.92       146
   macro avg       0.90      0.89      0.90       146
weighted avg       0.92      0.92      0.92       146


 Total Misclassifications (FP + FN): 12


In [52]:
from sklearn.naive_bayes import GaussianNB

# Step 1: Train the model
nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)

# Step 2: Predict
y_pred_nb = nb_model.predict(X_test_scaled)

# Step 3: Evaluate
print(" Accuracy:", accuracy_score(y_test, y_pred_nb))
cm = confusion_matrix(y_test, y_pred_nb)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred_nb))

# Step 4: Total misclassifications (FP + FN)
tn, fp, fn, tp = cm.ravel()
misclassified = fp + fn
print(f"\n Total Misclassifications (FP + FN): {misclassified}")



 Accuracy: 0.7808219178082192

 Confusion Matrix:
 [[ 11  31]
 [  1 103]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.26      0.41        42
           1       0.77      0.99      0.87       104

    accuracy                           0.78       146
   macro avg       0.84      0.63      0.64       146
weighted avg       0.81      0.78      0.73       146


 Total Misclassifications (FP + FN): 32


In [53]:
from sklearn.naive_bayes import GaussianNB

# Step 1: Train the model
nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)

# Step 2: Predict
y_pred_nb = nb_model.predict(X_test_scaled)

# Step 3: Evaluate
print(" Accuracy:", accuracy_score(y_test, y_pred_nb))
cm = confusion_matrix(y_test, y_pred_nb)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred_nb))

# Step 4: Total misclassifications (FP + FN)
tn, fp, fn, tp = cm.ravel()
misclassified = fp + fn
print(f"\n Total Misclassifications (FP + FN): {misclassified}")

# Step 5: Custom cost matrix evaluation based on your business priority
# Priority: Minimize FN (good credit wrongly rejected)
cost_fp = 1   # Less severe
cost_fn = 10  # Very costly
total_custom_cost = (fp * cost_fp) + (fn * cost_fn)

print(f"\n Custom Cost (FP×1 + FN×10): {total_custom_cost}")


 Accuracy: 0.7808219178082192

 Confusion Matrix:
 [[ 11  31]
 [  1 103]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.26      0.41        42
           1       0.77      0.99      0.87       104

    accuracy                           0.78       146
   macro avg       0.84      0.63      0.64       146
weighted avg       0.81      0.78      0.73       146


 Total Misclassifications (FP + FN): 32

 Custom Cost (FP×1 + FN×10): 41


In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the model with default C and no class weighting
log_reg = LogisticRegression(
    max_iter=5000,
    random_state=2,
    solver='lbfgs'  # Optional but good to show explicitly
)

# Perform 10-fold cross-validation
cv_scores = cross_val_score(log_reg, X_train_scaled, y_train, cv=10, scoring='accuracy')

# Print results
print(" 10-Fold Cross-Validation Accuracies:", np.round(cv_scores, 3))
print(" Mean CV Accuracy:", np.round(cv_scores.mean(), 3))


 10-Fold Cross-Validation Accuracies: [0.724 0.776 0.793 0.724 0.793 0.655 0.741 0.793 0.81  0.672]
 Mean CV Accuracy: 0.748


In [55]:
from sklearn.metrics import accuracy_score

# Step 1: Define and train the model
log_reg = LogisticRegression(max_iter=5000, random_state=2, solver='lbfgs')
log_reg.fit(X_train_scaled, y_train)

# Step 2: Predict on the test set
y_pred = log_reg.predict(X_test_scaled)

# Step 3: Evaluate accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(" Test Set Accuracy:", round(test_accuracy, 3))


 Test Set Accuracy: 0.747


In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10],
    'solver': ['liblinear', 'saga']
}

# Setup the model
log_reg = LogisticRegression(max_iter=5000, random_state=2)

# Grid Search with 10-fold cross-validation
grid = GridSearchCV(log_reg, param_grid, cv=10, scoring='accuracy')
grid.fit(X_train_scaled, y_train)

# Best parameters and CV score
print(" Best Parameters:", grid.best_params_)
print(" Best Cross-Validation Accuracy:", round(grid.best_score_, 3))

# Test accuracy of best model
best_model = grid.best_estimator_
y_pred_test = best_model.predict(X_test_scaled)
test_acc = accuracy_score(y_test, y_pred_test)
print(" Test Set Accuracy:", round(test_acc, 3))


 Best Parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
 Best Cross-Validation Accuracy: 0.852
 Test Set Accuracy: 0.87


In [57]:
# !pip install tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#  Step 1: Define the model
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),  # Clean input layer
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

#  Step 2: Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

#  Step 3: Train the model
history = model.fit(
    X_train_scaled, y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.1,
    verbose=1  # You can set this to 0 for silent training
)

#  Step 4: Predict on the test set
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba >= 0.5).astype(int)

#  Step 5: Evaluate the model
print("\n Test Accuracy:", accuracy_score(y_test, y_pred))

# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred))

Epoch 1/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6281 - loss: 0.6506 - val_accuracy: 0.6034 - val_loss: 0.6746
Epoch 2/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7291 - loss: 0.5628 - val_accuracy: 0.6034 - val_loss: 0.6479
Epoch 3/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7286 - loss: 0.5367 - val_accuracy: 0.6724 - val_loss: 0.6173
Epoch 4/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7750 - loss: 0.4846 - val_accuracy: 0.6724 - val_loss: 0.5842
Epoch 5/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7669 - loss: 0.4735 - val_accuracy: 0.6552 - val_loss: 0.5925
Epoch 6/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7770 - loss: 0.4592 - val_accuracy: 0.6552 - val_loss: 0.5739
Epoch 7/50
[1m33/33[0m [32m━━━━━━━━━━

In [58]:
# !pip install tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#  Step 1: Define the model
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),  # Input layer
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

#  Step 2: Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

#  Step 3: Add early stopping
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

#  Step 4: Train the model
history = model.fit(
    X_train_scaled, y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

#  Step 5a: Evaluate on training set
y_train_pred_proba = model.predict(X_train_scaled)
y_train_pred = (y_train_pred_proba >= 0.5).astype(int)
print("\n Training Accuracy:", accuracy_score(y_train, y_train_pred))

#  Step 5b: Evaluate on test set
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba >= 0.5).astype(int)
print("\n Test Accuracy:", accuracy_score(y_test, y_pred))

#  Step 6: Final evaluation metrics
cm = confusion_matrix(y_test, y_pred)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7092 - loss: 0.6267 - val_accuracy: 0.6034 - val_loss: 0.6843
Epoch 2/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7084 - loss: 0.5677 - val_accuracy: 0.6207 - val_loss: 0.6476
Epoch 3/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7554 - loss: 0.5107 - val_accuracy: 0.6897 - val_loss: 0.6016
Epoch 4/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7477 - loss: 0.5117 - val_accuracy: 0.6897 - val_loss: 0.5919
Epoch 5/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7933 - loss: 0.4446 - val_accuracy: 0.6724 - val_loss: 0.5790
Epoch 6/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7970 - loss: 0.4162 - val_accuracy: 0.6379 - val_loss: 0.5517
Epoch 7/50
[1m33/33[0m [32m━━━━━━━━━━

In [59]:
# !pip install tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Define the deeper model
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Step 2: Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Step 3: Train the model
history = model.fit(
    X_train_scaled, y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.1,
    verbose=1
)

# Step 4: Predict on the test set
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba >= 0.5).astype(int)

# Step 5: Evaluate the model
print("\n Test Accuracy:", accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7434 - loss: 0.6158 - val_accuracy: 0.6034 - val_loss: 0.6355
Epoch 2/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7219 - loss: 0.5297 - val_accuracy: 0.6034 - val_loss: 0.5976
Epoch 3/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7341 - loss: 0.4889 - val_accuracy: 0.6552 - val_loss: 0.6363
Epoch 4/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7883 - loss: 0.4630 - val_accuracy: 0.6724 - val_loss: 0.5895
Epoch 5/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7856 - loss: 0.4216 - val_accuracy: 0.6897 - val_loss: 0.5931
Epoch 6/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7979 - loss: 0.4111 - val_accuracy: 0.6897 - val_loss: 0.5808
Epoch 7/50
[1m33/33[0m [32m━━━━━━━━━━

In [60]:
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier

#  Load your final feature file
df_final = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")

#  Split into X, y
X_final = df_final.drop("allow", axis=1)
y_final = df_final["allow"]

#  Identify numeric vs. categorical
numeric_cols     = X_final.select_dtypes(include="number").columns.tolist()
categorical_cols = X_final.select_dtypes(include="object").columns.tolist()

#  Build the ColumnTransformer
preprocessor = ColumnTransformer([
    ("num", MinMaxScaler(),               numeric_cols),
    ("cat", OneHotEncoder(drop="first", sparse_output=False), categorical_cols),
])

#  Chain into a Pipeline
pipeline_final = Pipeline([
    ("prep", preprocessor),
    ("clf", XGBClassifier(eval_metric="logloss", random_state=42)),
])

#  Fit on all data
pipeline_final.fit(X_final, y_final)

#  Save to disk
joblib.dump(pipeline_final, "xgb_credit_risk_pipeline_final.pkl")
print(" Pipeline trained and saved as xgb_credit_risk_pipeline_final.pkl")




 Pipeline trained and saved as xgb_credit_risk_pipeline_final.pkl


In [61]:
import os
import joblib
import pandas as pd

# 0. (Optional) Verify files
print("Files in cwd:", os.listdir())

# 1. Load your trained pipeline
pipeline = joblib.load("xgb_credit_risk_pipeline_final.pkl")

# 2. Read the cleaned feature file
df = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")
feature_cols = df.drop("allow", axis=1).columns.tolist()

# 3. Split features by dtype
numeric_cols     = df[feature_cols].select_dtypes(include=['number']).columns.tolist()
categorical_cols = df[feature_cols].select_dtypes(include=['object', 'category']).columns.tolist()

# 4. Build dummy record
dummy = {}
for col in feature_cols:
    if col in numeric_cols:
        dummy[col] = 0                    
    else:
        dummy[col] = df[col].mode()[0]   

# 5. Tweak a few numerics for a “sensible” scenario
dummy.update({
    "calculated_premium": 50_000,
    "quote_hour":            9,
    "quote_dayofweek":       0,
    "quote_month":          12
})

X_dummy = pd.DataFrame([dummy], columns=feature_cols)

# 6. Predict
pred_class = pipeline.predict(X_dummy)[0]
pred_proba = pipeline.predict_proba(X_dummy)[0]

print(f"Predicted class: {pred_class}   (1 = approved/good risk, 0 = declined/bad risk)")
print(f"Probabilities → declined(0): {pred_proba[0]:.3f}, approved(1): {pred_proba[1]:.3f}")




Files in cwd: ['.anaconda', '.conda', '.condarc', '.continuum', '.ipynb_checkpoints', '.ipython', '.jupyter', '.keras', '.matplotlib', 'app.py', 'AppData', 'Application Data', 'batch_score.py', 'Cargo.csv', 'Cargo.ipynb', 'Cargo_cleaned_final.xlsx', 'Cargo_cleaned_final_cleaned.xlsx', 'Cargo_encoded_final.xlsx', 'Cargo_encoded_nominal.xlsx', 'Cargo_final_timestamp_speed_encoded.xlsx', 'Contacts', 'Cookies', 'Desktop', 'Documents', 'Downloads', 'Favorites', 'Links', 'Local Settings', 'Microsoft', 'Music', 'My Documents', 'NetHood', 'new_apps.csv', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{157062e8-0d6a-11f0-8df8-f7a9bdb53740}.TM.blf', 'NTUSER.DAT{157062e8-0d6a-11f0-8df8-f7a9bdb53740}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{157062e8-0d6a-11f0-8df8-f7a9bdb53740}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'Pictures', 'PrintHood', 'Recent', 'run_batch_score.bat', 'Saved Games', 'scored_apps.csv', 'scored_full_dataset.xlsx'

In [62]:
import joblib
import pandas as pd

# 1. Load pipeline & feature list
pipeline     = joblib.load("xgb_credit_risk_pipeline_final.pkl")
df           = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")
feature_cols = df.drop("allow", axis=1).columns.tolist()

# 2. Start from a “base” of zeros/modes so you don’t have to set 50+ fields manually
num_cols = df[feature_cols].select_dtypes(include='number').columns
base_dummy = {
    c: (0 if c in num_cols else df[c].mode()[0])
    for c in feature_cols
}

# 3. Now *override* with the exact dummy values you want to test
dummy = base_dummy.copy()
dummy.update({
    # Numeric features
    "calculated_premium": 2500,
    "quote_hour":            14,
    "quote_dayofweek":        1,   # Tuesday
    "quote_month":            3,   # March
    "half_excess":          500,   # example deductible
    "excess":              1000,
    # One-hot flags (set the relevant ones to 1)
    "boat_type_HBP":           1,
    "storage_method_SRE":      0,
    # … you can flip any other one-hot here as needed …
})

# 4. Build DataFrame & predict
X_dummy = pd.DataFrame([dummy], columns=feature_cols)
pred     = pipeline.predict(X_dummy)[0]
proba    = pipeline.predict_proba(X_dummy)[0][1]

print(f"Dummy input → class={pred}  (1=approved, 0=declined),  P(approve)={proba:.3f}")


Dummy input → class=0  (1=approved, 0=declined),  P(approve)=0.027


In [63]:
import joblib
import pandas as pd

# 1. Load pipeline & data
pipeline   = joblib.load("xgb_credit_risk_pipeline_final.pkl")
df         = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")

# 2. Pick a row by index 
idx = 30
row = df.iloc[idx]

# 3. Extract its features (drop the target)
X_new = row.drop("allow").to_frame().T

# 4. Display the raw feature values
print("Features for row", idx)
print(X_new.to_string(index=False))

# 5. Predict
pred_class = pipeline.predict(X_new)[0]
pred_proba = pipeline.predict_proba(X_new)[0][1]

print(f"\nModel prediction for row {idx}:")
print(f"  → Predicted class: {pred_class}  (1=approved, 0=declined)")
print(f"  → P(approve): {pred_proba:.3f}")


Features for row 30
calculated_premium annual_premium stamp_duty fee_gst   total excess discount_total half_excess category storage_postcode purchase_date sum_insured year_built pro_built water_skiers previous_claims quote_hour quote_dayofweek quote_month power_speed_category_Low boat_type_CRU boat_type_HBP boat_type_LAU boat_type_PON boat_type_RUN boat_type_SKI hull_material_CBF hull_material_FIB hull_material_KEV hull_material_PLA hull_material_RUB hull_material_STE hull_material_TIM storage_method_COM storage_method_DRY storage_method_FAA storage_method_GAR storage_method_HAR storage_method_JET storage_method_MAR storage_method_PON storage_method_SRE storage_method_SWI storage_state_NSW storage_state_NT storage_state_QLD storage_state_SA storage_state_TAS storage_state_VIC storage_state_WA underwriter_id_t4b354839eb36 cover_type_Comprehensive
             526.6         500.27      49.53    5.27  657.76   1500          50.03           3    power             2330          2021       5

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Load and split your data
df = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")
X  = df.drop("allow", axis=1)
y  = df["allow"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)

# 2. Re-create your preprocessing + model pipeline
numeric_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols     = [c for c in X.columns if c not in numeric_cols]

preprocessor = ColumnTransformer([
    ("num", MinMaxScaler(), numeric_cols),
    ("cat", OneHotEncoder(drop="first", sparse_output=False), cat_cols),
])

pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", XGBClassifier(eval_metric="logloss", random_state=42)),
])

# 3. Fit only on the TRAIN split
pipeline.fit(X_train, y_train)

# 4. Evaluate on the untouched TEST split
y_pred = pipeline.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))




Test accuracy: 0.8376623376623377

Classification report:
               precision    recall  f1-score   support

           0       0.78      0.62      0.69        45
           1       0.86      0.93      0.89       109

    accuracy                           0.84       154
   macro avg       0.82      0.77      0.79       154
weighted avg       0.83      0.84      0.83       154



In [65]:
import joblib
import pandas as pd
import numpy as np

# Load pipeline & data
pipeline = joblib.load("xgb_credit_risk_pipeline_final.pkl")
df       = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")

# First 30 rows
X_first30 = df.drop("allow", axis=1).iloc[:30]
y_first30 = df["allow"].iloc[:30]

# Predictions
preds  = pipeline.predict(X_first30)
probas = pipeline.predict_proba(X_first30)[:,1]

# Compile results
results = pd.DataFrame({
    "allow_actual":        y_first30.values,
    "predicted_class":     preds,
    "approve_probability": np.round(probas, 3)
}, index=y_first30.index)

print(results)


    allow_actual  predicted_class  approve_probability
0              1                1                0.997
1              0                0                0.012
2              0                0                0.004
3              1                1                1.000
4              1                1                0.978
5              1                1                0.918
6              1                1                0.998
7              1                1                0.973
8              1                1                0.981
9              0                0                0.119
10             0                0                0.119
11             1                1                0.786
12             1                1                0.991
13             0                0                0.056
14             1                1                0.994
15             0                0                0.081
16             1                1                0.900
17        

In [66]:
import joblib
import pandas as pd
import numpy as np

# 1. Load your trained pipeline and the cleaned dataset
pipeline = joblib.load("xgb_credit_risk_pipeline_final.pkl")
df       = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")

# 2. Extract features and true labels for rows 0–29
X_first30 = df.drop("allow", axis=1).iloc[:30]
y_first30 = df["allow"].iloc[:30]

# 3. Predict
pred_classes = pipeline.predict(X_first30)
pred_probs   = pipeline.predict_proba(X_first30)[:, 1]

# 4. Compile and display
results = pd.DataFrame({
    "allow_actual":        y_first30.values,
    "predicted_class":     pred_classes,
    "approve_probability": np.round(pred_probs, 3)
}, index=X_first30.index)

print("First 30 rows: actual vs predicted\n")
print(results.to_string())


First 30 rows: actual vs predicted

    allow_actual  predicted_class  approve_probability
0              1                1                0.997
1              0                0                0.012
2              0                0                0.004
3              1                1                1.000
4              1                1                0.978
5              1                1                0.918
6              1                1                0.998
7              1                1                0.973
8              1                1                0.981
9              0                0                0.119
10             0                0                0.119
11             1                1                0.786
12             1                1                0.991
13             0                0                0.056
14             1                1                0.994
15             0                0                0.081
16             1             

In [67]:
import joblib
import pandas as pd

# 1. Load your model and full dataset
pipeline = joblib.load("xgb_credit_risk_pipeline_final.pkl")
df_full  = pd.read_excel("Cargo_final_timestamp_speed_encoded.xlsx")

# 2. Score all rows at once
features = df_full.drop("allow", axis=1)
df_full["predicted_class"]     = pipeline.predict(features)
df_full["approve_probability"] = pipeline.predict_proba(features)[:, 1]

# 3. Save the scored file
df_full.to_excel("scored_full_dataset.xlsx", index=False)
print(f"Scored {len(df_full)} rows → scored_full_dataset.xlsx")




Scored 767 rows → scored_full_dataset.xlsx
