In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Google Drive id of the shortage CSV; the `uc?id=` endpoint serves the file directly.
file_id = "1EZ5r0bkx3rDzVU9i1Vsc_ySwRPfj3KLJ"
url = "https://drive.google.com/uc?id=" + file_id
df = pd.read_csv(url)

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# The source header carries a trailing space ('Shortage Qty '); normalise it to the
# name preprocess_shortage_data expects.
df = df.rename(columns={'Shortage Qty ': 'Shortage_Qty'})

In [None]:
# Show the column names (useful to confirm the rename of 'Shortage Qty ').
df.columns

In [None]:
def preprocess_shortage_data(df):
    """Expand raw shortage records into a dense per-item daily feature table.

    Every item gets one row per calendar day between the earliest and latest
    ``Date`` in ``df``; days without a record are treated as
    ``Shortage_Qty = 0``. Item-level aggregates, lagged rolling averages and
    calendar fields are then derived, and rows with a missing feature value
    (typically the first day of each item, which has no lagged history) are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records with at least the columns ``Date``, ``Item`` and
        ``Shortage_Qty``. The frame is copied, not modified. Records whose
        ``Item`` is missing are ignored.

    Returns
    -------
    X : pandas.DataFrame
        The 15 model features, ordered by item and then by date.
    y : pandas.Series
        ``is_shortage``: 1 where ``Shortage_Qty > 0``, else 0.
    le : sklearn.preprocessing.LabelEncoder
        Encoder fitted on ``Item`` (maps ``item_encoded`` back to item ids).

    Raises
    ------
    ValueError
        If any of the required columns is missing.
    """
    processed_df = df.copy()

    required_columns = ['Date', 'Item', 'Shortage_Qty']
    if not all(col in processed_df.columns for col in required_columns):
        raise ValueError(f"DataFrame must contain columns: {required_columns}")

    processed_df['Date'] = pd.to_datetime(processed_df['Date'])

    # Dense (Item x day) grid built without a Python-level double loop.
    date_range = pd.date_range(start=processed_df['Date'].min(), end=processed_df['Date'].max())
    unique_items = processed_df['Item'].dropna().unique()
    full_df = pd.MultiIndex.from_product(
        [unique_items, date_range], names=['Item', 'Date']
    ).to_frame(index=False)

    full_df = full_df.merge(processed_df, on=['Item', 'Date'], how='left')

    # Days with no record count as "no shortage".
    full_df['Shortage_Qty'] = full_df['Shortage_Qty'].fillna(0)
    full_df['is_shortage'] = (full_df['Shortage_Qty'] > 0).astype(int)

    # NOTE(review): these aggregates span the whole date range, so every row
    # sees statistics that include later dates (and its own label). Consider
    # expanding/lagged statistics if this leakage matters for evaluation.
    item_shortage_stats = full_df.groupby('Item').agg(
        historical_shortage_prob=('is_shortage', 'mean'),
        total_observations=('Shortage_Qty', 'count'),
        avg_shortage_qty=('Shortage_Qty', 'mean'),
        max_shortage_qty=('Shortage_Qty', 'max'),
        total_shortage_qty=('Shortage_Qty', 'sum'),
    ).reset_index()

    full_df = full_df.merge(item_shortage_stats, on='Item', how='left')

    # Order rows by item, then chronologically, so the rolling windows below
    # look backwards in time. Sorting once and using transform avoids
    # groupby.apply on the grouping column, which newer pandas deprecates.
    full_df = full_df.sort_values(['Item', 'Date'], kind='stable')

    rolling_specs = {
        'shortage_qty_7d_avg': ('Shortage_Qty', 7),
        'shortage_qty_30d_avg': ('Shortage_Qty', 30),
        'shortage_freq_7d': ('is_shortage', 7),
        'shortage_freq_30d': ('is_shortage', 30),
    }
    for feature_name, (source_col, window) in rolling_specs.items():
        # shift(1) keeps the current day's value out of its own feature.
        full_df[feature_name] = full_df.groupby('Item', sort=False)[source_col].transform(
            lambda s, w=window: s.rolling(window=w, min_periods=1).mean().shift(1)
        )

    full_df['day_of_week'] = full_df['Date'].dt.dayofweek
    full_df['month'] = full_df['Date'].dt.month
    full_df['quarter'] = full_df['Date'].dt.quarter
    full_df['year'] = full_df['Date'].dt.year
    full_df['is_weekend'] = full_df['day_of_week'].isin([5, 6]).astype(int)

    le = LabelEncoder()
    full_df['item_encoded'] = le.fit_transform(full_df['Item'])

    features = [
        'item_encoded',
        'day_of_week',
        'month',
        'quarter',
        'year',
        'is_weekend',
        'total_observations',
        'historical_shortage_prob',
        'avg_shortage_qty',
        'max_shortage_qty',
        'total_shortage_qty',
        'shortage_qty_7d_avg',
        'shortage_qty_30d_avg',
        'shortage_freq_7d',
        'shortage_freq_30d'
    ]

    # Drops the rows whose lagged features are undefined (no prior day).
    full_df = full_df.dropna(subset=features + ['is_shortage'])

    return full_df[features], full_df['is_shortage'], le

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve, confusion_matrix, classification_report


In [None]:
def train_shortage_models(X, y):
    """Train and evaluate shortage classifiers on a random 80/20 split.

    Features are standardised with a scaler fitted on the training split only.
    Each model in ``models`` is fitted, scored on the test split, and a
    precision-recall curve is plotted per model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary target (1 = shortage).

    Returns
    -------
    dict
        ``{model_name: {...}}`` with keys ``model``, ``auc`` (ROC AUC),
        ``accuracy``, ``confusion_matrix``, ``classification_report`` and
        ``probabilities`` (positive-class probabilities on the test split).

    Notes
    -----
    NOTE(review): the split is random over rows, not chronological, so test
    rows can precede training rows in time.
    """
    # Stratify whenever every class has enough members, so the rare shortage
    # class is represented in both splits.
    classes, counts = np.unique(y, return_counts=True)
    stratify = y if len(classes) > 1 and counts.min() >= 2 else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=stratify
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = {
        # 'Logistic Regression': LogisticRegression(max_iter=1000),
        # 'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        # 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        # Seeded like the other candidates so reruns give the same numbers.
        'Neural Network': MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=100, random_state=42)
    }

    results = {}

    for name, model in models.items():

        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 1]

        results[name] = {
            'model': model,
            'auc': roc_auc_score(y_test, y_proba),
            'accuracy': accuracy_score(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred),
            'probabilities': y_proba
        }

        print(f"\n{name} Performance:")
        print(f"AUC Score: {results[name]['auc']:.4f}")
        print("Classification Report:")
        print(results[name]['classification_report'])

    plt.figure(figsize=(10, 6))
    for name, result in results.items():
        precision, recall, _ = precision_recall_curve(y_test, result['probabilities'])
        # The stored score is the ROC AUC, not the area under this PR curve.
        plt.plot(recall, precision, label=f'{name} (ROC AUC = {result["auc"]:.2f})')

    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()

    return results

In [None]:
# Build the feature frame `x`, the binary target `y` and the fitted item encoder.
x, y, label_encoder = preprocess_shortage_data(df)

In [None]:
# Sanity check: features and target must have the same number of rows.
for part in (x, y):
    print(part.shape)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# "balanced" weights are inversely proportional to class frequency.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y,
)
# Map the two class labels (0 = no shortage, 1 = shortage) to their weights.
class_weight_dict = {label: class_weights[label] for label in (0, 1)}

In [None]:
# Fit and evaluate the tabular baseline(s); prints metrics and plots PR curves.
results = train_shortage_models(x, y)



---






In [None]:
# Re-index both parts from 0 so they align row-for-row, then join them side by side.
features_df, target_df = (part.reset_index(drop=True) for part in (x, y))
concatenated_df = pd.concat((features_df, target_df), axis=1)

In [None]:
# Inspect the last 50 rows of the combined feature/target table.
concatenated_df.tail(50)

In [None]:
# Item identifier, per-item aggregates, rolling features and the label; the
# calendar columns are intentionally left out.
columns_to_keep = [
    "item_encoded",
    "total_observations",
    "historical_shortage_prob",
    "avg_shortage_qty",
    "max_shortage_qty",
    "total_shortage_qty",
    "shortage_qty_7d_avg",
    "shortage_qty_30d_avg",
    "shortage_freq_7d",
    "shortage_freq_30d",
    "is_shortage",
]

In [None]:
# Kept for any downstream use of `df_sorted`. Note that this ordering is NOT
# chronological within a month: `day_of_week` is the weekday, not the date.
df_sorted = concatenated_df.sort_values(by=["year", "quarter", "month", "day_of_week"], ascending=False)

# Most recent row per item. Assumes `concatenated_df` still has the row order
# produced by preprocess_shortage_data (item by item, ascending date), so the
# last occurrence of each item is its latest day. Taking the first row of
# `df_sorted` instead would pick a row with the highest weekday in the latest
# month rather than the latest date.
df_latest = concatenated_df.drop_duplicates(subset=["item_encoded"], keep="last")

df_fixed_values = df_latest[columns_to_keep]

In [None]:
# Display the per-item snapshot (one row per item_encoded).
df_fixed_values

In [None]:
# from google.colab import files
# df_fixed_values.to_csv("fixed_values.csv", index=False)

In [None]:
import numpy as np

def create_sequences_fast(df, target_col, seq_features, static_features, seq_length=30, group_col=None):
    """Build sliding-window inputs for a sequence model.

    Sample ``i`` consists of the ``seq_length`` rows ``i .. i+seq_length-1`` of
    ``seq_features`` as the sequence, and the static features and label of the
    row that immediately follows the window (row ``i + seq_length``).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order the windows should follow.
    target_col : str
        Name of the label column.
    seq_features, static_features : list of str
        Columns used for the sequence input and the static input.
    seq_length : int, default 30
        Number of rows per window; must be at least 1.
    group_col : str or None, default None
        If given, windows are built separately inside each group of this
        column (groups kept in order of first appearance), so no window mixes
        rows of different groups. ``None`` windows over the whole frame, which
        lets windows straddle group (e.g. item) boundaries.

    Returns
    -------
    sequences : numpy.ndarray, float32, shape (n, seq_length, len(seq_features))
    static_inputs : numpy.ndarray, shape (n, len(static_features))
    labels : numpy.ndarray, shape (n,)
        ``n`` is ``max(len(df) - seq_length, 0)`` (summed over groups when
        ``group_col`` is used).

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    def _window(frame):
        # Build (sequences, static_inputs, labels) for one contiguous frame.
        seq_data = frame[seq_features].to_numpy(dtype=np.float32)
        static_inputs = frame[static_features].values[seq_length:]
        labels = frame[target_col].values[seq_length:]

        num_samples = max(len(frame) - seq_length, 0)
        if num_samples == 0:
            # Frame too short for even one window: return correctly shaped empties.
            sequences = np.zeros((0, seq_length, len(seq_features)), dtype=np.float32)
            return sequences, static_inputs, labels

        # View of shape (len - seq_length + 1, n_features, seq_length); the last
        # window has no following row to predict, so it is dropped.
        windows = np.lib.stride_tricks.sliding_window_view(seq_data, seq_length, axis=0)
        sequences = np.ascontiguousarray(windows[:num_samples].transpose(0, 2, 1))
        return sequences, static_inputs, labels

    if group_col is None:
        return _window(df)

    parts = [_window(group) for _, group in df.groupby(group_col, sort=False, dropna=False)]
    if not parts:
        return _window(df)

    seq_parts, static_parts, label_parts = zip(*parts)
    return np.concatenate(seq_parts), np.concatenate(static_parts), np.concatenate(label_parts)
# Rolling features form the per-step sequence input; everything else is fed as a static vector.
seq_features = ['shortage_qty_7d_avg','shortage_qty_30d_avg','shortage_freq_7d','shortage_freq_30d']
static_features = ['item_encoded','day_of_week','month','quarter','year','is_weekend','total_observations','historical_shortage_prob','avg_shortage_qty','max_shortage_qty','total_shortage_qty']
# Call optimized function
# NOTE(review): this rebinds `y` from the label Series returned by
# preprocess_shortage_data to the windowed label array. A later cell calls
# `y.reset_index(drop=True)`, which only works on the original Series.
X_seq, X_static, y = create_sequences_fast(concatenated_df, target_col="is_shortage", seq_features=seq_features, static_features=static_features)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Concatenate
from tensorflow.keras.models import Model

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
# "balanced" weights are derived from the label frequencies in `y`.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y),
    y=y
)
# Index position doubles as the class label (assumes labels are 0..k-1 — TODO confirm).
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
y = np.array(y, dtype=np.float32).reshape(-1, 1)  # Ensure correct shape

# Train model with class weights
# NOTE(review): `model` is first assigned in a later cell (`model = Model(...)`),
# so on a fresh kernel run top-to-bottom this line raises NameError. The
# validation split is taken from the same windows used for fitting.
model.fit([X_seq, X_static], y, epochs=3, batch_size=32, validation_split=0.2, class_weight=class_weight_dict)


In [None]:
# Get model predictions
# NOTE(review): `model` is first assigned in a later cell; on a fresh
# top-to-bottom run it is not defined yet at this point.
y_pred_probs = model.predict([X_seq, X_static])  # Probabilities
y_pred = (y_pred_probs > 0.5).astype(int)  # Convert to binary labels

# Filter cases where the actual value is 1.
# `y` holds the labels of exactly these windows; the previous code indexed
# with `Y`, which is only assigned further down the notebook.
y_true_flat = np.ravel(y)
positive_mask = y_true_flat == 1
y_true_shortages = y_true_flat[positive_mask]
y_pred_shortages = y_pred[positive_mask]
y_pred_probs_shortages = y_pred_probs[positive_mask]  # Probabilities for ROC AUC

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

# Score the positive (shortage) class over ALL windows. Restricting the arrays
# to rows whose true label is 1 removes every false positive, which makes
# precision meaningless and leaves ROC AUC undefined (only one class present).
y_true_all = np.ravel(y)
y_pred_all = np.ravel(y_pred)
y_prob_all = np.ravel(y_pred_probs)

precision_shortage = precision_score(y_true_all, y_pred_all)
recall_shortage = recall_score(y_true_all, y_pred_all)

print(f"🔹 Precision (Shortages): {precision_shortage:.4f}")
print(f"🔹 Recall (Shortages): {recall_shortage:.4f}")

f1_shortage = f1_score(y_true_all, y_pred_all)
print(f"🔹 F1 Score (Shortages): {f1_shortage:.4f}")

auc_shortage = roc_auc_score(y_true_all, y_prob_all)
print(f"🔹 AUC (Shortages): {auc_shortage:.4f}")



In [None]:
from xgboost import XGBClassifier

# Hyper-parameters of the precision-oriented booster.
# scale_pos_weight below 1 presumably down-weights the positive class to
# favour precision — confirm against the XGBoost parameter docs.
xgb_precision_params = {
    "scale_pos_weight": 0.5,
    "max_depth": 6,
    "learning_rate": 0.05,
    "eval_metric": "aucpr",
}
model_high_precision = XGBClassifier(**xgb_precision_params)

In [None]:
# NOTE(review): `x_train` is not assigned in any visible cell, and `y_train` is
# only assigned further down, so this fails on a fresh top-to-bottom run.
model_high_precision.fit(x_train, y_train)

In [None]:
from sklearn.metrics import precision_score
# Positive-class probabilities on the test features.
# NOTE(review): `x_test` is not assigned in any visible cell.
y_pred_probs = model_high_precision.predict_proba(x_test)[:, 1]
y_pred = (y_pred_probs > 0.8).astype(int)  # Set a high threshold for precision
precision_shortage = precision_score(y_test, y_pred)
print(f"🔹 Precision (Shortages): {precision_shortage:.4f}")

In [None]:
# Keep only the rows whose true label is 1.
# NOTE(review): on this subset there can be no false positives, so it supports
# recall-style metrics only; precision computed on it is not informative.
mask = (y_test == 1)  # Boolean mask where true labels are 1
y_true_1 = y_test[mask]
y_pred_1 = y_pred[mask]

In [None]:
from sklearn.metrics import precision_score

# Precision of the positive class over the whole test split. Computing it on
# rows filtered to true label 1 excludes every false positive and therefore
# cannot reflect how often a predicted shortage is wrong.
precision_for_1 = precision_score(np.ravel(y_test), np.ravel(y_pred))
print(f"Precision when y=1: {precision_for_1:.4f}")

In [None]:
# import joblib

# # Save the model
# joblib.dump(model_high_precision, "xgb_model_precision.pkl")

In [None]:
# Rebuild the windows so that Y is aligned one-to-one with X_seq / X_static.
X_seq, X_static, Y = create_sequences_fast(
    concatenated_df,
    target_col="is_shortage",
    seq_features=seq_features,
    static_features=static_features,
)

print(f"Fixed Shapes -> X_seq: {X_seq.shape}, X_static: {X_static.shape}, Y: {Y.shape}")

In [None]:
# One stratified 80/20 split applied consistently to both inputs and the labels.
X_seq_train, X_seq_test, X_static_train, X_static_test, y_train, y_test = train_test_split(
    X_seq,
    X_static,
    y,
    test_size=0.2,
    random_state=40,
    stratify=y,
)

In [None]:
# Two-branch network: an LSTM stack over the rolling-feature sequence and a
# dense layer over the static features, merged into one sigmoid output.
# The sequence length (30) is hard-coded here; it must match the seq_length
# used when the windows were built.
seq_input = Input(shape=(30, len(seq_features)), name="sequence_input")
X = LSTM(64, return_sequences=True)(seq_input)
X = Dropout(0.2)(X)
X = LSTM(32)(X)

# Static Input (Non-Time-Series Features)
static_input = Input(shape=(len(static_features),), name="static_input")
Y_layer = Dense(32, activation="relu")(static_input)  # Static features go through a Dense layer

# Concatenation of Features
combined = Concatenate()([X, Y_layer])  # Concatenating features, not labels
output = Dense(1, activation="sigmoid")(combined)  # Binary classification output

# Model Compilation
model = Model(inputs=[seq_input, static_input], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC", "Precision", "Recall"])

# Model Summary
model.summary()

In [None]:
# Recall-oriented variant: same graph, trained with a heavy positive-class weight.
# NOTE(review): this Model is built from the same `seq_input`, `static_input`
# and `output` tensors as `model`, so both objects wrap the same layer
# instances; training one presumably updates the weights seen by the other —
# confirm against the Keras functional API docs.
model_high_recall = Model(inputs=[seq_input, static_input], outputs=output)
model_high_recall.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["AUC", "Precision", "Recall"]
)

class_weight = {0: 1, 1: 100}  # Higher weight for the minority class
# validation_split holds out part of the training arrays passed here for validation.
model_high_recall.fit([X_seq_train, X_static_train], y_train, epochs=5, batch_size=32, validation_split=0.2, class_weight=class_weight)

In [None]:
# Persist the trained model next to the notebook (relative path).
# NOTE(review): later cells load it from '/model_high_recall.h5' (filesystem
# root), which is a different location unless the working directory is '/'.
model_high_recall.save("model_high_recall.h5")

In [None]:
 from sklearn.metrics import recall_score

# Get predictions on the held-out test split
y_pred_probs = model_high_recall.predict([X_seq_test, X_static_test])  # Probabilities
y_pred = (y_pred_probs > 0.3).astype(int)  # Convert to binary labels with threshold 0.3

# Filter only cases where actual y == 1
mask = (y_test == 1)
y_true_1 = y_test[mask]
y_pred_1 = y_pred[mask]

# Compute recall
# Recall only involves rows whose true label is 1, so filtering does not change it.
recall_for_1 = recall_score(y_true_1, y_pred_1)
print(f"Recall when y=1: {recall_for_1:.4f}")


In [None]:
import tensorflow.keras.backend as K

def focal_loss(alpha=0.25, gamma=2.0):
    """Return a binary focal loss function for Keras ``compile(loss=...)``.

    Per element the loss is ``alpha_t * (1 - p_t) ** gamma * BCE``, where
    ``p_t`` is the probability the model assigns to the true class and
    ``alpha_t`` is ``alpha`` for positive targets and ``1 - alpha`` for
    negative ones.

    Parameters
    ----------
    alpha : float, default 0.25
        Weight of the positive class; negatives receive ``1 - alpha``.
    gamma : float, default 2.0
        Focusing exponent; larger values down-weight well-classified examples.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` returning the mean focal loss.
    """
    def loss(y_true, y_pred):
        y_true = K.cast(y_true, K.floatx())
        bce = K.binary_crossentropy(y_true, y_pred)
        # Probability assigned to the true class of each element.
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        # Class-dependent weight. A single constant alpha on every element
        # would only rescale the loss and not balance the classes.
        alpha_t = y_true * alpha + (1 - y_true) * (1 - alpha)
        return K.mean(alpha_t * (1 - p_t) ** gamma * bce)
    return loss

In [None]:
# Build the focal-loss model on a fresh copy of the architecture.
# Model(inputs=[seq_input, static_input], outputs=output) would wrap the very
# same layer objects as `model` / `model_high_recall`, so training would start
# from (and overwrite) the weights already fitted above. clone_model creates
# new layers with newly initialised weights.
model_f1 = tf.keras.models.clone_model(model)
model_f1.compile(
    optimizer="adam",
    loss=focal_loss(alpha=0.25, gamma=2.0),
    metrics=["AUC", "Precision", "Recall"]
)
model_f1.fit([X_seq_train, X_static_train], y_train, epochs=5, batch_size=32, validation_split=0.2)

In [None]:
# Get predictions on the held-out test split
y_pred_probs = model_f1.predict([X_seq_test, X_static_test])  # Probabilities
y_pred = (y_pred_probs > 0.5).astype(int)  # Convert to binary labels with threshold 0.5

# Filter only cases where actual y == 1
# NOTE(review): this subset contains no true negatives, so it supports recall
# but not precision.
mask = (y_test == 1)
y_true_1 = y_test[mask]
y_pred_1 = y_pred[mask]

In [None]:
from sklearn.metrics import recall_score,precision_score
# Recall is unaffected by restricting to rows with true label 1.
recall_for_1 = recall_score(y_true_1, y_pred_1)
# Precision must see the false positives, so it is computed on the full test
# split; on the filtered subset it cannot drop below 1.0 once any positive is predicted.
precision_for_1 = precision_score(np.ravel(y_test), np.ravel(y_pred))
print(f"Recall when y=1: {recall_for_1:.4f}")
print(f"Precision when y=1: {precision_for_1:.4f}")

In [None]:
from tensorflow.keras.models import load_model
import joblib
import numpy as np
from sklearn.metrics import classification_report

# Load models
# Paths match where the notebook writes them: model_high_recall.save(...) uses
# a relative path, and the (commented-out) joblib.dump does too. Loading from
# the filesystem root only works when the working directory happens to be '/'.
recall_model = load_model("model_high_recall.h5")
# NOTE(review): the cell that dumps xgb_model_precision.pkl is commented out,
# so this file must already exist. joblib.load unpickles arbitrary objects —
# only load files you created yourself.
precision_model = joblib.load("xgb_model_precision.pkl")  # XGBClassifier

In [None]:
# Rebuild the combined feature/target table.
# `y` has been rebound to a NumPy array by the sequence-building cells, so
# `y.reset_index(...)` no longer works here; take the label column from the
# existing `concatenated_df`, which still holds the original target.
features_df = x.reset_index(drop=True)
target_df = concatenated_df["is_shortage"].reset_index(drop=True)
concatenated_df = pd.concat([features_df, target_df], axis=1)

In [None]:
# Confirm the table holds the feature columns plus `is_shortage`.
concatenated_df.columns

In [None]:
def create_sequences_fast_2(df, target_col, seq_features, static_features, seq_length=30):
    """Create LSTM windows whose rows line up with a per-row (tabular) model input.

    Sample ``i`` pairs the ``seq_length`` rows ``i .. i+seq_length-1`` of
    ``seq_features`` with the static features and label of row
    ``i + seq_length``; i.e. output row ``i`` corresponds to
    ``df.iloc[seq_length + i]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order the windows should follow.
    target_col : str
        Name of the label column.
    seq_features, static_features : list of str
        Columns for the sequence input and the static input.
    seq_length : int, default 30
        Number of rows per window; must be at least 1.

    Returns
    -------
    sequences : numpy.ndarray, float32, shape (n, seq_length, len(seq_features))
    static_inputs : numpy.ndarray, shape (n, len(static_features))
    labels : numpy.ndarray, shape (n,)
        with ``n = max(len(df) - seq_length, 0)``.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    seq_data = df[seq_features].to_numpy(dtype=np.float32)
    static_inputs = df[static_features].values[seq_length:]   # Aligned static inputs
    labels = df[target_col].values[seq_length:]               # Aligned targets

    num_samples = max(len(df) - seq_length, 0)
    if num_samples == 0:
        # Not enough rows for a single window: return correctly shaped empties
        # instead of failing on a negative array dimension.
        empty = np.zeros((0, seq_length, len(seq_features)), dtype=np.float32)
        return empty, static_inputs, labels

    # Strided view of shape (len - seq_length + 1, n_features, seq_length); the
    # final window has no following row and is dropped.
    windows = np.lib.stride_tricks.sliding_window_view(seq_data, seq_length, axis=0)
    sequences = np.ascontiguousarray(windows[:num_samples].transpose(0, 2, 1))

    return sequences, static_inputs, labels

In [None]:
# Rolling features that vary step by step inside a window.
seq_features = [
    'shortage_qty_7d_avg',
    'shortage_qty_30d_avg',
    'shortage_freq_7d',
    'shortage_freq_30d',
]
# Item id, calendar fields and per-item aggregates taken from the target row.
static_features = [
    'item_encoded',
    'day_of_week',
    'month',
    'quarter',
    'year',
    'is_weekend',
    'total_observations',
    'historical_shortage_prob',
    'avg_shortage_qty',
    'max_shortage_qty',
    'total_shortage_qty',
]

In [None]:
# Windows of 30 rows; `y_aligned` is the label of the row following each window.
X_seq, X_static, y_aligned = create_sequences_fast_2(concatenated_df, "is_shortage",seq_features, static_features, 30)

In [None]:
# Number of windows and number of static features.
X_static.shape

In [None]:
# Calculate the number of valid samples generated from sequence creation
seq_length = 30  # Must equal the seq_length passed to create_sequences_fast_2
start_idx = seq_length  # The first (seq_length) rows are dropped during sequence creation

# `xgb_features` was never assigned in this notebook. The 15-column shape noted
# below matches the full feature frame `x`, so its columns are used.
# NOTE(review): confirm this is the column set (and order) the saved XGBoost
# model was trained on.
xgb_features = list(x.columns)

# Extract aligned rows from the original DataFrame
x_xgb_df = concatenated_df.iloc[start_idx:][xgb_features].reset_index(drop=True)

# Confirm shape: should be the same as X_seq, X_static, y_aligned
assert len(x_xgb_df) == len(X_static), f"Shape mismatch: {len(x_xgb_df)} != {len(X_static)}"

# Convert to NumPy
x_xgb = x_xgb_df.values  # Shape: (1642221, 15)

In [None]:
from sklearn.model_selection import train_test_split

# Split row positions once and reuse them, so both models see identical
# train/test rows.
indices = np.arange(len(x_xgb))  # Should be 1642221
train_idx, test_idx = train_test_split(
    indices,
    test_size=0.2,
    stratify=y_aligned,
    random_state=42,
)

# XGBoost inputs
x_xgb_train, x_xgb_test = x_xgb[train_idx], x_xgb[test_idx]

# LSTM inputs
X_seq_train, X_static_train = X_seq[train_idx], X_static[train_idx]
X_seq_test, X_static_test = X_seq[test_idx], X_static[test_idx]

# Labels
y_train = y_aligned[train_idx]
y_test = y_aligned[test_idx]

In [None]:
# Shortage probabilities from the LSTM model on the test rows.
proba_recall = recall_model.predict([X_seq_test, X_static_test], batch_size=512)

In [None]:
# Positive-class probabilities from the XGBoost model on the same test rows.
proba_precision = precision_model.predict_proba(x_xgb_test)[:, 1]

In [None]:
# Equal-weight average of the two models' shortage probabilities.
lstm_proba = proba_recall.flatten()
combined_proba = (lstm_proba + proba_precision) / 2

# Flag a shortage when the averaged probability exceeds 0.60.
is_flagged = combined_proba > 0.60
final_pred = is_flagged.astype(int)

print(classification_report(y_test, final_pred, digits=4))

In [None]:
from tensorflow.keras.models import load_model

# Load from the same relative path that model_high_recall.save(...) wrote to;
# '/model_high_recall.h5' points at the filesystem root instead.
model = load_model("model_high_recall.h5")

In [None]:
# Rebuild windows over the full table (no train/test split) for threshold tuning.
X_seq, X_static, Y = create_sequences_fast(concatenated_df, target_col="is_shortage",seq_features=seq_features, static_features=static_features)

In [None]:
# Predicted shortage probability for every window.
# NOTE(review): these windows include rows the model was trained on.
y_pred_probs = model.predict([X_seq, X_static])

In [None]:
from sklearn.metrics import classification_report

# Candidate decision thresholds to compare.
thresholds = [0.4,0.41,0.42,0.43,0.44,0.45,0.451,0.46,0.47,0.48,0.49,0.5]

# Score against `Y`, the labels built together with the X_seq / X_static that
# produced `y_pred_probs`. `y` is a leftover from earlier cells and is only
# aligned with these predictions by coincidence.
y_true = np.ravel(Y)

for thresh in thresholds:
    print(f"\n--- Threshold = {thresh} ---")
    y_pred_class = (y_pred_probs > thresh).astype(int)
    print(classification_report(y_true, y_pred_class, digits=4))