In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import lightgbm as lgb

# --- Load Data ---
# Both workbooks sit in the same folder, so the folder is spelled out once.
DATA_DIR = "C:/Users/parij/OneDrive/Desktop/git_practice/UnORG_supply_chain_ml/data"
orders_data_path = f"{DATA_DIR}/order_data_last_six_month.xlsx"
items_data_path = f"{DATA_DIR}/associated_order_item_data_last_six_month.xlsx"

# orders_df: order-level rows; items_df: item-level rows linked by order_id
# (presumed from the file names and later usage — TODO confirm the schemas).
orders_df, items_df = (
    pd.read_excel(workbook) for workbook in (orders_data_path, items_data_path)
)

# --- Preprocessing ---
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'], dayfirst=True)

# Base customer-date grid
# Collapse the raw orders to one flag per customer per calendar day:
#   * rows without a customer or a date cannot be placed on the grid;
#   * normalising to midnight lets orders carrying a time-of-day still match
#     the daily grid built by pd.date_range;
#   * several orders on the same day must not duplicate grid rows, otherwise
#     the row-based rolling windows computed later stop meaning "days".
customer_orders = (
    orders_df[['customer_id', 'order_date']]
    .dropna()
    .assign(order_date=lambda df: df['order_date'].dt.normalize())
    .drop_duplicates()
    .assign(order_placed=1)
)

min_date = customer_orders['order_date'].min()
max_date = customer_orders['order_date'].max()
date_range = pd.date_range(min_date, max_date)
customers = customer_orders['customer_id'].unique()

# Cartesian product: every customer gets a row for every day in the range.
grid = pd.MultiIndex.from_product([customers, date_range], names=["customer_id", "order_date"])
full_df = pd.DataFrame(index=grid).reset_index()
full_df = full_df.merge(customer_orders, on=["customer_id", "order_date"], how="left")

# Days without a matching order come back as NaN from the left merge -> 0.
full_df['order_placed'] = full_df['order_placed'].fillna(0).astype(int)

assert not full_df.duplicated(['customer_id', 'order_date']).any(), \
    "grid must hold exactly one row per customer per day"

# --- Rolling Features ---
# Every feature in this section describes what was known *before* the row's
# own date. Using the row's own `order_placed` would hand the label to the
# model (target leakage) and could not be reproduced for future dates.
full_df = full_df.sort_values(by=['customer_id', 'order_date'])
customer_key = full_df['customer_id']

# Days since the customer's most recent earlier order.
# The previous implementation ended in groupby(...).cumcount(), which simply
# numbers each customer's rows 0..n-1 and never resets when an order occurs.
order_day = full_df['order_date'].where(full_df['order_placed'] == 1)
previous_order_day = (
    order_day.groupby(customer_key).shift(1)   # drop the current day's own order
    .groupby(customer_key).ffill()             # carry the last order date forward
)
full_df['days_since_last_order'] = (
    (full_df['order_date'] - previous_order_day).dt.days
    .fillna(999)                               # sentinel: no earlier order on record
    .astype(int)
)

# Orders placed in the 7 / 14 rows preceding each row (current row excluded).
# Windows count rows, so this presumes one row per customer per calendar day.
# min_periods=1 keeps the partial counts at the start of the history instead
# of discarding them as NaN -> 0.
full_df['orders_past_7d'] = full_df.groupby('customer_id')['order_placed'].transform(
    lambda placed: placed.shift(1).rolling(7, min_periods=1).sum()
)
full_df['orders_past_14d'] = full_df.groupby('customer_id')['order_placed'].transform(
    lambda placed: placed.shift(1).rolling(14, min_periods=1).sum()
)
# Only each customer's very first row has no history at all.
full_df = full_df.fillna({'orders_past_7d': 0, 'orders_past_14d': 0})

# --- Date Features ---
# Calendar attributes derived from order_date; dayofweek is 0 (Mon) .. 6 (Sun),
# so 5 and 6 mark the weekend.
order_calendar = full_df['order_date'].dt
full_df = full_df.assign(
    day_of_week=order_calendar.dayofweek,
    is_weekend=lambda df: df['day_of_week'].isin([5, 6]).astype(int),
    month=order_calendar.month,
    day=order_calendar.day,
)

# --- Cumulative Behavior ---
# NOTE(review): cumulative_orders includes the row's own order_placed, and
# avg_order_frequency is computed over the customer's entire grid (later days
# included) — confirm this look-ahead is acceptable for the model.
placed_by_customer = full_df.groupby('customer_id')['order_placed']

# Running number of orders per customer, in the frame's current row order.
full_df['cumulative_orders'] = placed_by_customer.cumsum()

# Orders per grid row: total orders divided by the number of rows per customer.
order_counts = placed_by_customer.sum()
active_days = placed_by_customer.size()
full_df['avg_order_frequency'] = full_df['customer_id'].map(order_counts.div(active_days))

# --- Merge Items with Orders ---
# Attach customer_id and order_date to every item row through order_id.
# A left join keeps item rows even when their order_id has no match.
order_lookup = orders_df[['order_id', 'customer_id', 'order_date']]
orders_items_merged = pd.merge(items_df, order_lookup, on='order_id', how='left')
orders_items_merged = orders_items_merged.assign(
    order_date=pd.to_datetime(orders_items_merged['order_date'], dayfirst=True)
)

# Most common item per customer
# Count item rows per (customer, item), rank each customer's items by that
# count in descending order, and keep the first row per customer.
item_counts = (
    orders_items_merged
    .groupby(['customer_id', 'item_name'])
    .size()
    .reset_index(name='count')
)
ranked_items = item_counts.sort_values(['customer_id', 'count'], ascending=[True, False])
top_item_rows = ranked_items.drop_duplicates('customer_id')
most_common_item = top_item_rows.set_index('customer_id')['item_name'].to_dict()

full_df['top_item'] = full_df['customer_id'].map(most_common_item)

# Customers without any item row get the placeholder label 'Unknown' before
# the item names are turned into integer codes.
top_item_labels = full_df['top_item'].fillna('Unknown')
le = LabelEncoder()
full_df['top_item_encoded'] = le.fit_transform(top_item_labels)

# Average days between orders
# Whole-day gap between consecutive orders of the same customer, averaged per
# customer. The first order of each customer has no predecessor (NaN gap) and
# is skipped by mean(); customers with a single order therefore map to NaN.
order_dates = orders_df[['customer_id', 'order_date']].sort_values(by='order_date')
gap_in_days = order_dates.groupby('customer_id')['order_date'].diff().dt.days
avg_days_between = gap_in_days.groupby(order_dates['customer_id']).mean().to_dict()

full_df['avg_days_between_orders'] = full_df['customer_id'].map(avg_days_between)

# Distinct items ordered
# Number of different item names seen per customer across all item rows.
items_per_customer = orders_items_merged.groupby('customer_id')['item_name']
distinct_items = items_per_customer.nunique().to_dict()
full_df['distinct_items_ordered'] = full_df['customer_id'].map(distinct_items)

# Fill final NaNs
# 999 = no gap could be computed for this customer; 0 = no item rows found.
final_defaults = {
    'avg_days_between_orders': 999,
    'distinct_items_ordered': 0
}
for column, default in final_defaults.items():
    full_df[column] = full_df[column].fillna(default)

# --- Prepare for Modeling ---
feature_cols = [
    'days_since_last_order', 'orders_past_7d', 'orders_past_14d',
    'day_of_week', 'is_weekend', 'month', 'day',
    'cumulative_orders', 'avg_order_frequency',
    'top_item_encoded', 'avg_days_between_orders',
    'distinct_items_ordered'
]

X = full_df[feature_cols]
y = full_df['order_placed']

# Chronological hold-out instead of a random split.
# The rows are a daily time series per customer and the model is used to
# forecast future days. A shuffled split trains on days that come after the
# test days, so the reported scores would overstate forecasting performance.
# The most recent 25% of calendar days (train_test_split's default test share)
# are held out for evaluation.
TEST_FRACTION = 0.25
grid_dates = full_df['order_date'].dropna().drop_duplicates().sort_values()
if len(grid_dates) < 2:
    raise ValueError("Need at least two distinct dates to build a chronological split.")
n_test_dates = max(1, round(len(grid_dates) * TEST_FRACTION))
test_start_date = grid_dates.iloc[-n_test_dates]
is_test = full_df['order_date'] >= test_start_date

X_train, X_test = X.loc[~is_test], X.loc[is_test]
y_train, y_test = y.loc[~is_test], y.loc[is_test]

assert len(X_train) + len(X_test) == len(X)

# --- Train Model LightGBM Classifier ---
# Default hyper-parameters; fit() returns the fitted estimator itself.
clf = lgb.LGBMClassifier().fit(X_train, y_train)

# --- Evaluate ---
# Per-class precision / recall / F1 on the held-out rows.
y_pred = clf.predict(X_test)
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

# Optional: Predict probabilities for next 14 days (you can extend this as needed)


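**Save the Trained Model**

Persists the fitted LightGBM classifier (the whole model object, not just its parameters) to disk with joblib.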

In [None]:
import joblib

# Save the Model
# Serialise the fitted classifier `clf` to a pickle file via joblib.
MODEL_PATH = 'C:/Users/parij/OneDrive/Desktop/git_practice/UnORG_supply_chain_ml/models/Order_identification.pkl'
joblib.dump(clf, MODEL_PATH)

**Report for various Classification Models**

Prints a classification report for each of the following classification models:



1.   Random Forest
2.   Logistic Regression
3.   KNN
4.   XGBoost
5.   LightGBM  

The best of these five is ***LightGBM***, which reaches an accuracy of 0.933, so LightGBM is the model we use.



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import lightgbm as lgb

# Candidate classifiers, all fitted on the same train split and scored on the
# same test split. Random Forest and Logistic Regression re-weight the classes;
# the other three run with their library defaults.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'Logistic Regression': LogisticRegression(max_iter=10000, class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    # `use_label_encoder` was deprecated and later removed from XGBoost; on
    # current releases passing it only produces an "unused parameter" warning.
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier()
}

for name, model in models.items():
    print(f"\n Training {name}...")
    model.fit(X_train, y_train)
    # Loop-local name: the notebook-level `y_pred` (predictions of the saved
    # `clf`) is no longer overwritten by whichever model happens to run last.
    candidate_pred = model.predict(X_test)
    print(f" Results for {name}:")
    print(classification_report(y_test, candidate_pred, digits=3))
    print("-" * 60)

**Generate Order Probabilities for the Next 14 Days**

Writes a CSV file containing, for each customer_id, the predicted probability of placing an order on each of the next 14 days.

The CSV is saved as "data/order_probability_next_14_days.csv".



In [None]:
from datetime import timedelta
from pathlib import Path

# Get latest known features per customer
latest_features = full_df.sort_values("order_date").groupby("customer_id").tail(1).set_index("customer_id")

# Generate next 14 days
last_known_date = full_df['order_date'].max()
future_dates = pd.date_range(last_known_date + timedelta(days=1), periods=14)
customers = latest_features.index.unique()

# Step 3: Create customer × date grid
future_grid = pd.MultiIndex.from_product([customers, future_dates], names=["customer_id", "order_date"]).to_frame(index=False)

# Step 4: Copy features from each customer's latest record.
# The order counts are frozen at their last observed value because no orders
# are known beyond `last_known_date` (an approximation for later horizon days).
for feature in [
    'orders_past_7d', 'orders_past_14d',
    'cumulative_orders', 'avg_order_frequency',
    'top_item_encoded', 'avg_days_between_orders',
    'distinct_items_ordered'
]:
    future_grid[feature] = future_grid['customer_id'].map(latest_features[feature])

# days_since_last_order must grow with every forecast day. Copying one value
# onto all 14 days would leave the calendar columns as the only thing that
# changes across a customer's forecast. It is defined here as the number of
# whole days between the forecast date and the customer's most recent order
# recorded in full_df; 999 marks customers with no recorded order.
last_order_date = (
    full_df.loc[full_df['order_placed'] == 1]
    .groupby('customer_id')['order_date']
    .max()
)
future_grid['days_since_last_order'] = (
    (future_grid['order_date'] - future_grid['customer_id'].map(last_order_date))
    .dt.days
    .fillna(999)
    .astype(int)
)

# Step 5: Add time-based features based on order_date
future_grid['day_of_week'] = future_grid['order_date'].dt.dayofweek
future_grid['is_weekend'] = future_grid['day_of_week'].isin([5, 6]).astype(int)
future_grid['month'] = future_grid['order_date'].dt.month
future_grid['day'] = future_grid['order_date'].dt.day

# Step 6: Predict
X_future = future_grid[feature_cols]  # same features, same column order as training
future_grid['order_probability'] = clf.predict_proba(X_future)[:, 1]  # column 1 = P(order_placed == 1)

# Step 7: Save only desired output
final_output = future_grid[['customer_id', 'order_date', 'order_probability']]
output_path = Path("data/order_probability_next_14_days.csv")
# The path is relative to the working directory; create the folder if missing
# so the export does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
final_output.to_csv(output_path, index=False)

# Display sample
final_output.head(10)
