In [1]:
# ===============================
# Cell 1 — Load & Clean
# ===============================
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

SEED = 42
TRAIN_PATH = r'C:\Users\vikym\Desktop\ctd\train.csv'
TEST_PATH  = r'C:\Users\vikym\Desktop\ctd\test.csv'

print("Loading...")
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

# Parse dates
date_columns = ['CONSTRUCTION_START_DATE', 'SUBSTANTIAL_COMPLETION_DATE', 'invoiceDate']
for df in (train, test):
    for c in date_columns:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors='coerce')

# Drop high-missing example cols if present
for c in ['NUMROOMS','NUMBEDS']:
    if c in train and train[c].isnull().mean() > 0.5:
        train.drop(columns=c, inplace=True, errors='ignore')
        test.drop(columns=c, inplace=True, errors='ignore')

# Cleaning helpers
def clean_numeric_column(s, clip_negative=True, replace_zero_epsilon=False, winsorize=False):
    if s.dtype == 'object':
        s = (s.str.replace(',', '')
               .str.replace('$','')
               .str.replace(' ','')
               .str.replace(r'[^\d\.\-]','', regex=True))
        s = pd.to_numeric(s, errors='coerce')
    if clip_negative and s.name == 'QtyShipped':
        s = s.clip(lower=0)
    if replace_zero_epsilon and s.name in ['UnitPrice','ExtendedPrice']:
        s = s.where(s > 0, 0.01)
    if winsorize:
        lo, hi = s.quantile(0.01), s.quantile(0.99)
        s = s.clip(lo, hi)
    return s

def clean_text_column(s):
    if s.dtype == 'object':
        s = (s.str.strip()
               .str.replace('\n',' ')
               .str.replace('\r',' ')
               .str.lower()
               .str.replace(r'[^\w\s]',' ', regex=True)
               .str.replace(r'\s+',' ', regex=True))
    return s

# Clean numeric & text
for c in ['ExtendedQuantity','QtyShipped','UnitPrice','ExtendedPrice','invoiceTotal']:
    if c in train: train[c] = clean_numeric_column(train[c], clip_negative=(c=='QtyShipped'),
                                                   replace_zero_epsilon=(c in ['UnitPrice','ExtendedPrice']),
                                                   winsorize=True)
    if c in test:  test[c]  = clean_numeric_column(test[c],  clip_negative=(c=='QtyShipped'),
                                                   replace_zero_epsilon=(c in ['UnitPrice','ExtendedPrice']),
                                                   winsorize=True)
for c in ['ItemDescription','PROJECT_CITY','STATE','PROJECT_COUNTRY','CORE_MARKET','PROJECT_TYPE']:
    if c in train: train[c] = clean_text_column(train[c])
    if c in test:  test[c]  = clean_text_column(test[c])

# Targets & splits
train['MasterItemNo'] = pd.to_numeric(train['MasterItemNo'], errors='coerce')
train_c = train[train['id'] < 13422].copy()
train_c = train_c.dropna(subset=['MasterItemNo'])
train_r = train_c.dropna(subset=['QtyShipped']).copy()

# Save cleaned
train.to_csv('clean_train_full.csv', index=False)
test.to_csv('clean_test_full.csv', index=False)
train_c.to_csv('clean_train_c.csv', index=False)
train_r.to_csv('clean_train_r.csv', index=False)

print("Clean ✓")

Loading...
Clean ✓


In [2]:
# ===============================
# Cell 2 — Feature Engineering
# ===============================
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from scipy import sparse
import warnings
warnings.filterwarnings('ignore')

print("Featurizing...")

train_c = pd.read_csv('clean_train_c.csv')
train_r = pd.read_csv('clean_train_r.csv')
test    = pd.read_csv('clean_test_full.csv')

# 1) TF-IDF
all_desc = pd.concat([train_c['ItemDescription'], test['ItemDescription']]).fillna('missing')
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2), min_df=2, stop_words='english', dtype=np.float32)
tfidf.fit(all_desc)

X_text_train   = tfidf.transform(train_c['ItemDescription'].fillna('missing'))
X_text_test    = tfidf.transform(test['ItemDescription'].fillna('missing'))
X_text_train_r = tfidf.transform(train_r['ItemDescription'].fillna('missing'))

# 2) Numeric (log1p + impute)
num_feats = ['ExtendedQuantity','UnitPrice','ExtendedPrice','invoiceTotal']
for df in (train_c, train_r, test):
    for c in num_feats:
        if c in df:
            df[c] = df[c].where(df[c] > 0, 0.01)
            df[c] = np.log1p(df[c])

num_imp = SimpleImputer(strategy='median')
X_num_train   = num_imp.fit_transform(train_c[num_feats])
X_num_test    = num_imp.transform(test[num_feats])
X_num_train_r = num_imp.transform(train_r[num_feats])

# 3) Dates → engineered & imputed
date_cols = ['CONSTRUCTION_START_DATE','SUBSTANTIAL_COMPLETION_DATE','invoiceDate']
for df in (train_c, train_r, test):
    for c in date_cols:
        if c in df: df[c] = pd.to_datetime(df[c], errors='coerce')

def date_feats(df):
    out = {}
    if 'CONSTRUCTION_START_DATE' in df and 'SUBSTANTIAL_COMPLETION_DATE' in df:
        out['construction_duration_days'] = (df['SUBSTANTIAL_COMPLETION_DATE'] - df['CONSTRUCTION_START_DATE']).dt.days
    if 'invoiceDate' in df:
        out['invoice_year']       = df['invoiceDate'].dt.year
        out['invoice_month']      = df['invoiceDate'].dt.month
        out['invoice_day']        = df['invoiceDate'].dt.day
        out['invoice_dayofweek']  = df['invoiceDate'].dt.dayofweek
        out['invoice_quarter']    = df['invoiceDate'].dt.quarter
    return pd.DataFrame(out)

date_imp = SimpleImputer(strategy='median')
X_date_train   = date_imp.fit_transform(date_feats(train_c))
X_date_test    = date_imp.transform(date_feats(test))
X_date_train_r = date_imp.transform(date_feats(train_r))

# 4) Categorical (top-k one-hot)
cat_cols = ['PROJECT_CITY','STATE','PROJECT_COUNTRY','CORE_MARKET','PROJECT_TYPE','UOM']

def one_hot_topk(df, topk=20):
    mats, maps = [], {}
    for c in cat_cols:
        if c in df:
            top = df[c].value_counts().head(topk).index.tolist()
            maps[c] = top
            enc = df[c].fillna('missing').apply(lambda x: x if x in top else 'other')
            cols = [ (enc == t).astype(np.int8).to_numpy() for t in (top + ['other']) ]
            mats.append(np.column_stack(cols))
    return (np.column_stack(mats) if mats else np.empty((len(df),0))), maps

X_cat_train, cat_map = one_hot_topk(train_c)
def apply_one_hot(df, cat_map):
    mats = []
    for c in cat_cols:
        if c in df:
            top = cat_map.get(c, [])
            enc = df[c].fillna('missing').apply(lambda x: x if x in top else 'other')
            mats.append(np.column_stack([(enc == t).astype(np.int8).to_numpy() for t in (top + ['other'])]))
    return np.column_stack(mats) if mats else np.empty((len(df),0))

X_cat_test    = apply_one_hot(test, cat_map)
X_cat_train_r = apply_one_hot(train_r, cat_map)

# 5) Stack (sparse)
X_class_train = sparse.hstack([X_text_train,   sparse.csr_matrix(X_num_train),   sparse.csr_matrix(X_date_train),   sparse.csr_matrix(X_cat_train)]).tocsr()
X_class_test  = sparse.hstack([X_text_test,    sparse.csr_matrix(X_num_test),    sparse.csr_matrix(X_date_test),    sparse.csr_matrix(X_cat_test)]).tocsr()
X_reg_train   = sparse.hstack([X_text_train_r, sparse.csr_matrix(X_num_train_r), sparse.csr_matrix(X_date_train_r), sparse.csr_matrix(X_cat_train_r)]).tocsr()
X_reg_test    = X_class_test.copy()

y_class = train_c['MasterItemNo'].to_numpy()
y_reg   = train_r['QtyShipped'].to_numpy()

# Save
sparse.save_npz('X_class_train.npz', X_class_train)
sparse.save_npz('X_class_test.npz',  X_class_test)
sparse.save_npz('X_reg_train.npz',   X_reg_train)
sparse.save_npz('X_reg_test.npz',    X_reg_test)
np.save('y_class.npy', y_class)
np.save('y_reg.npy',   y_reg)

print(f"Features ✓  (X_class_train {X_class_train.shape}, X_reg_train {X_reg_train.shape})")

Featurizing...
Features ✓  (X_class_train (10736, 6787), X_reg_train (10701, 6787))


In [3]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

print("Training LightGBM models...")

# Load data
train_c = pd.read_csv("clean_train_c.csv")
test_full = pd.read_csv("clean_test_full.csv")
Xc_tr = sparse.load_npz("X_class_train.npz")
Xc_te = sparse.load_npz("X_class_test.npz")
Xr_tr = sparse.load_npz("X_reg_train.npz")
Xr_te = sparse.load_npz("X_reg_test.npz")
yc = np.load("y_class.npy")
yr = np.load("y_reg.npy")

# Deterministic mapping
det = train_c.groupby("ItemDescription")["MasterItemNo"].agg(lambda x: x.mode()[0])
det_items = det[train_c.groupby("ItemDescription")["MasterItemNo"].nunique() == 1]

# Non-deterministic subset
mask = ~train_c["ItemDescription"].isin(det_items.index)
Xc_nd, yc_nd = Xc_tr[mask.values], yc[mask.values]

# Remove rare single-sample classes
unique_classes, class_counts = np.unique(yc_nd, return_counts=True)
single_sample_classes = unique_classes[class_counts == 1]
if len(single_sample_classes) > 0:
    print(f"Removing {len(single_sample_classes)} classes with only 1 sample")
    mask_multiple = ~np.isin(yc_nd, single_sample_classes)
    Xc_nd = Xc_nd[mask_multiple]
    yc_nd = yc_nd[mask_multiple]

# Encode labels
le = LabelEncoder()
yc_enc = le.fit_transform(yc_nd)
print(f"Final class distribution: {len(np.unique(yc_enc))} classes")

# Split validation
Xc_train, Xc_val, yc_train, yc_val = train_test_split(
    Xc_nd, yc_enc, test_size=0.2, random_state=SEED
)
Xr_train, Xr_val, yr_train, yr_val = train_test_split(
    Xr_tr, yr, test_size=0.2, random_state=SEED
)

# LightGBM Classifier
print("Training LightGBM Classifier...")
lgb_classifier = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED
)
lgb_classifier.fit(Xc_train, yc_train)

# LightGBM Regressor
print("Training LightGBM Regressor...")
lgb_regressor = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED
)
lgb_regressor.fit(Xr_train, yr_train)

print("Models trained successfully!")

# Save
import joblib
joblib.dump(lgb_classifier, 'lgb_classifier.pkl')
joblib.dump(lgb_regressor, 'lgb_regressor.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(det_items, 'deterministic_mapping.pkl')

print("Models saved to disk.")


Training LightGBM models...
Removing 66 classes with only 1 sample
Final class distribution: 107 classes
Training LightGBM Classifier...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1430
[LightGBM] [Info] Number of data points in the train set: 852, number of used features: 128
[LightGBM] [Info] Start training from score -4.801676
[LightGBM] [Info] Start training from score -6.054439
[LightGBM] [Info] Start training from score -3.346389
[LightGBM] [Info] Start training from score -6.054439
[LightGBM] [Info] Start training from score -4.955827
[LightGBM] [Info] Start training from score -4.182637
[LightGBM] [Info] Start training from score -6.054439
[LightGBM] [Info] Start training from score -6.054439
[LightGBM] [Info] Start training from score -6.054439
[LightGBM] [Info] Start t

In [4]:
import numpy as np
import pandas as pd
from scipy import sparse
import joblib

print("Making predictions with LightGBM...")

# Load test and models
test_full = pd.read_csv("clean_test_full.csv")
Xc_te = sparse.load_npz("X_class_test.npz")
Xr_te = sparse.load_npz("X_reg_test.npz")

lgb_classifier = joblib.load('lgb_classifier.pkl')
lgb_regressor = joblib.load('lgb_regressor.pkl')
le = joblib.load('label_encoder.pkl')
det_items = joblib.load('deterministic_mapping.pkl')

# Classification
pred_master = np.empty(test_full.shape[0], dtype=object)
is_det = test_full["ItemDescription"].isin(det_items.index).values

pred_master[is_det] = test_full.loc[is_det, "ItemDescription"].map(det_items).to_numpy()

if (~is_det).any():
    preds_encoded = lgb_classifier.predict(Xc_te[~is_det])
    pred_master[~is_det] = le.inverse_transform(preds_encoded)

# Fill NaNs with mode
if np.any(pd.isna(pred_master)):
    most_common_master = pd.Series(le.inverse_transform(le.transform(le.classes_))).mode()[0]
    pred_master[pd.isna(pred_master)] = most_common_master

pred_master = pred_master.astype(int)

# Regression
pred_qty = lgb_regressor.predict(Xr_te)
pred_qty = np.clip(pred_qty, 1, None).astype(int)

# Submission
submission = pd.DataFrame({
    "id": test_full["id"],
    "MasterItemNo": pred_master,
    "QtyShipped": pred_qty
})
submission.to_csv("submission_lgb.csv", index=False)
print("Submission file created: submission_lgb.csv")
print(f"Submission shape: {submission.shape}")
print(f"MasterItemNo unique values: {submission['MasterItemNo'].nunique()}")
print(f"QtyShipped range: {submission['QtyShipped'].min()} to {submission['QtyShipped'].max()}")


Making predictions with LightGBM...
Submission file created: submission_lgb.csv
Submission shape: (2685, 3)
MasterItemNo unique values: 789
QtyShipped range: 1 to 987


In [5]:
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

print("Evaluating LightGBM models...")

# Classification
yc_pred = lgb_classifier.predict(Xc_val)
accuracy = accuracy_score(yc_val, yc_pred)
f1 = f1_score(yc_val, yc_pred, average="weighted")

# Regression
yr_pred = lgb_regressor.predict(Xr_val)
mae = mean_absolute_error(yr_val, yr_pred)

# Normalized regression score
if yr_val.max() == yr_val.min():
    reg_score = 1.0
else:
    norm_mae = mae / (yr_val.max() - yr_val.min())
    reg_score = 1 - max(0, min(norm_mae, 1))

# Final score
final_score = 0.25 * accuracy + 0.25 * f1 + 0.5 * reg_score

print("📊 LightGBM Evaluation Results")
print(f"Accuracy       : {accuracy:.4f}")
print(f"F1 Score       : {f1:.4f}")
print(f"MAE            : {mae:.4f}")
print(f"RegressionScore: {reg_score:.4f}")
print(f"Final Score    : {final_score:.4f}")

print(f"\n📊 Dataset Info:")
print(f"Training samples: {len(yc_train)}")
print(f"Validation samples: {len(yc_val)}")
print(f"Unique classes: {len(np.unique(yc_enc))}")


Evaluating LightGBM models...
📊 LightGBM Evaluation Results
Accuracy       : 0.7430
F1 Score       : 0.7426
MAE            : 16.9582
RegressionScore: 0.9963
Final Score    : 0.8696

📊 Dataset Info:
Training samples: 852
Validation samples: 214
Unique classes: 107
