In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## Load Data

In [64]:
# Load training data
train_df = pd.read_csv('data/data_minihackathon_train_engineered.csv')
test_df = pd.read_csv('data/data_minihackathon_test_engineered.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"\nClass distribution BEFORE balancing:")
print(train_df['drug_category'].value_counts())
print(f"\nPercentages:")
print(train_df['drug_category'].value_counts(normalize=True) * 100)

Training data shape: (1500, 46)
Test data shape: (377, 45)

Class distribution BEFORE balancing:
drug_category
Hallucinogens    691
Stimulants       567
Depressants      242
Name: count, dtype: int64

Percentages:
drug_category
Hallucinogens    46.066667
Stimulants       37.800000
Depressants      16.133333
Name: proportion, dtype: float64


## Balance Dataset Using Undersampling
We'll undersample Hallucinogens and Stimulants to match Depressants frequency

In [65]:
# Separate features and target
X = train_df.drop(['drug_category'], axis=1)
y = train_df['drug_category']

# Check for ID column and drop it
if 'id' in X.columns:
    X = X.drop(['id'], axis=1)

# Encode labels to numeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nLabel encoding mapping:")
for label, encoded in class_mapping.items():
    print(f"  {label}: {encoded}")

Features shape: (1500, 45)
Target shape: (1500,)

Label encoding mapping:
  Depressants: 0
  Hallucinogens: 1
  Stimulants: 2


In [66]:
# Strategy: Undersample majority classes but keep a bit more data than the smallest class
# This keeps Depressants intact while allowing Hallucinogens/Stimulants slightly higher counts
target_factor = 1.5  # keep up to 150% of depressant count for other classes

# Count samples per class
class_counts = y.value_counts()
min_class_count = class_counts.min()
target_counts = {}

print("\nBalancing Strategy: Moderate undersampling with factor", target_factor)
print(f"Original class counts:\n{class_counts}\n")

for cls_name, original_count in class_counts.items():
    encoded_label = int(label_encoder.transform([cls_name])[0])
    if cls_name == 'Depressants':
        target_counts[encoded_label] = original_count
    else:
        target_counts[encoded_label] = min(int(min_class_count * target_factor), original_count)
    print(f"Target count for {cls_name}: {target_counts[encoded_label]} (original {original_count})")

# Use RandomUnderSampler with custom strategy
undersampler = RandomUnderSampler(sampling_strategy=target_counts, random_state=42)

# Create moderately balanced dataset
X_balanced, y_balanced_encoded = undersampler.fit_resample(X, y_encoded)

# Decode back to string labels for display
y_balanced = label_encoder.inverse_transform(y_balanced_encoded)
balanced_counts = pd.Series(y_balanced).value_counts()
balanced_percentages = balanced_counts / balanced_counts.sum() * 100

print(f"\nBalanced dataset shape: {X_balanced.shape}")
print("\nClass distribution AFTER balancing:")
print(balanced_counts)
print("\nPercentages:")
print(balanced_percentages.round(2))


Balancing Strategy: Moderate undersampling with factor 1.5
Original class counts:
drug_category
Hallucinogens    691
Stimulants       567
Depressants      242
Name: count, dtype: int64

Target count for Hallucinogens: 363 (original 691)
Target count for Stimulants: 363 (original 567)
Target count for Depressants: 242 (original 242)

Balanced dataset shape: (968, 45)

Class distribution AFTER balancing:
Hallucinogens    363
Stimulants       363
Depressants      242
Name: count, dtype: int64

Percentages:
Hallucinogens    37.5
Stimulants       37.5
Depressants      25.0
Name: count, dtype: float64


## Train-Test Split for Validation

In [67]:
# Split balanced data for validation
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Keep string versions for display
y_train = label_encoder.inverse_transform(y_train_encoded)
y_val = label_encoder.inverse_transform(y_val_encoded)

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print("\nTraining set class distribution:")
print(pd.Series(y_train).value_counts())
print("\nValidation set class distribution:")
print(pd.Series(y_val).value_counts())

Training set: (774, 45)
Validation set: (194, 45)

Training set class distribution:
Stimulants       290
Hallucinogens    290
Depressants      194
Name: count, dtype: int64

Validation set class distribution:
Stimulants       73
Hallucinogens    73
Depressants      48
Name: count, dtype: int64


## Train Multiple Models on Balanced Data

In [68]:
# Model 1: XGBoost with balanced classes
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=26,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss',
    tree_method='hist'
)

print("Training XGBoost...")
xgb_model.fit(X_train, y_train_encoded)
xgb_pred_encoded = xgb_model.predict(X_val)
xgb_pred = label_encoder.inverse_transform(xgb_pred_encoded)
xgb_acc = accuracy_score(y_val, xgb_pred)
print(f"XGBoost Validation Accuracy: {xgb_acc:.4f}")
print("\nXGBoost Classification Report:")
print(classification_report(y_val, xgb_pred))

Training XGBoost...
XGBoost Validation Accuracy: 0.6495

XGBoost Classification Report:
               precision    recall  f1-score   support

  Depressants       0.30      0.17      0.21        48
Hallucinogens       0.80      0.81      0.80        73
   Stimulants       0.63      0.81      0.71        73

     accuracy                           0.65       194
    macro avg       0.58      0.59      0.58       194
 weighted avg       0.61      0.65      0.62       194

XGBoost Validation Accuracy: 0.6495

XGBoost Classification Report:
               precision    recall  f1-score   support

  Depressants       0.30      0.17      0.21        48
Hallucinogens       0.80      0.81      0.80        73
   Stimulants       0.63      0.81      0.71        73

     accuracy                           0.65       194
    macro avg       0.58      0.59      0.58       194
 weighted avg       0.61      0.65      0.62       194



In [69]:
# Model 2: LightGBM with balanced classes
lgbm_model = LGBMClassifier(
    n_estimators=500,
    max_depth=26,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

print("Training LightGBM...")
lgbm_model.fit(X_train, y_train_encoded)
lgbm_pred_encoded = lgbm_model.predict(X_val)
lgbm_pred = label_encoder.inverse_transform(lgbm_pred_encoded)
lgbm_acc = accuracy_score(y_val, lgbm_pred)
print(f"LightGBM Validation Accuracy: {lgbm_acc:.4f}")
print("\nLightGBM Classification Report:")
print(classification_report(y_val, lgbm_pred))

Training LightGBM...
LightGBM Validation Accuracy: 0.6289

LightGBM Classification Report:
               precision    recall  f1-score   support

  Depressants       0.27      0.17      0.21        48
Hallucinogens       0.79      0.78      0.79        73
   Stimulants       0.62      0.78      0.69        73

     accuracy                           0.63       194
    macro avg       0.56      0.58      0.56       194
 weighted avg       0.60      0.63      0.61       194

LightGBM Validation Accuracy: 0.6289

LightGBM Classification Report:
               precision    recall  f1-score   support

  Depressants       0.27      0.17      0.21        48
Hallucinogens       0.79      0.78      0.79        73
   Stimulants       0.62      0.78      0.69        73

     accuracy                           0.63       194
    macro avg       0.56      0.58      0.56       194
 weighted avg       0.60      0.63      0.61       194



In [70]:
# Model 3: Random Forest with balanced classes
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("Training Random Forest...")
rf_model.fit(X_train, y_train_encoded)
rf_pred_encoded = rf_model.predict(X_val)
rf_pred = label_encoder.inverse_transform(rf_pred_encoded)
rf_acc = accuracy_score(y_val, rf_pred)
print(f"Random Forest Validation Accuracy: {rf_acc:.4f}")
print("\nRandom Forest Classification Report:")
print(classification_report(y_val, rf_pred))

Training Random Forest...
Random Forest Validation Accuracy: 0.6495

Random Forest Classification Report:
               precision    recall  f1-score   support

  Depressants       0.30      0.15      0.20        48
Hallucinogens       0.76      0.81      0.78        73
   Stimulants       0.65      0.82      0.72        73

     accuracy                           0.65       194
    macro avg       0.57      0.59      0.57       194
 weighted avg       0.60      0.65      0.61       194

Random Forest Validation Accuracy: 0.6495

Random Forest Classification Report:
               precision    recall  f1-score   support

  Depressants       0.30      0.15      0.20        48
Hallucinogens       0.76      0.81      0.78        73
   Stimulants       0.65      0.82      0.72        73

     accuracy                           0.65       194
    macro avg       0.57      0.59      0.57       194
 weighted avg       0.60      0.65      0.61       194



In [71]:
# Model 4: Gradient Boosting
gb_model = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=15,
    learning_rate=0.01,
    subsample=0.8,
    random_state=42
)

print("Training Gradient Boosting...")
gb_model.fit(X_train, y_train_encoded)
gb_pred_encoded = gb_model.predict(X_val)
gb_pred = label_encoder.inverse_transform(gb_pred_encoded)
gb_acc = accuracy_score(y_val, gb_pred)
print(f"Gradient Boosting Validation Accuracy: {gb_acc:.4f}")
print("\nGradient Boosting Classification Report:")
print(classification_report(y_val, gb_pred))

Training Gradient Boosting...
Gradient Boosting Validation Accuracy: 0.6443

Gradient Boosting Classification Report:
               precision    recall  f1-score   support

  Depressants       0.23      0.10      0.14        48
Hallucinogens       0.78      0.84      0.81        73
   Stimulants       0.63      0.81      0.71        73

     accuracy                           0.64       194
    macro avg       0.55      0.58      0.55       194
 weighted avg       0.59      0.64      0.61       194



## Create Voting Ensemble

In [72]:
# Create voting ensemble with soft voting (uses probabilities)
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgbm', lgbm_model),
        ('rf', rf_model),
        ('gb', gb_model)
    ],
    voting='soft'
)

print("Training Voting Ensemble...")
voting_clf.fit(X_train, y_train_encoded)
voting_pred_encoded = voting_clf.predict(X_val)
voting_pred = label_encoder.inverse_transform(voting_pred_encoded)
voting_acc = accuracy_score(y_val, voting_pred)
print(f"Voting Ensemble Validation Accuracy: {voting_acc:.4f}")
print("\nVoting Ensemble Classification Report:")
print(classification_report(y_val, voting_pred))

Training Voting Ensemble...
Voting Ensemble Validation Accuracy: 0.6340

Voting Ensemble Classification Report:
               precision    recall  f1-score   support

  Depressants       0.20      0.10      0.14        48
Hallucinogens       0.79      0.81      0.80        73
   Stimulants       0.63      0.81      0.71        73

     accuracy                           0.63       194
    macro avg       0.54      0.57      0.55       194
 weighted avg       0.58      0.63      0.60       194

Voting Ensemble Validation Accuracy: 0.6340

Voting Ensemble Classification Report:
               precision    recall  f1-score   support

  Depressants       0.20      0.10      0.14        48
Hallucinogens       0.79      0.81      0.80        73
   Stimulants       0.63      0.81      0.71        73

     accuracy                           0.63       194
    macro avg       0.54      0.57      0.55       194
 weighted avg       0.58      0.63      0.60       194



## Model Comparison

In [73]:
# Compare all models
results = pd.DataFrame({
    'Model': ['XGBoost', 'LightGBM', 'Random Forest', 'Gradient Boosting', 'Voting Ensemble'],
    'Accuracy': [xgb_acc, lgbm_acc, rf_acc, gb_acc, voting_acc]
})

results = results.sort_values('Accuracy', ascending=False)
print("\n" + "="*50)
print("MODEL COMPARISON (sorted by accuracy)")
print("="*50)
print(results.to_string(index=False))
print("="*50)

best_model_name = results.iloc[0]['Model']
best_accuracy = results.iloc[0]['Accuracy']
print(f"\nüèÜ Best Model: {best_model_name} with accuracy {best_accuracy:.4f}")


MODEL COMPARISON (sorted by accuracy)
            Model  Accuracy
          XGBoost  0.649485
    Random Forest  0.649485
Gradient Boosting  0.644330
  Voting Ensemble  0.634021
         LightGBM  0.628866

üèÜ Best Model: XGBoost with accuracy 0.6495


## Cross-Validation on Best Model

In [74]:
# Perform cross-validation on the voting ensemble
cv_scores = cross_val_score(
    voting_clf, X_balanced, y_balanced_encoded, cv=5, scoring='accuracy'
)

print(f"\nCross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

KeyboardInterrupt: 

## Check Confusion Matrix for All Classes

In [None]:
# Confusion matrix for voting ensemble
cm = confusion_matrix(y_val, voting_pred)
classes = label_encoder.classes_

print("\nConfusion Matrix (Voting Ensemble):")
cm_df = pd.DataFrame(cm, index=classes, columns=classes)
print(cm_df)

# Calculate per-class accuracy
print("\nPer-Class Accuracy:")
for idx, cls in enumerate(classes):
    row_sum = cm[idx, :].sum()
    class_acc = cm[idx, idx] / row_sum if row_sum else 0.0
    print(f"{cls}: {class_acc:.4f} ({cm[idx, idx]}/{row_sum})")


Confusion Matrix (Voting Ensemble):
               Depressants  Hallucinogens  Stimulants
Depressants             10             11          27
Hallucinogens           11             56           6
Stimulants              12              5          56

Per-Class Accuracy:
Depressants: 0.2083 (10/48)
Hallucinogens: 0.7671 (56/73)
Stimulants: 0.7671 (56/73)


## Retrain on Full Balanced Dataset

In [75]:
# Retrain voting ensemble on full balanced dataset
print("Retraining Voting Ensemble on full balanced dataset...")

# Recreate models with same parameters
final_xgb = XGBClassifier(
    n_estimators=500, max_depth=26, learning_rate=0.01, subsample=0.8,
    colsample_bytree=0.8, random_state=42, eval_metric='mlogloss', tree_method='hist'
)

final_lgbm = LGBMClassifier(
    n_estimators=500, max_depth=26, learning_rate=0.01, subsample=0.8,
    colsample_bytree=0.8, random_state=42, verbose=-1
)

final_rf = RandomForestClassifier(
    n_estimators=500, max_depth=20, min_samples_split=5,
    min_samples_leaf=2, random_state=42, n_jobs=-1
)

final_gb = GradientBoostingClassifier(
    n_estimators=300, max_depth=15, learning_rate=0.01,
    subsample=0.8, random_state=42
)

# Create final voting ensemble
final_voting = VotingClassifier(
    estimators=[('xgb', final_xgb), ('lgbm', final_lgbm), ('rf', final_rf), ('gb', final_gb)],
    voting='soft'
)

# Train on full balanced dataset
final_voting.fit(X_balanced, y_balanced_encoded)
print("‚úì Training complete!")

Retraining Voting Ensemble on full balanced dataset...
‚úì Training complete!
‚úì Training complete!


## Prepare Test Data and Generate Predictions

In [76]:
# Prepare test data
X_test = test_df.copy()

# Drop ID column if present
if 'id' in X_test.columns:
    X_test = X_test.drop(['id'], axis=1)

# Ensure test data has same columns as training data
missing_cols = set(X_balanced.columns) - set(X_test.columns)
extra_cols = set(X_test.columns) - set(X_balanced.columns)

if missing_cols:
    print(f"Warning: Test data missing columns: {missing_cols}")
if extra_cols:
    print(f"Warning: Test data has extra columns: {extra_cols}")
    X_test = X_test[X_balanced.columns]

print(f"Test data shape: {X_test.shape}")
print(f"Expected shape: ({len(test_df)}, {X_balanced.shape[1]})")

Test data shape: (377, 45)
Expected shape: (377, 45)


In [77]:
# Generate predictions
print("Generating predictions...")
predictions_encoded = final_voting.predict(X_test)
predictions = label_encoder.inverse_transform(predictions_encoded)
prediction_probs = final_voting.predict_proba(X_test)

print(f"\nPrediction distribution:")
print(pd.Series(predictions).value_counts())
print(f"\nPercentages:")
print(pd.Series(predictions).value_counts(normalize=True) * 100)

Generating predictions...

Prediction distribution:
Hallucinogens    171
Stimulants       154
Depressants       52
Name: count, dtype: int64

Percentages:
Hallucinogens    45.358090
Stimulants       40.848806
Depressants      13.793103
Name: proportion, dtype: float64

Prediction distribution:
Hallucinogens    171
Stimulants       154
Depressants       52
Name: count, dtype: int64

Percentages:
Hallucinogens    45.358090
Stimulants       40.848806
Depressants      13.793103
Name: proportion, dtype: float64


## Validate Against Known Depressants IDs

In [78]:
# Check predictions for confirmed Depressants IDs
confirmed_depressants = [513, 521, 570, 642, 770]

print("\n" + "="*60)
print("VALIDATION: Checking Confirmed Depressants IDs")
print("="*60)

# Get class names (need to decode from numeric to string)
class_names = label_encoder.classes_
depressants_idx = list(class_names).index('Depressants')

for dep_id in confirmed_depressants:
    test_idx = dep_id - 501  # Convert submission ID to test index
    pred = predictions[test_idx]
    prob = prediction_probs[test_idx]
    dep_prob = prob[depressants_idx]
    
    status = "‚úì CAUGHT" if pred == 'Depressants' else "‚ùå MISSED"
    print(f"ID {dep_id}: Predicted={pred}, Depressants_prob={dep_prob:.4f} {status}")

caught = sum(1 for dep_id in confirmed_depressants if predictions[dep_id - 501] == 'Depressants')
print(f"\nSuccess Rate: {caught}/{len(confirmed_depressants)} ({100*caught/len(confirmed_depressants):.1f}%)")
print("="*60)


VALIDATION: Checking Confirmed Depressants IDs
ID 513: Predicted=Stimulants, Depressants_prob=0.2909 ‚ùå MISSED
ID 521: Predicted=Hallucinogens, Depressants_prob=0.2699 ‚ùå MISSED
ID 570: Predicted=Stimulants, Depressants_prob=0.3759 ‚ùå MISSED
ID 642: Predicted=Hallucinogens, Depressants_prob=0.3744 ‚ùå MISSED
ID 770: Predicted=Depressants, Depressants_prob=0.4473 ‚úì CAUGHT

Success Rate: 1/5 (20.0%)


## Create Submission File with IDs Starting from 501

In [79]:
# Create submission DataFrame
submission = pd.DataFrame({
    'ID': range(501, 501 + len(predictions)),  # IDs from 501 to 877
    'drug_category': predictions
})

# Generate timestamp for filename
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'submission_BALANCED_3CLASS_{timestamp}.csv'

# Save submission
submission.to_csv(filename, index=False)
print(f"\n‚úì Submission file created: {filename}")
print(f"\nSubmission shape: {submission.shape}")
print(f"\nFirst few rows:")
print(submission.head(10))
print(f"\nLast few rows:")
print(submission.tail(10))
print(f"\nFinal prediction distribution:")
print(submission['drug_category'].value_counts())
print(f"\nPercentages:")
print(submission['drug_category'].value_counts(normalize=True) * 100)


‚úì Submission file created: submission_BALANCED_3CLASS_20251117_100513.csv

Submission shape: (377, 2)

First few rows:
    ID  drug_category
0  501     Stimulants
1  502     Stimulants
2  503     Stimulants
3  504  Hallucinogens
4  505     Stimulants
5  506  Hallucinogens
6  507    Depressants
7  508     Stimulants
8  509  Hallucinogens
9  510     Stimulants

Last few rows:
      ID  drug_category
367  868     Stimulants
368  869    Depressants
369  870  Hallucinogens
370  871     Stimulants
371  872  Hallucinogens
372  873  Hallucinogens
373  874  Hallucinogens
374  875  Hallucinogens
375  876  Hallucinogens
376  877  Hallucinogens

Final prediction distribution:
drug_category
Hallucinogens    171
Stimulants       154
Depressants       52
Name: count, dtype: int64

Percentages:
drug_category
Hallucinogens    45.358090
Stimulants       40.848806
Depressants      13.793103
Name: proportion, dtype: float64


## Summary Statistics

In [80]:
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print(f"Training Strategy: Balanced dataset (undersampling)")
print(f"Original training samples: {len(train_df)}")
print(f"Balanced training samples: {len(X_balanced)}")
print(f"Test samples: {len(test_df)}")
print(f"\nModel: Voting Ensemble (XGBoost + LightGBM + RF + GB)")
print(f"Validation Accuracy: {voting_acc:.4f}")
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"\nPredicted Depressants: {(predictions == 'Depressants').sum()} ({100*(predictions == 'Depressants').sum()/len(predictions):.1f}%)")
print(f"Predicted Hallucinogens: {(predictions == 'Hallucinogens').sum()} ({100*(predictions == 'Hallucinogens').sum()/len(predictions):.1f}%)")
print(f"Predicted Stimulants: {(predictions == 'Stimulants').sum()} ({100*(predictions == 'Stimulants').sum()/len(predictions):.1f}%)")
print(f"\nConfirmed Depressants caught: {caught}/{len(confirmed_depressants)}")
print(f"\nSubmission file: {filename}")
print("="*60)


FINAL SUMMARY
Training Strategy: Balanced dataset (undersampling)
Original training samples: 1500
Balanced training samples: 968
Test samples: 377

Model: Voting Ensemble (XGBoost + LightGBM + RF + GB)
Validation Accuracy: 0.6340
Cross-Validation Accuracy: 0.6157 (+/- 0.0717)

Predicted Depressants: 52 (13.8%)
Predicted Hallucinogens: 171 (45.4%)
Predicted Stimulants: 154 (40.8%)

Confirmed Depressants caught: 1/5

Submission file: submission_BALANCED_3CLASS_20251117_100513.csv
