In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## Load Data

In [8]:
# Load training data
train_df = pd.read_csv('data/data_minihackathon_train_engineered.csv')
test_df = pd.read_csv('data/data_minihackathon_test_engineered.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"\nClass distribution BEFORE balancing:")
print(train_df['drug_category'].value_counts())
print(f"\nPercentages:")
print(train_df['drug_category'].value_counts(normalize=True) * 100)

Training data shape: (1500, 46)
Test data shape: (377, 45)

Class distribution BEFORE balancing:
drug_category
Hallucinogens    691
Stimulants       567
Depressants      242
Name: count, dtype: int64

Percentages:
drug_category
Hallucinogens    46.066667
Stimulants       37.800000
Depressants      16.133333
Name: proportion, dtype: float64


## Balance Dataset Using Undersampling
We'll undersample Hallucinogens and Stimulants to match Depressants frequency

In [None]:
# Separate features and target
X = train_df.drop(['drug_category'], axis=1)
y = train_df['drug_category']

# Check for ID column and drop it
if 'id' in X.columns:
    X = X.drop(['id'], axis=1)

# Encode labels to numeric values
label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(y)print(f"\nLabel encoding: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

print(f"Target shape: {y.shape}")
print(f"Features shape: {X.shape}")

Features shape: (1500, 45)
Target shape: (1500,)


In [None]:
# Strategy 1: Balance all classes equally (1:1:1 ratio)
# This gives each class equal representation

# Count samples per class
class_counts = y.value_counts()
min_class_count = class_counts.min()

print(f"\nBalancing Strategy: Equal representation for all classes")
print(f"Target samples per class: {min_class_count}")

# Use RandomUnderSampler to balance classes
from imblearn.under_sampling import RandomUnderSampler

# Create balanced dataset (1:1:1 ratio)
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_balanced, y_balanced_encoded = undersampler.fit_resample(X, y_encoded)

# Decode back to string labels for display
y_balanced = label_encoder.inverse_transform(y_balanced_encoded)

print(f"\nBalanced dataset shape: {X_balanced.shape}")
print(f"\nClass distribution AFTER balancing:")

print(pd.Series(y_balanced).value_counts())print(pd.Series(y_balanced).value_counts(normalize=True) * 100)
print(f"\nPercentages:")


Balancing Strategy: Equal representation for all classes
Target samples per class: 242

Balanced dataset shape: (726, 45)

Class distribution AFTER balancing:
drug_category
Depressants      242
Hallucinogens    242
Stimulants       242
Name: count, dtype: int64

Percentages:
drug_category
Depressants      33.333333
Hallucinogens    33.333333
Stimulants       33.333333
Name: proportion, dtype: float64


## Train-Test Split for Validation

In [None]:
# Split balanced data for validation
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.2, random_state=42, stratify=y_balanced_encoded
)

# Keep string versions for display
y_train = label_encoder.inverse_transform(y_train_encoded)
y_val = label_encoder.inverse_transform(y_val_encoded)


print(f"Training set: {X_train.shape}")print(pd.Series(y_train).value_counts())

print(f"Validation set: {X_val.shape}")print(f"\nTraining set class distribution:")

Training set: (580, 45)
Validation set: (146, 45)

Training set class distribution:
drug_category
Depressants      194
Stimulants       193
Hallucinogens    193
Name: count, dtype: int64


## Train Multiple Models on Balanced Data

In [None]:
# Model 1: XGBoost with balanced classes
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss',
    tree_method='hist'
)

print("Training XGBoost...")
xgb_model.fit(X_train, y_train_encoded)
xgb_pred_encoded = xgb_model.predict(X_val)
xgb_pred = label_encoder.inverse_transform(xgb_pred_encoded)
xgb_acc = accuracy_score(y_val, xgb_pred)
print(f"XGBoost Validation Accuracy: {xgb_acc:.4f}")
print("\nXGBoost Classification Report:")
print(classification_report(y_val, xgb_pred))

Training XGBoost...


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['Depressants' 'Hallucinogens' 'Stimulants']

In [None]:
# Model 2: LightGBM with balanced classes
lgbm_model = LGBMClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

print("Training LightGBM...")
lgbm_model.fit(X_train, y_train_encoded)
lgbm_pred_encoded = lgbm_model.predict(X_val)
lgbm_pred = label_encoder.inverse_transform(lgbm_pred_encoded)
lgbm_acc = accuracy_score(y_val, lgbm_pred)
print(f"LightGBM Validation Accuracy: {lgbm_acc:.4f}")
print("\nLightGBM Classification Report:")
print(classification_report(y_val, lgbm_pred))

In [None]:
# Model 3: Random Forest with balanced classes
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("Training Random Forest...")
rf_model.fit(X_train, y_train_encoded)
rf_pred_encoded = rf_model.predict(X_val)
rf_pred = label_encoder.inverse_transform(rf_pred_encoded)
rf_acc = accuracy_score(y_val, rf_pred)
print(f"Random Forest Validation Accuracy: {rf_acc:.4f}")
print("\nRandom Forest Classification Report:")
print(classification_report(y_val, rf_pred))

In [None]:
# Model 4: Gradient Boosting
gb_model = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    random_state=42
)

print("Training Gradient Boosting...")
gb_model.fit(X_train, y_train_encoded)
gb_pred_encoded = gb_model.predict(X_val)
gb_pred = label_encoder.inverse_transform(gb_pred_encoded)
gb_acc = accuracy_score(y_val, gb_pred)
print(f"Gradient Boosting Validation Accuracy: {gb_acc:.4f}")
print("\nGradient Boosting Classification Report:")
print(classification_report(y_val, gb_pred))

## Create Voting Ensemble

In [None]:
# Create voting ensemble with soft voting (uses probabilities)
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgbm', lgbm_model),
        ('rf', rf_model),
        ('gb', gb_model)
    ],
    voting='soft'
)

print("Training Voting Ensemble...")
voting_clf.fit(X_train, y_train_encoded)
voting_pred_encoded = voting_clf.predict(X_val)
voting_pred = label_encoder.inverse_transform(voting_pred_encoded)
voting_acc = accuracy_score(y_val, voting_pred)
print(f"Voting Ensemble Validation Accuracy: {voting_acc:.4f}")
print("\nVoting Ensemble Classification Report:")
print(classification_report(y_val, voting_pred))

## Model Comparison

In [None]:
# Compare all models
results = pd.DataFrame({
    'Model': ['XGBoost', 'LightGBM', 'Random Forest', 'Gradient Boosting', 'Voting Ensemble'],
    'Accuracy': [xgb_acc, lgbm_acc, rf_acc, gb_acc, voting_acc]
})

results = results.sort_values('Accuracy', ascending=False)
print("\n" + "="*50)
print("MODEL COMPARISON (sorted by accuracy)")
print("="*50)
print(results.to_string(index=False))
print("="*50)

best_model_name = results.iloc[0]['Model']
best_accuracy = results.iloc[0]['Accuracy']
print(f"\nüèÜ Best Model: {best_model_name} with accuracy {best_accuracy:.4f}")

## Cross-Validation on Best Model

In [None]:
# Perform cross-validation on the voting ensemble
cv_scores = cross_val_score(
    voting_clf, X_balanced, y_balanced_encoded, cv=5, scoring='accuracy'
)

print(f"\nCross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

## Check Confusion Matrix for All Classes

In [None]:
# Confusion matrix for voting ensemble
cm = confusion_matrix(y_val, voting_pred)
classes = sorted(y_balanced.unique())

print("\nConfusion Matrix (Voting Ensemble):")
cm_df = pd.DataFrame(cm, index=classes, columns=classes)
print(cm_df)

# Calculate per-class accuracy
print("\nPer-Class Accuracy:")
for i, cls in enumerate(classes):
    class_acc = cm[i, i] / cm[i, :].sum()
    print(f"{cls}: {class_acc:.4f} ({cm[i, i]}/{cm[i, :].sum()})")

## Retrain on Full Balanced Dataset

In [None]:
# Retrain voting ensemble on full balanced dataset
print("Retraining Voting Ensemble on full balanced dataset...")

# Recreate models with same parameters
final_xgb = XGBClassifier(
    n_estimators=500, max_depth=6, learning_rate=0.05, subsample=0.8,
    colsample_bytree=0.8, random_state=42, eval_metric='mlogloss', tree_method='hist'
)

final_lgbm = LGBMClassifier(
    n_estimators=500, max_depth=6, learning_rate=0.05, subsample=0.8,
    colsample_bytree=0.8, random_state=42, verbose=-1
)

final_rf = RandomForestClassifier(
    n_estimators=500, max_depth=10, min_samples_split=5,
    min_samples_leaf=2, random_state=42, n_jobs=-1
)

final_gb = GradientBoostingClassifier(
    n_estimators=300, max_depth=5, learning_rate=0.05,
    subsample=0.8, random_state=42
)

# Create final voting ensemble
final_voting = VotingClassifier(
    estimators=[('xgb', final_xgb), ('lgbm', final_lgbm), ('rf', final_rf), ('gb', final_gb)],
    voting='soft'
)

# Train on full balanced dataset
final_voting.fit(X_balanced, y_balanced)
print("‚úì Training complete!")

## Prepare Test Data and Generate Predictions

In [None]:
# Prepare test data
X_test = test_df.copy()

# Drop ID column if present
if 'id' in X_test.columns:
    X_test = X_test.drop(['id'], axis=1)

# Ensure test data has same columns as training data
missing_cols = set(X_balanced.columns) - set(X_test.columns)
extra_cols = set(X_test.columns) - set(X_balanced.columns)

if missing_cols:
    print(f"Warning: Test data missing columns: {missing_cols}")
if extra_cols:
    print(f"Warning: Test data has extra columns: {extra_cols}")
    X_test = X_test[X_balanced.columns]

print(f"Test data shape: {X_test.shape}")
print(f"Expected shape: ({len(test_df)}, {X_balanced.shape[1]})")

In [None]:
# Generate predictions
print("Generating predictions...")
predictions = final_voting.predict(X_test)
prediction_probs = final_voting.predict_proba(X_test)

print(f"\nPrediction distribution:")
print(pd.Series(predictions).value_counts())
print(f"\nPercentages:")
print(pd.Series(predictions).value_counts(normalize=True) * 100)

## Validate Against Known Depressants IDs

In [None]:
# Check predictions for confirmed Depressants IDs
confirmed_depressants = [513, 521, 570, 642, 770]

print("\n" + "="*60)
print("VALIDATION: Checking Confirmed Depressants IDs")
print("="*60)

# Get class names
class_names = final_voting.classes_
depressants_idx = list(class_names).index('Depressants')

for dep_id in confirmed_depressants:
    test_idx = dep_id - 501  # Convert submission ID to test index
    pred = predictions[test_idx]
    prob = prediction_probs[test_idx]
    dep_prob = prob[depressants_idx]
    
    status = "‚úì CAUGHT" if pred == 'Depressants' else "‚ùå MISSED"
    print(f"ID {dep_id}: Predicted={pred}, Depressants_prob={dep_prob:.4f} {status}")

caught = sum(1 for dep_id in confirmed_depressants if predictions[dep_id - 501] == 'Depressants')
print(f"\nSuccess Rate: {caught}/{len(confirmed_depressants)} ({100*caught/len(confirmed_depressants):.1f}%)")
print("="*60)

## Create Submission File with IDs Starting from 501

In [None]:
# Create submission DataFrame
submission = pd.DataFrame({
    'ID': range(501, 501 + len(predictions)),  # IDs from 501 to 877
    'drug_category': predictions
})

# Generate timestamp for filename
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'submission_BALANCED_3CLASS_{timestamp}.csv'

# Save submission
submission.to_csv(filename, index=False)
print(f"\n‚úì Submission file created: {filename}")
print(f"\nSubmission shape: {submission.shape}")
print(f"\nFirst few rows:")
print(submission.head(10))
print(f"\nLast few rows:")
print(submission.tail(10))
print(f"\nFinal prediction distribution:")
print(submission['drug_category'].value_counts())
print(f"\nPercentages:")
print(submission['drug_category'].value_counts(normalize=True) * 100)

## Summary Statistics

In [None]:
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print(f"Training Strategy: Balanced dataset (undersampling)")
print(f"Original training samples: {len(train_df)}")
print(f"Balanced training samples: {len(X_balanced)}")
print(f"Test samples: {len(test_df)}")
print(f"\nModel: Voting Ensemble (XGBoost + LightGBM + RF + GB)")
print(f"Validation Accuracy: {voting_acc:.4f}")
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"\nPredicted Depressants: {(predictions == 'Depressants').sum()} ({100*(predictions == 'Depressants').sum()/len(predictions):.1f}%)")
print(f"Predicted Hallucinogens: {(predictions == 'Hallucinogens').sum()} ({100*(predictions == 'Hallucinogens').sum()/len(predictions):.1f}%)")
print(f"Predicted Stimulants: {(predictions == 'Stimulants').sum()} ({100*(predictions == 'Stimulants').sum()/len(predictions):.1f}%)")
print(f"\nConfirmed Depressants caught: {caught}/{len(confirmed_depressants)}")
print(f"\nSubmission file: {filename}")
print("="*60)