In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [4]:
print("=" * 60)
print("IMPROVED DRUG CLASSIFICATION MODEL")
print("=" * 60)

# Load data
print("\n[1/8] Loading data...")
train_df = pd.read_csv('data_minihackathon_train.csv')
test_df = pd.read_csv('data_minihackathon_test.csv')
print(f"✓ Train shape: {train_df.shape}")
print(f"✓ Test shape: {test_df.shape}")

IMPROVED DRUG CLASSIFICATION MODEL

[1/8] Loading data...
✓ Train shape: (1500, 14)
✓ Test shape: (377, 13)


In [5]:
# Handle missing values
print("\n[2/8] Handling missing values...")
train_missing = train_df.isnull().sum().sum()
test_missing = test_df.isnull().sum().sum()
print(f"✓ Train missing values: {train_missing}")
print(f"✓ Test missing values: {test_missing}")


[2/8] Handling missing values...
✓ Train missing values: 31
✓ Test missing values: 12


In [6]:
# Fill missing values with median
for col in train_df.columns:
    if train_df[col].isnull().any():
        median_val = train_df[col].median()
        train_df[col].fillna(median_val, inplace=True)
        test_df[col].fillna(median_val, inplace=True)

In [7]:
# Feature engineering
print("\n[3/8] Engineering features...")
def create_features(df):
    df_new = df.copy()
    
    # Personality score interactions
    df_new['N_E_interaction'] = df_new['Nscore'] * df_new['Escore']
    df_new['O_A_interaction'] = df_new['Oscore'] * df_new['Ascore']
    df_new['C_SS_interaction'] = df_new['Cscore'] * df_new['SS']
    df_new['Impulsive_SS'] = df_new['Impulsive'] * df_new['SS']
    
    # Squared features for non-linear patterns
    df_new['Nscore_sq'] = df_new['Nscore'] ** 2
    df_new['SS_sq'] = df_new['SS'] ** 2
    df_new['Impulsive_sq'] = df_new['Impulsive'] ** 2
    
    # Risk profile
    df_new['risk_score'] = (df_new['Nscore'] + df_new['Impulsive'] + 
                             df_new['SS'] - df_new['Cscore'])
    
    # Personality sum and mean
    personality_cols = ['Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore']
    df_new['personality_sum'] = df_new[personality_cols].sum(axis=1)
    df_new['personality_mean'] = df_new[personality_cols].mean(axis=1)
    
    # Age groups
    df_new['Age_squared'] = df_new['Age'] ** 2
    
    return df_new

train_df = create_features(train_df)
test_df = create_features(test_df)
print(f"✓ Created {train_df.shape[1] - 14} new features")


[3/8] Engineering features...
✓ Created 11 new features


In [8]:
# Prepare features and target
X = train_df.drop(['ID', 'drug_category'], axis=1)
y = train_df['drug_category']
X_test = test_df.drop(['ID'], axis=1)

print(f"\n[4/8] Preparing data...")
print(f"✓ Feature matrix shape: {X.shape}")
print(f"✓ Target classes: {y.unique()}")
print(f"✓ Class distribution:")
for cls in sorted(y.unique()):
    count = (y == cls).sum()
    pct = 100 * count / len(y)
    print(f"  {cls}: {count} ({pct:.1f}%)")


[4/8] Preparing data...
✓ Feature matrix shape: (1500, 23)
✓ Target classes: ['Depressants' 'Hallucinogens' 'Stimulants']
✓ Class distribution:
  Depressants: 242 (16.1%)
  Hallucinogens: 691 (46.1%)
  Stimulants: 567 (37.8%)


In [9]:
# Scale features
print("\n[5/8] Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
print("✓ Features scaled")


[5/8] Scaling features...
✓ Features scaled


In [10]:
# Define models with optimized parameters
print("\n[6/8] Training ensemble models...")
models = {
    'XGBoost': XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='mlogloss'
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        min_samples_split=10,
        min_samples_leaf=4,
        random_state=42,
        n_jobs=-1
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        random_state=42
    ),
    'LogisticRegression': LogisticRegression(
        max_iter=1000,
        C=0.1,
        random_state=42,
        n_jobs=-1
    )
}


[6/8] Training ensemble models...


In [11]:
# Cross-validation
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = {}
test_predictions = {}

for name, model in models.items():
    print(f"\n  Training {name}...")
    fold_scores = []
    fold_predictions = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_scaled, y), 1):
        X_train_fold = X_scaled[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_val_fold = X_scaled[val_idx]
        y_val_fold = y.iloc[val_idx]
        
        model_clone = model.__class__(**model.get_params())
        model_clone.fit(X_train_fold, y_train_fold)
        
        val_pred = model_clone.predict(X_val_fold)
        fold_score = accuracy_score(y_val_fold, val_pred)
        fold_scores.append(fold_score)
        
        # Predict on test set
        test_pred = model_clone.predict(X_test_scaled)
        fold_predictions.append(test_pred)
    
    cv_scores[name] = fold_scores
    # Average predictions across folds (majority voting)
    test_predictions[name] = np.array(fold_predictions)
    
    mean_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    print(f"    CV Accuracy: {mean_score:.4f} (+/- {std_score:.4f})")


  Training XGBoost...


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['Depressants' 'Hallucinogens' 'Stimulants']

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

print("=" * 60)
print("IMPROVED DRUG CLASSIFICATION MODEL")
print("=" * 60)

# Load data
print("\n[1/8] Loading data...")
train_df = pd.read_csv('data_minihackathon_train.csv')
test_df = pd.read_csv('data_minihackathon_test.csv')
print(f"✓ Train shape: {train_df.shape}")
print(f"✓ Test shape: {test_df.shape}")

# Handle missing values
print("\n[2/8] Handling missing values...")
train_missing = train_df.isnull().sum().sum()
test_missing = test_df.isnull().sum().sum()
print(f"✓ Train missing values: {train_missing}")
print(f"✓ Test missing values: {test_missing}")

# Fill missing values with median
for col in train_df.columns:
    if train_df[col].isnull().any():
        median_val = train_df[col].median()
        train_df[col].fillna(median_val, inplace=True)
        test_df[col].fillna(median_val, inplace=True)

# Feature engineering
print("\n[3/8] Engineering features...")
def create_features(df):
    df_new = df.copy()
    
    # Personality score interactions
    df_new['N_E_interaction'] = df_new['Nscore'] * df_new['Escore']
    df_new['O_A_interaction'] = df_new['Oscore'] * df_new['Ascore']
    df_new['C_SS_interaction'] = df_new['Cscore'] * df_new['SS']
    df_new['Impulsive_SS'] = df_new['Impulsive'] * df_new['SS']
    
    # Squared features for non-linear patterns
    df_new['Nscore_sq'] = df_new['Nscore'] ** 2
    df_new['SS_sq'] = df_new['SS'] ** 2
    df_new['Impulsive_sq'] = df_new['Impulsive'] ** 2
    
    # Risk profile
    df_new['risk_score'] = (df_new['Nscore'] + df_new['Impulsive'] + 
                             df_new['SS'] - df_new['Cscore'])
    
    # Personality sum and mean
    personality_cols = ['Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore']
    df_new['personality_sum'] = df_new[personality_cols].sum(axis=1)
    df_new['personality_mean'] = df_new[personality_cols].mean(axis=1)
    
    # Age groups
    df_new['Age_squared'] = df_new['Age'] ** 2
    
    return df_new

train_df = create_features(train_df)
test_df = create_features(test_df)
print(f"✓ Created {train_df.shape[1] - 14} new features")

# Prepare features and target
X = train_df.drop(['ID', 'drug_category'], axis=1)
y = train_df['drug_category']
X_test = test_df.drop(['ID'], axis=1)

print(f"\n[4/8] Preparing data...")
print(f"✓ Feature matrix shape: {X.shape}")
print(f"✓ Target classes: {y.unique()}")

# Encode target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"✓ Class mapping: {class_mapping}")

print(f"✓ Class distribution:")
for cls in sorted(y.unique()):
    count = (y == cls).sum()
    pct = 100 * count / len(y)
    print(f"  {cls}: {count} ({pct:.1f}%)")

# Update y to use encoded values
y = pd.Series(y_encoded, index=y.index)

# Scale features
print("\n[5/8] Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
print("✓ Features scaled")

# Define models with optimized parameters
print("\n[6/8] Training ensemble models...")
models = {
    'XGBoost': XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='mlogloss'
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        min_samples_split=10,
        min_samples_leaf=4,
        random_state=42,
        n_jobs=-1
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        random_state=42
    ),
    'LogisticRegression': LogisticRegression(
        max_iter=1000,
        C=0.1,
        random_state=42,
        n_jobs=-1
    )
}

# Cross-validation
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = {}
test_predictions = {}

for name, model in models.items():
    print(f"\n  Training {name}...")
    fold_scores = []
    fold_predictions = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_scaled, y), 1):
        X_train_fold = X_scaled[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_val_fold = X_scaled[val_idx]
        y_val_fold = y.iloc[val_idx]
        
        model_clone = model.__class__(**model.get_params())
        model_clone.fit(X_train_fold, y_train_fold)
        
        val_pred = model_clone.predict(X_val_fold)
        fold_score = accuracy_score(y_val_fold, val_pred)
        fold_scores.append(fold_score)
        
        # Predict on test set
        test_pred = model_clone.predict(X_test_scaled)
        fold_predictions.append(test_pred)
    
    cv_scores[name] = fold_scores
    # Average predictions across folds (majority voting)
    test_predictions[name] = np.array(fold_predictions)
    
    mean_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    print(f"    CV Accuracy: {mean_score:.4f} (+/- {std_score:.4f})")

# Weighted ensemble
print("\n[7/8] Creating weighted ensemble...")
weights = {
    'XGBoost': 0.30,
    'LightGBM': 0.30,
    'RandomForest': 0.20,
    'GradientBoosting': 0.15,
    'LogisticRegression': 0.05
}

# Majority voting with weights
ensemble_predictions = []
for i in range(len(X_test)):
    votes = {}
    for model_name, preds in test_predictions.items():
        # Average predictions across folds for this model
        model_pred = pd.Series([fold_preds[i] for fold_preds in preds]).mode()[0]
        weight = weights[model_name]
        votes[model_pred] = votes.get(model_pred, 0) + weight
    
    # Select class with highest weighted vote
    final_pred = max(votes, key=votes.get)
    ensemble_predictions.append(final_pred)

print("\nModel Performance Summary:")
print("-" * 60)
for name, scores in cv_scores.items():
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"{name:20s}: {mean_score:.4f} (+/- {std_score:.4f})")

ensemble_cv = np.mean([np.mean(scores) for scores in cv_scores.values()])
print(f"{'Ensemble (estimated)':20s}: {ensemble_cv:.4f}")

# Create submission
print("\n[8/8] Creating submission file...")

# Decode predictions back to original labels
ensemble_predictions_decoded = le.inverse_transform(ensemble_predictions)

submission = pd.DataFrame({
    'ID': test_df['ID'],
    'drug_category': ensemble_predictions_decoded
})

submission.to_csv('improved_submission.csv', index=False)
print("✓ Submission saved to 'improved_submission.csv'")

print("\n" + "=" * 60)
print("PREDICTION SUMMARY")
print("=" * 60)
pred_counts = submission['drug_category'].value_counts().sort_index()
print("\nPredicted class distribution:")
for cls in pred_counts.index:
    count = pred_counts[cls]
    pct = 100 * count / len(submission)
    print(f"  {cls}: {count} ({pct:.1f}%)")

print("\n✓ COMPLETE - Check 'improved_submission.csv' for results")
print("=" * 60)

IMPROVED DRUG CLASSIFICATION MODEL

[1/8] Loading data...
✓ Train shape: (1500, 14)
✓ Test shape: (377, 13)

[2/8] Handling missing values...
✓ Train missing values: 31
✓ Test missing values: 12

[3/8] Engineering features...
✓ Created 11 new features

[4/8] Preparing data...
✓ Feature matrix shape: (1500, 23)
✓ Target classes: ['Depressants' 'Hallucinogens' 'Stimulants']
✓ Class mapping: {'Depressants': 0, 'Hallucinogens': 1, 'Stimulants': 2}
✓ Class distribution:
  Depressants: 242 (16.1%)
  Hallucinogens: 691 (46.1%)
  Stimulants: 567 (37.8%)

[5/8] Scaling features...
✓ Features scaled

[6/8] Training ensemble models...

  Training XGBoost...
    CV Accuracy: 0.7020 (+/- 0.0206)

  Training LightGBM...
    CV Accuracy: 0.6920 (+/- 0.0183)

  Training RandomForest...
    CV Accuracy: 0.7113 (+/- 0.0201)

  Training GradientBoosting...
    CV Accuracy: 0.7127 (+/- 0.0168)

  Training LogisticRegression...
    CV Accuracy: 0.7093 (+/- 0.0144)

[7/8] Creating weighted ensemble...

Mode