In [9]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras import layers, models
from scikeras.wrappers import KerasClassifier
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'scikeras'

In [None]:
# Cell 2: Load Data
# Both CSV files are read from the current working directory.
train_data = pd.read_csv('data_minihackathon_train.csv')
test_data = pd.read_csv('data_minihackathon_test.csv')

# Dataset dimensions first, then the number of missing entries per training column.
print(f"Training data shape: {train_data.shape}", f"Test data shape: {test_data.shape}", sep="\n")
missing_counts = train_data.isna().sum()
print("\nMissing values in training data:")
print(missing_counts)

In [None]:
# Cell 3: Define Mapping for New Attribute Values (Based on Image)
# Each dictionary maps a value currently stored in the column to its replacement.
age_map = {
    -0.95197: -3.46436,  # 18-24
    -0.07854: -2.75696,  # 25-34
    0.49788: -1.86962,   # 35-44
    1.09449: -0.92104,   # 45-54
    1.82213: 0.13606,    # 55-64
    2.59171: 1.60383     # 65+
}
# Every key below maps to itself, so this mapping leaves 'Gender' unchanged.
gender_map = {
    0.48246: 0.48246,    # Female
    -0.48246: -0.48246   # Male
}
# Every key below maps to itself, so this mapping leaves 'Education' unchanged.
education_map = {
    -2.43591: -2.43591,  # Left School Before 16
    -1.73790: -1.73790,  # Left School at 16
    -1.43719: -1.43719,  # Left School at 17
    -1.22751: -1.22751,  # Left School at 18
    -0.61113: -0.61113,  # Some College, No Certificate Or Degree
    -0.05921: -0.05921,  # Professional Certificate/Diploma
    0.45468: 0.45468,    # University Degree
    1.16365: 1.16365,    # Masters Degree
    1.98437: 1.98437     # Doctorate Degree
}

# Apply mapping to train and test data
column_maps = {'Age': age_map, 'Gender': gender_map, 'Education': education_map}
for frame in (train_data, test_data):
    for column, value_map in column_maps.items():
        current_values = frame[column]
        # Values without an entry in the map come back as NaN from .map();
        # fillna restores the original value for those rows.
        frame[column] = current_values.map(value_map).fillna(current_values)

In [None]:
# Cell 4: Drop Irrelevant Features
# Columns excluded from the feature matrix of both splits.
features_to_drop = ['Country', 'Ethnicity', 'ID']

# The target column is removed from the training features and kept separately as y_train.
X_train = train_data.drop(columns=['drug_category'] + features_to_drop)
y_train = train_data['drug_category']

# 'ID' is already listed in features_to_drop, so it is not repeated here.
X_test = test_data.drop(columns=features_to_drop)

print("\nFeatures after dropping:")
print(X_train.columns)

In [None]:
# Cell 5: Handle Missing Values
# The per-column medians are learned from the training features only and then
# applied to both the training and the test features.
imputer = SimpleImputer(strategy='median').fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

print("\nData shape after imputation:")
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

In [None]:
# Cell 6: Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nShape after SMOTE:")
print(f"X_train_resampled shape: {X_train_resampled.shape}, y_train_resampled shape: {y_train_resampled.shape}")

In [None]:
# Cell 7: Encode Target Variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_resampled)

# Split into train and validation sets
X_train_split, X_val, y_train_encoded_split, y_val = train_test_split(
    X_train_resampled, y_train_encoded, test_size=0.2, random_state=42, stratify=y_train_encoded
)

print(f"Train split shape: {X_train_split.shape}, Val shape: {X_val.shape}")

In [None]:
# Cell 8: Scale Features
# Scaling statistics come from the training split only; the same fitted scaler
# is then applied to the training, validation and test features.
scaler = StandardScaler().fit(X_train_split)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train_split, X_val, X_test)
)

print("\nData scaled successfully.")

In [None]:
# Cell 9: Parameter Tuning with GridSearchCV
def create_model(learning_rate=0.001, layer1_units=64, layer2_units=32, dropout_rate=0.2):
    """Build and compile a two-hidden-layer dense classifier.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.
        layer1_units: Number of units in the first hidden Dense layer.
        layer2_units: Number of units in the second hidden Dense layer.
        dropout_rate: Rate used by the Dropout layer after each hidden layer.

    Returns:
        A compiled Sequential model with a 3-unit softmax output, using
        sparse categorical cross-entropy loss and the accuracy metric.

    Note:
        The input width is read from the module-level ``X_train_scaled``, so
        that array has to exist before this function is called.
    """
    n_features = X_train_scaled.shape[1]
    layer_stack = [
        layers.Dense(layer1_units, activation='relu', input_shape=(n_features,)),
        layers.Dropout(dropout_rate),
        layers.Dense(layer2_units, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(3, activation='softmax'),  # 3 classes: Depressants, Hallucinogens, Stimulants
    ]
    network = models.Sequential(layer_stack)

    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(
        optimizer=adam,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# The tunable arguments of create_model are declared on the wrapper as keyword
# arguments. SciKeras only accepts parameters in set_params() (and therefore in
# a GridSearchCV param_grid) that were declared on the constructor or carry a
# routing prefix such as `model__`; undeclared names are rejected as invalid
# parameters. The values given here mirror the defaults of create_model.
model = KerasClassifier(
    model=create_model,
    learning_rate=0.001,
    layer1_units=64,
    layer2_units=32,
    dropout_rate=0.2,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    verbose=0,
)

# Every combination below is evaluated with 3-fold cross-validation.
param_grid = {
    'batch_size': [32, 64],
    'epochs': [50],
    'learning_rate': [0.001, 0.01],
    'layer1_units': [64, 128],
    'layer2_units': [32, 64],
    'dropout_rate': [0.2, 0.3]
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_result = grid.fit(X_train_scaled, y_train_encoded_split)

print("\nBest parameters:", grid_result.best_params_)
print("Best cross-validation score:", grid_result.best_score_)

In [None]:
# Cell 10: Train Best Model
# Rebuild a fresh network from the hyper-parameters selected by the grid search.
best_params = grid_result.best_params_
architecture_keys = ('learning_rate', 'layer1_units', 'layer2_units', 'dropout_rate')
best_model = create_model(**{key: best_params[key] for key in architecture_keys})

# Fit on the training split while tracking the held-out validation split each epoch.
history = best_model.fit(
    X_train_scaled,
    y_train_encoded_split,
    validation_data=(X_val_scaled, y_val),
    batch_size=best_params['batch_size'],
    epochs=best_params['epochs'],
    verbose=1,
)

# Evaluate on validation set
val_loss, val_accuracy = best_model.evaluate(X_val_scaled, y_val, verbose=0)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")