In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report

# Load datasets
# Every challenge CSV lives in the same directory; keeping that location in a
# single constant means only this line changes when the data moves.
DATA_DIR = "/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025"

# Training split: snippet metadata (carries the "activity" label) + raw signals
metadata = pd.read_csv(f"{DATA_DIR}/metadata.csv")
signals = pd.read_csv(f"{DATA_DIR}/signals.csv")
# Hold-out split, used further down as the validation set
metadata_test = pd.read_csv(f"{DATA_DIR}/metadata_test.csv")
signals_test = pd.read_csv(f"{DATA_DIR}/signals_test.csv")
# Split that the submission file is produced for
metadata_kaggle = pd.read_csv(f"{DATA_DIR}/metadata_kaggle.csv")
signals_kaggle = pd.read_csv(f"{DATA_DIR}/signals_kaggle.csv")

# Feature engineering
def create_features(df):
    """Summarise raw signal rows into one feature row per ``user_snippet``.

    Parameters
    ----------
    df : pandas.DataFrame
        Signal rows containing the columns ``user_snippet``, ``x-axis``,
        ``y-axis``, ``z-axis`` and ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``user_snippet`` (in sorted group order) with,
        for every axis, the mean / std / max / min / median / skew / sum,
        plus ``timestamp_count`` and five derived columns: ``mag_mean``,
        ``total_energy``, ``x_y_ratio``, ``x_z_ratio`` and ``y_z_ratio``.
    """
    per_axis_stats = ['mean', 'std', 'max', 'min', 'median', 'skew', 'sum']
    agg_spec = {
        "x-axis": list(per_axis_stats),
        "y-axis": list(per_axis_stats),
        "z-axis": list(per_axis_stats),
        "timestamp": ['count'],
    }
    feature_df = df.groupby("user_snippet").agg(agg_spec)

    # agg() yields (column, statistic) pairs; flatten them to "column_statistic"
    flat_names = []
    for name_parts in feature_df.columns.to_flat_index():
        flat_names.append('_'.join(name_parts).strip())
    feature_df.columns = flat_names
    feature_df = feature_df.reset_index()

    # Extra derived features
    x_mean = feature_df["x-axis_mean"]
    y_mean = feature_df["y-axis_mean"]
    z_mean = feature_df["z-axis_mean"]
    x_sum = feature_df["x-axis_sum"]
    y_sum = feature_df["y-axis_sum"]
    z_sum = feature_df["z-axis_sum"]

    # Euclidean norm of the per-axis means, then of the per-axis sums
    feature_df["mag_mean"] = np.sqrt(x_mean**2 + y_mean**2 + z_mean**2)
    feature_df["total_energy"] = np.sqrt(x_sum**2 + y_sum**2 + z_sum**2)

    # The small offset keeps an exactly-zero denominator from dividing by zero
    feature_df["x_y_ratio"] = x_mean / (y_mean + 1e-5)
    feature_df["x_z_ratio"] = x_mean / (z_mean + 1e-5)
    feature_df["y_z_ratio"] = y_mean / (z_mean + 1e-5)

    return feature_df

# Create features for all datasets
train_features = create_features(signals)
val_features = create_features(signals_test)
kaggle_features = create_features(signals_kaggle)

# Merge with metadata (inner join on the snippet id: only snippets present on
# both sides survive)
train_df = metadata.merge(train_features, on="user_snippet")
val_df = metadata_test.merge(val_features, on="user_snippet")
kaggle_df = metadata_kaggle.merge(kaggle_features, on="user_snippet")

# Fail fast: the inner join silently drops Kaggle snippets that have no signal
# rows, which would leave the submission short. Checking here reports the
# problem before any training time is spent instead of after it.
missing_snippets = len(metadata_kaggle) - len(kaggle_df)
if missing_snippets:
    raise ValueError(
        f"{missing_snippets} row(s) of metadata_kaggle have no signal data in "
        "signals_kaggle; the submission would be incomplete"
    )

# Encode labels (the encoder is fitted on the training labels only)
label_encoder = LabelEncoder()
train_df["activity_encoded"] = label_encoder.fit_transform(train_df["activity"])
val_df["activity_encoded"] = label_encoder.transform(val_df["activity"])

# Select features and targets: everything except the id and label columns
X_train = train_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_train = train_df["activity_encoded"]
X_val = val_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_val = val_df["activity_encoded"]
X_kaggle = kaggle_df.drop(columns=["user_snippet"])

# Scale features (mean/variance are estimated on the training split only and
# then re-used for the other two splits)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_kaggle_scaled = scaler.transform(X_kaggle)

# Define base models
rf = RandomForestClassifier(n_estimators=15, min_samples_split=2, min_samples_leaf=4,
                            max_depth=5, max_features='log2', random_state=42)
gbm = GradientBoostingClassifier(learning_rate=0.1, n_estimators=200, random_state=42)

# Define the stacking ensemble: a logistic regression is fitted on the base
# models' 5-fold cross-validated predictions
stacked_model = StackingClassifier(
    estimators=[('rf', rf), ('gbm', gbm)],
    final_estimator=LogisticRegression(max_iter=1000, penalty='l2', C=0.1),
    cv=5,
    n_jobs=-1
)

# Train model
stacked_model.fit(X_train_scaled, y_train)

# Evaluate on validation set
y_val_pred = stacked_model.predict(X_val_scaled)
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

# Predict on Kaggle test set
y_kaggle_pred = stacked_model.predict(X_kaggle_scaled)
kaggle_labels = label_encoder.inverse_transform(y_kaggle_pred)

# Create Kaggle output. The ids come from kaggle_df, the frame the predictions
# were computed from, so every label sits next to the row that produced it.
kaggle_output = kaggle_df[["user_snippet"]].copy()
kaggle_output["predicted_activity"] = kaggle_labels
kaggle_output.to_csv("/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025/rf+gbm_2predictions.csv", index=False)

print(kaggle_output.head())


              precision    recall  f1-score   support

  Downstairs       0.70      0.28      0.40       174
     Jogging       0.96      0.98      0.97       689
     Sitting       0.69      1.00      0.81        22
    Standing       1.00      0.74      0.85        43
    Upstairs       0.82      0.34      0.49       238
     Walking       0.76      0.99      0.86       768

    accuracy                           0.84      1934
   macro avg       0.82      0.72      0.73      1934
weighted avg       0.84      0.84      0.81      1934

  user_snippet predicted_activity
0       8054_0            Walking
1       8054_1            Sitting
2       8054_2            Walking
3       8054_3           Standing
4       8054_4            Walking


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import iqr, kurtosis, skew

# Load datasets
# Every challenge CSV lives in the same directory; keeping that location in a
# single constant means only this line changes when the data moves.
DATA_DIR = "/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025"

# Training split: snippet metadata (carries the "activity" label) + raw signals
metadata = pd.read_csv(f"{DATA_DIR}/metadata.csv")
signals = pd.read_csv(f"{DATA_DIR}/signals.csv")
# Hold-out split, used further down as the validation set
metadata_test = pd.read_csv(f"{DATA_DIR}/metadata_test.csv")
signals_test = pd.read_csv(f"{DATA_DIR}/signals_test.csv")
# Split that the submission file is produced for
metadata_kaggle = pd.read_csv(f"{DATA_DIR}/metadata_kaggle.csv")
signals_kaggle = pd.read_csv(f"{DATA_DIR}/signals_kaggle.csv")

# Feature engineering
def create_features(df):
    """Summarise raw signal rows into one feature row per ``user_snippet``.

    Parameters
    ----------
    df : pandas.DataFrame
        Signal rows containing the columns ``user_snippet``, ``x-axis``,
        ``y-axis``, ``z-axis`` and ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``user_snippet`` (in sorted group order). For
        each axis it holds mean, std, max, min, median, iqr, kurtosis, skew,
        sum and range, followed by ``timestamp_count`` and the derived columns
        ``mag_mean``, ``total_energy``, ``x_y_ratio``, ``x_z_ratio`` and
        ``y_z_ratio``. An input without rows gives an empty frame.
    """
    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end. Concatenating a one-row frame per snippet copies the whole frame on
    # every iteration (quadratic in the number of snippets).
    records = []

    for snippet, group in df.groupby("user_snippet"):
        row = {'user_snippet': snippet}
        for axis in ['x-axis', 'y-axis', 'z-axis']:
            values = group[axis].values
            row[f'{axis}_mean'] = np.mean(values)
            row[f'{axis}_std'] = np.std(values)  # NumPy default: population std (ddof=0)
            row[f'{axis}_max'] = np.max(values)
            row[f'{axis}_min'] = np.min(values)
            row[f'{axis}_median'] = np.median(values)
            row[f'{axis}_iqr'] = iqr(values)
            row[f'{axis}_kurtosis'] = kurtosis(values)
            row[f'{axis}_skew'] = skew(values)
            row[f'{axis}_sum'] = np.sum(values)
            row[f'{axis}_range'] = np.ptp(values)  # max - min
        row["timestamp_count"] = group["timestamp"].count()  # non-null timestamps

        # Derived features: Euclidean norm of the per-axis means and sums
        row["mag_mean"] = np.sqrt(
            row['x-axis_mean']**2 +
            row['y-axis_mean']**2 +
            row['z-axis_mean']**2
        )
        row["total_energy"] = np.sqrt(
            row['x-axis_sum']**2 +
            row['y-axis_sum']**2 +
            row['z-axis_sum']**2
        )
        # The small offset keeps an exactly-zero denominator from dividing by zero
        row["x_y_ratio"] = row['x-axis_mean'] / (row['y-axis_mean'] + 1e-5)
        row["x_z_ratio"] = row['x-axis_mean'] / (row['z-axis_mean'] + 1e-5)
        row["y_z_ratio"] = row['y-axis_mean'] / (row['z-axis_mean'] + 1e-5)

        records.append(row)

    return pd.DataFrame(records)

# Create features
train_features = create_features(signals)
val_features = create_features(signals_test)
kaggle_features = create_features(signals_kaggle)

# Merge with metadata (inner join on the snippet id: only snippets present on
# both sides survive)
train_df = metadata.merge(train_features, on="user_snippet")
val_df = metadata_test.merge(val_features, on="user_snippet")
kaggle_df = metadata_kaggle.merge(kaggle_features, on="user_snippet")

# Fail fast: the inner join silently drops Kaggle snippets that have no signal
# rows, which would leave the submission short. Checking here reports the
# problem before the long hyper-parameter search instead of after it.
missing_snippets = len(metadata_kaggle) - len(kaggle_df)
if missing_snippets:
    raise ValueError(
        f"{missing_snippets} row(s) of metadata_kaggle have no signal data in "
        "signals_kaggle; the submission would be incomplete"
    )

# Encode target labels (the encoder is fitted on the training labels only)
label_encoder = LabelEncoder()
train_df["activity_encoded"] = label_encoder.fit_transform(train_df["activity"])
val_df["activity_encoded"] = label_encoder.transform(val_df["activity"])

# Split features and targets: everything except the id and label columns
X_train = train_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_train = train_df["activity_encoded"]
X_val = val_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_val = val_df["activity_encoded"]
X_kaggle = kaggle_df.drop(columns=["user_snippet"])

# Scale features (mean/variance are estimated on the training split only and
# then re-used for the other two splits)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_kaggle_scaled = scaler.transform(X_kaggle)

# Base models (hyper-parameters are filled in by the search below)
rf = RandomForestClassifier(random_state=42)
gbm = GradientBoostingClassifier(random_state=42)

# Meta-model
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# Stacking classifier (without fitting yet); passthrough=True feeds the
# original features to the meta-model alongside the base-model predictions
stack = StackingClassifier(
    estimators=[('rf', rf), ('gbm', gbm)],
    final_estimator=meta_model,
    passthrough=True,
    n_jobs=-1
)

# Parameter grid for tuning; the "<name>__" prefixes route each entry to the
# matching estimator inside the stack
param_grid = {
    'rf__n_estimators': [10, 50, 100, 200],
    'rf__max_depth': [3, 5, 10, None],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'gbm__n_estimators': [100, 200, 300],
    'gbm__learning_rate': [0.01, 0.05, 0.1],
    'gbm__max_depth': [3, 5, 7],
    'final_estimator__C': [0.01, 0.1, 1, 10],
    'final_estimator__penalty': ['l2']
}

# Randomized search: 30 sampled settings x 3 folds = 90 fits
random_search = RandomizedSearchCV(
    stack,
    param_distributions=param_grid,
    n_iter=30,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit search
random_search.fit(X_train_scaled, y_train)

# Evaluate
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

# Predict on Kaggle
y_kaggle_pred = best_model.predict(X_kaggle_scaled)
kaggle_labels = label_encoder.inverse_transform(y_kaggle_pred)

# Save submission. The ids come from kaggle_df, the frame the predictions were
# computed from, so every label sits next to the row that produced it.
submission = kaggle_df[["user_snippet"]].copy()
submission["predicted_activity"] = kaggle_labels
submission.to_csv("/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025/rf+gbm_3_predictions.csv", index=False)

print(submission.head())


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END final_estimator__C=1, final_estimator__penalty=l2, gbm__learning_rate=0.01, gbm__max_depth=3, gbm__n_estimators=100, rf__max_depth=3, rf__max_features=sqrt, rf__min_samples_leaf=2, rf__n_estimators=200; total time= 3.0min
[CV] END final_estimator__C=1, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=5, gbm__n_estimators=100, rf__max_depth=10, rf__max_features=log2, rf__min_samples_leaf=4, rf__n_estimators=100; total time= 4.8min
[CV] END final_estimator__C=1, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=5, gbm__n_estimators=100, rf__max_depth=10, rf__max_features=log2, rf__min_samples_leaf=4, rf__n_estimators=100; total time= 4.9min
[CV] END final_estimator__C=1, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=5, gbm__n_estimators=100, rf__max_depth=10, rf__max_features=log2, rf__min_samples_leaf=4, rf__n_estimators=100; total time= 4.9min
[CV] END final_

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import iqr, kurtosis, skew

# Load data
# Every challenge CSV lives in the same directory; keeping that location in a
# single constant means only this line changes when the data moves.
DATA_DIR = "/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025"

# Training split: snippet metadata (carries the "activity" label) + raw signals
metadata = pd.read_csv(f"{DATA_DIR}/metadata.csv")
signals = pd.read_csv(f"{DATA_DIR}/signals.csv")
# Hold-out split, used further down as the validation set
metadata_test = pd.read_csv(f"{DATA_DIR}/metadata_test.csv")
signals_test = pd.read_csv(f"{DATA_DIR}/signals_test.csv")
# Split that the submission file is produced for
metadata_kaggle = pd.read_csv(f"{DATA_DIR}/metadata_kaggle.csv")
signals_kaggle = pd.read_csv(f"{DATA_DIR}/signals_kaggle.csv")

# Feature engineering
def create_features(df):
    """Summarise raw signal rows into one feature row per ``user_snippet``.

    Parameters
    ----------
    df : pandas.DataFrame
        Signal rows containing the columns ``user_snippet``, ``x-axis``,
        ``y-axis``, ``z-axis`` and ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``user_snippet`` (in sorted group order). For
        each axis it holds mean, std, max, min, median, iqr, kurtosis, skew,
        sum and range, followed by ``timestamp_count`` and the derived columns
        ``mag_mean``, ``total_energy``, ``x_y_ratio``, ``x_z_ratio`` and
        ``y_z_ratio``. An input without rows gives an empty frame.
    """
    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end. Concatenating a one-row frame per snippet copies the whole frame on
    # every iteration (quadratic in the number of snippets).
    records = []

    for snippet, group in df.groupby("user_snippet"):
        row = {'user_snippet': snippet}
        for axis in ['x-axis', 'y-axis', 'z-axis']:
            values = group[axis].values
            row[f'{axis}_mean'] = np.mean(values)
            row[f'{axis}_std'] = np.std(values)  # NumPy default: population std (ddof=0)
            row[f'{axis}_max'] = np.max(values)
            row[f'{axis}_min'] = np.min(values)
            row[f'{axis}_median'] = np.median(values)
            row[f'{axis}_iqr'] = iqr(values)
            row[f'{axis}_kurtosis'] = kurtosis(values)
            row[f'{axis}_skew'] = skew(values)
            row[f'{axis}_sum'] = np.sum(values)
            row[f'{axis}_range'] = np.ptp(values)  # max - min
        row["timestamp_count"] = group["timestamp"].count()  # non-null timestamps

        # Derived features: Euclidean norm of the per-axis means and sums
        row["mag_mean"] = np.sqrt(
            row['x-axis_mean']**2 + row['y-axis_mean']**2 + row['z-axis_mean']**2
        )
        row["total_energy"] = np.sqrt(
            row['x-axis_sum']**2 + row['y-axis_sum']**2 + row['z-axis_sum']**2
        )
        # The small offset keeps an exactly-zero denominator from dividing by zero
        row["x_y_ratio"] = row['x-axis_mean'] / (row['y-axis_mean'] + 1e-5)
        row["x_z_ratio"] = row['x-axis_mean'] / (row['z-axis_mean'] + 1e-5)
        row["y_z_ratio"] = row['y-axis_mean'] / (row['z-axis_mean'] + 1e-5)

        records.append(row)

    return pd.DataFrame(records)

# Generate features
train_features = create_features(signals)
val_features = create_features(signals_test)
kaggle_features = create_features(signals_kaggle)

# Merge with metadata (inner join on the snippet id: only snippets present on
# both sides survive)
train_df = metadata.merge(train_features, on="user_snippet")
val_df = metadata_test.merge(val_features, on="user_snippet")
kaggle_df = metadata_kaggle.merge(kaggle_features, on="user_snippet")

# Fail fast: the inner join silently drops Kaggle snippets that have no signal
# rows, which would leave the submission short. Checking here reports the
# problem before the long hyper-parameter search instead of after it.
missing_snippets = len(metadata_kaggle) - len(kaggle_df)
if missing_snippets:
    raise ValueError(
        f"{missing_snippets} row(s) of metadata_kaggle have no signal data in "
        "signals_kaggle; the submission would be incomplete"
    )

# Encode labels (the encoder is fitted on the training labels only)
label_encoder = LabelEncoder()
train_df["activity_encoded"] = label_encoder.fit_transform(train_df["activity"])
val_df["activity_encoded"] = label_encoder.transform(val_df["activity"])

# Extract features and targets: everything except the id and label columns
X_train = train_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_train = train_df["activity_encoded"]
X_val = val_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_val = val_df["activity_encoded"]
X_kaggle = kaggle_df.drop(columns=["user_snippet"])

# Scale features. Only the scaled training matrix is consumed (by the selector
# below); the other two *_full matrices are not used again in this cell.
scaler = StandardScaler()
X_train_scaled_full = scaler.fit_transform(X_train)
X_val_scaled_full = scaler.transform(X_val)
X_kaggle_scaled_full = scaler.transform(X_kaggle)

# === FEATURE SELECTION ===
# Rank features by the impurity-based importances of a forest fitted on the
# whole training split
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X_train_scaled_full, y_train)

# Select top N features
# NOTE(review): create_features yields 36 feature columns (3 axes x 10
# statistics + 1 count + 5 derived), so top_n = 40 keeps every one of them
# unless the metadata files contribute further columns - confirm that this
# step is meant to drop anything.
top_n = 40
importances = rf_selector.feature_importances_
indices = np.argsort(importances)[::-1][:top_n]  # most important first
selected_columns = X_train.columns[indices]

# Reduce to selected features
X_train_selected = X_train[selected_columns]
X_val_selected = X_val[selected_columns]
X_kaggle_selected = X_kaggle[selected_columns]

# Re-scale selected features (the same scaler object is re-fitted on the
# reduced training matrix)
X_train_scaled = scaler.fit_transform(X_train_selected)
X_val_scaled = scaler.transform(X_val_selected)
X_kaggle_scaled = scaler.transform(X_kaggle_selected)

# Base models (hyper-parameters are filled in by the search below)
rf = RandomForestClassifier(random_state=42)
gbm = GradientBoostingClassifier(random_state=42)

# Meta model
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# Define stacking model; passthrough=True feeds the original features to the
# meta-model alongside the base-model predictions
stack = StackingClassifier(
    estimators=[('rf', rf), ('gbm', gbm)],
    final_estimator=meta_model,
    passthrough=True,
    n_jobs=-1
)

# Random search grid; the "<name>__" prefixes route each entry to the matching
# estimator inside the stack
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [5, 10, None],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'gbm__n_estimators': [100, 200, 300],
    'gbm__learning_rate': [0.01, 0.05, 0.1],
    'gbm__max_depth': [3, 5, 7],
    'final_estimator__C': [0.01, 0.1, 1, 10],
    'final_estimator__penalty': ['l2']
}

# Run randomized search: 15 sampled settings x 3 folds = 45 fits
random_search = RandomizedSearchCV(
    stack,
    param_distributions=param_grid,
    n_iter=15,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Train
random_search.fit(X_train_scaled, y_train)

# Evaluate
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

# Predict Kaggle
y_kaggle_pred = best_model.predict(X_kaggle_scaled)
kaggle_labels = label_encoder.inverse_transform(y_kaggle_pred)

# Save results. The ids come from kaggle_df, the frame the predictions were
# computed from, so every label sits next to the row that produced it.
submission_path = "/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025/rf+gbm2_selected_features_predictions.csv"
submission = kaggle_df[["user_snippet"]].copy()
submission["predicted_activity"] = kaggle_labels
submission.to_csv(submission_path, index=False)

# Report the file that was actually written (the message used to name a
# different, non-existent file)
print(f"Saved predictions to '{submission_path}'")


Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=3, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=200; total time= 2.1min
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=3, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=200; total time= 2.1min
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=3, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=200; total time= 2.1min
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.05, gbm__max_depth=7, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=50; total time= 4.6min
[CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import iqr, kurtosis, skew

# Load data
# Every challenge CSV lives in the same directory; keeping that location in a
# single constant means only this line changes when the data moves.
DATA_DIR = "/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025"

# Training split: snippet metadata (carries the "activity" label) + raw signals
metadata = pd.read_csv(f"{DATA_DIR}/metadata.csv")
signals = pd.read_csv(f"{DATA_DIR}/signals.csv")
# Hold-out split, used further down as the validation set
metadata_test = pd.read_csv(f"{DATA_DIR}/metadata_test.csv")
signals_test = pd.read_csv(f"{DATA_DIR}/signals_test.csv")
# Split that the submission file is produced for
metadata_kaggle = pd.read_csv(f"{DATA_DIR}/metadata_kaggle.csv")
signals_kaggle = pd.read_csv(f"{DATA_DIR}/signals_kaggle.csv")

# Feature engineering
def create_features(df):
    """Summarise raw signal rows into one feature row per ``user_snippet``.

    Parameters
    ----------
    df : pandas.DataFrame
        Signal rows containing the columns ``user_snippet``, ``x-axis``,
        ``y-axis``, ``z-axis`` and ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``user_snippet`` (in sorted group order). For
        each axis it holds mean, std, max, min, median, iqr, kurtosis, skew,
        sum and range, followed by ``timestamp_count`` and the derived columns
        ``mag_mean``, ``total_energy``, ``x_y_ratio``, ``x_z_ratio`` and
        ``y_z_ratio``. An input without rows gives an empty frame.
    """
    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end. Concatenating a one-row frame per snippet copies the whole frame on
    # every iteration (quadratic in the number of snippets).
    records = []

    for snippet, group in df.groupby("user_snippet"):
        row = {'user_snippet': snippet}
        for axis in ['x-axis', 'y-axis', 'z-axis']:
            values = group[axis].values
            row[f'{axis}_mean'] = np.mean(values)
            row[f'{axis}_std'] = np.std(values)  # NumPy default: population std (ddof=0)
            row[f'{axis}_max'] = np.max(values)
            row[f'{axis}_min'] = np.min(values)
            row[f'{axis}_median'] = np.median(values)
            row[f'{axis}_iqr'] = iqr(values)
            row[f'{axis}_kurtosis'] = kurtosis(values)
            row[f'{axis}_skew'] = skew(values)
            row[f'{axis}_sum'] = np.sum(values)
            row[f'{axis}_range'] = np.ptp(values)  # max - min
        row["timestamp_count"] = group["timestamp"].count()  # non-null timestamps

        # Derived features: Euclidean norm of the per-axis means and sums
        row["mag_mean"] = np.sqrt(
            row['x-axis_mean']**2 + row['y-axis_mean']**2 + row['z-axis_mean']**2
        )
        row["total_energy"] = np.sqrt(
            row['x-axis_sum']**2 + row['y-axis_sum']**2 + row['z-axis_sum']**2
        )
        # The small offset keeps an exactly-zero denominator from dividing by zero
        row["x_y_ratio"] = row['x-axis_mean'] / (row['y-axis_mean'] + 1e-5)
        row["x_z_ratio"] = row['x-axis_mean'] / (row['z-axis_mean'] + 1e-5)
        row["y_z_ratio"] = row['y-axis_mean'] / (row['z-axis_mean'] + 1e-5)

        records.append(row)

    return pd.DataFrame(records)

# Generate features
train_features = create_features(signals)
val_features = create_features(signals_test)
kaggle_features = create_features(signals_kaggle)

# Merge with metadata (inner join on the snippet id: only snippets present on
# both sides survive)
train_df = metadata.merge(train_features, on="user_snippet")
val_df = metadata_test.merge(val_features, on="user_snippet")
kaggle_df = metadata_kaggle.merge(kaggle_features, on="user_snippet")

# Fail fast: the inner join silently drops Kaggle snippets that have no signal
# rows, which would leave the submission short. Checking here reports the
# problem before the long hyper-parameter search instead of after it.
missing_snippets = len(metadata_kaggle) - len(kaggle_df)
if missing_snippets:
    raise ValueError(
        f"{missing_snippets} row(s) of metadata_kaggle have no signal data in "
        "signals_kaggle; the submission would be incomplete"
    )

# Encode labels (the encoder is fitted on the training labels only)
label_encoder = LabelEncoder()
train_df["activity_encoded"] = label_encoder.fit_transform(train_df["activity"])
val_df["activity_encoded"] = label_encoder.transform(val_df["activity"])

# Extract features and targets: everything except the id and label columns
X_train = train_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_train = train_df["activity_encoded"]
X_val = val_df.drop(columns=["user_snippet", "activity", "activity_encoded"])
y_val = val_df["activity_encoded"]
X_kaggle = kaggle_df.drop(columns=["user_snippet"])

# Scale features. Only the scaled training matrix is consumed (by the selector
# below); the other two *_full matrices are not used again in this cell.
scaler = StandardScaler()
X_train_scaled_full = scaler.fit_transform(X_train)
X_val_scaled_full = scaler.transform(X_val)
X_kaggle_scaled_full = scaler.transform(X_kaggle)

# === FEATURE SELECTION ===
# Rank features by the impurity-based importances of a forest fitted on the
# whole training split
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X_train_scaled_full, y_train)

# Select top N features
top_n = 30
importances = rf_selector.feature_importances_
indices = np.argsort(importances)[::-1][:top_n]  # most important first
selected_columns = X_train.columns[indices]

# Reduce to selected features
X_train_selected = X_train[selected_columns]
X_val_selected = X_val[selected_columns]
X_kaggle_selected = X_kaggle[selected_columns]

# Re-scale selected features (the same scaler object is re-fitted on the
# reduced training matrix)
X_train_scaled = scaler.fit_transform(X_train_selected)
X_val_scaled = scaler.transform(X_val_selected)
X_kaggle_scaled = scaler.transform(X_kaggle_selected)

# Base models (hyper-parameters are filled in by the search below)
rf = RandomForestClassifier(random_state=42)
gbm = GradientBoostingClassifier(random_state=42)

# Meta model
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# Define stacking model; passthrough=True feeds the original features to the
# meta-model alongside the base-model predictions
stack = StackingClassifier(
    estimators=[('rf', rf), ('gbm', gbm)],
    final_estimator=meta_model,
    passthrough=True,
    n_jobs=-1
)

# Random search grid; the "<name>__" prefixes route each entry to the matching
# estimator inside the stack
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [5, 10, None],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'gbm__n_estimators': [100, 200, 300],
    'gbm__learning_rate': [0.01, 0.05, 0.1],
    'gbm__max_depth': [3, 5, 7],
    'final_estimator__C': [0.01, 0.1, 1, 10],
    'final_estimator__penalty': ['l2']
}

# Run randomized search: 15 sampled settings x 3 folds = 45 fits
random_search = RandomizedSearchCV(
    stack,
    param_distributions=param_grid,
    n_iter=15,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Train
random_search.fit(X_train_scaled, y_train)

# Evaluate
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

# Predict Kaggle
y_kaggle_pred = best_model.predict(X_kaggle_scaled)
kaggle_labels = label_encoder.inverse_transform(y_kaggle_pred)

# Save results. The ids come from kaggle_df, the frame the predictions were
# computed from, so every label sits next to the row that produced it.
submission_path = "/Users/dawidnawrocki/Documents/Machine Learning/7021datsci-challenge-2025/rf+gbm3_selected_features_predictions.csv"
submission = kaggle_df[["user_snippet"]].copy()
submission["predicted_activity"] = kaggle_labels
submission.to_csv(submission_path, index=False)

# Report the file that was actually written (the message used to name a
# different, non-existent file)
print(f"Saved predictions to '{submission_path}'")


Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=3, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=200; total time= 1.6min
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=3, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=200; total time= 1.6min
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.1, gbm__max_depth=3, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=200; total time= 1.6min
[CV] END final_estimator__C=10, final_estimator__penalty=l2, gbm__learning_rate=0.05, gbm__max_depth=7, gbm__n_estimators=100, rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_leaf=4, rf__n_estimators=50; total time= 3.4min
[CV