In [None]:
# Mount Google Drive at /content/drive; the CSV paths read below live under
# that mount point. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd

from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Load data

In [None]:
# Tabular data: read the train and test CSV files from the mounted Drive folder.
train_csv_path = '/content/drive/My Drive/child-mind-institute-problematic-internet-use/train.csv'
test_csv_path = '/content/drive/My Drive/child-mind-institute-problematic-internet-use/test.csv'

df_train_csv = pd.read_csv(train_csv_path)
df_test_csv = pd.read_csv(test_csv_path)

In [None]:
# accelerometer (actigraphy) series


# Data Cleaning
1. Tabular Data
2. Accelerometer (actigraphy) Series

## Tabular Data

1. Train

  1. Drop every row whose `sii` value is NA.

  2. Keep only the columns that also exist in the test dataset (plus the target `sii`), so that the train and test datasets share the same features.

  3. Drop the redundant BMI-related columns (`BIA-BIA_BMI`, `Physical-Weight`, `Physical-Height`) from the train dataset.
  
  4. Combine `Fitness_Endurance-Time_Mins` and `Fitness_Endurance-Time_Sec` into a single column, `Fitness_Endurance-Time` (in minutes), to simplify model training.

  5. Drop all season-related columns. They are only weakly correlated with the target variable and contain many missing values that cannot be imputed, so we decided to remove them.

2. Test

  1. Drop the redundant BMI-related columns (`BIA-BIA_BMI`, `Physical-Weight`, `Physical-Height`) from the test dataset.
  
  2. Combine `Fitness_Endurance-Time_Mins` and `Fitness_Endurance-Time_Sec` into a single column, `Fitness_Endurance-Time` (in minutes), to simplify model training.

  3. Drop all season-related columns. They are only weakly correlated with the target variable and contain many missing values that cannot be imputed, so we decided to remove them.

### Train

In [None]:
# `sii` is the training target, so keep only the rows where it is present.
has_sii_label = df_train_csv['sii'].notna()
df_train_csv_filter = df_train_csv[has_sii_label]

In [None]:
# Align the feature sets: keep only the columns present in BOTH frames, plus
# the target `sii` on the train side. Sorting gives both frames the same
# deterministic column order.
train_column_set = set(df_train_csv_filter.columns)
test_column_set = set(df_test_csv.columns)
common_columns = list(train_column_set & test_column_set)

common_columns_train = common_columns + ['sii']

train_column_order = sorted(common_columns_train)
test_column_order = sorted(common_columns)

df_train_csv_filter = df_train_csv_filter[train_column_order]
df_test_csv = df_test_csv[test_column_order]

In [None]:
# Feature aggregation: drop the BMI / weight / height columns listed below.
body_measure_cols = ['BIA-BIA_BMI', 'Physical-Weight', 'Physical-Height']
df_train_csv_filter = df_train_csv_filter.drop(columns=body_measure_cols)

# Collapse the minutes + seconds pair into one duration column, in minutes.
endurance_minutes = df_train_csv_filter['Fitness_Endurance-Time_Mins']
endurance_seconds = df_train_csv_filter['Fitness_Endurance-Time_Sec']
df_train_csv_filter['Fitness_Endurance-Time'] = endurance_minutes + endurance_seconds / 60
df_train_csv_filter = df_train_csv_filter.drop(
    columns=['Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec']
)

# Remove every column whose name contains "Season".
season_columns = [name for name in df_train_csv_filter.columns if "Season" in name]
df_train_csv_filter = df_train_csv_filter.drop(columns=season_columns)

### Test

In [None]:
# Feature aggregation: drop the BMI / weight / height columns listed below
# (same cleaning steps as applied to the train frame).
body_measure_cols = ['BIA-BIA_BMI', 'Physical-Weight', 'Physical-Height']
df_test_csv = df_test_csv.drop(columns=body_measure_cols)

# Collapse the minutes + seconds pair into one duration column, in minutes.
endurance_minutes = df_test_csv['Fitness_Endurance-Time_Mins']
endurance_seconds = df_test_csv['Fitness_Endurance-Time_Sec']
df_test_csv['Fitness_Endurance-Time'] = endurance_minutes + endurance_seconds / 60
df_test_csv = df_test_csv.drop(
    columns=['Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec']
)

# Remove every column whose name contains "Season".
season_columns = [name for name in df_test_csv.columns if "Season" in name]
df_test_csv = df_test_csv.drop(columns=season_columns)

## Accelerometer (actigraphy) Series

# Final Data Version

Finally, we merged the two types of data (tabular and actigraphy) together. After consideration, we created three versions of the merged data and used each of them in the model training process.

1.

2.

3.

In [None]:
# Join the cleaned tabular frames with each actigraphy-derived frame on `id`,
# producing three train/test dataset versions (features, stat, time_series).
# NOTE(review): the *_features / *_stat / *_time_series frames are not defined
# in the visible part of the notebook — they must be created in the
# "Accelerometer (actigraphy) Series" section before this cell runs.
# NOTE(review): `merge` defaults to an inner join, so any `id` without an
# actigraphy row is dropped from the result — confirm this is intended,
# especially for the test frames.
df_train_raw_features = df_train_csv_filter.merge(train_features, on='id')
df_train_raw_stat = df_train_csv_filter.merge(train_stat, on='id')  # was `trian_stat` (typo -> NameError)
df_train_raw_time_series = df_train_csv_filter.merge(train_time_series, on='id')

df_test_raw_features = df_test_csv.merge(test_features, on='id')
df_test_raw_stat = df_test_csv.merge(test_stat, on='id')
df_test_raw_time_series = df_test_csv.merge(test_time_series, on='id')

In [None]:
# df_train_raw_features.to_csv('/content/drive/My Drive/child-mind/Archive/df_train_raw_features.csv')
# df_train_raw_stat.to_csv('/content/drive/My Drive/child-mind/Archive/df_train_raw_stat.csv')
# df_train_raw_time_series.to_csv('/content/drive/My Drive/child-mind/Archive/df_train_raw_time_series.csv')
# df_test_raw_features.to_csv('/content/drive/My Drive/child-mind/Archive/df_test_raw_features.csv')
# df_test_raw_stat.to_csv('/content/drive/My Drive/child-mind/Archive/df_test_raw_stat.csv')
# df_test_raw_time_series.to_csv('/content/drive/My Drive/child-mind/Archive/df_test_raw_time_series.csv')

# Model Training

For each version of the data, we test the following models:
1. Boosting
  1. XGBoost

  2. LightGBM

  3. CatBoost
2. Stacking -> XGBoost + CatBoost

## df_train_raw_features

### Boosting

In [None]:
# Features and target.
# 'Unnamed: 0' / 'Unnamed: 0.1' are index columns that only exist when the
# frame was round-tripped through CSV; errors='ignore' lets this cell also run
# on the freshly merged in-memory frame, where they are absent.
non_feature_columns = ['id', 'Unnamed: 0', 'sii', 'Unnamed: 0.1']
X = df_train_raw_features.drop(columns=non_feature_columns, errors='ignore')
y = df_train_raw_features['sii']

# Stratified 80/20 hold-out split with a fixed seed so the class proportions
# of `sii` are the same in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Balanced class weights computed from the training labels. LightGBM and
# CatBoost receive them at construction; XGBoost gets them as per-row sample
# weights when it is fitted.
label_values = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    'balanced', classes=label_values, y=y_train
)
class_weights_dict = {label: weight for label, weight in zip(label_values, class_weights)}

# XGBoost
xgb_params = dict(
    objective='multi:softprob',
    num_class=4,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_clf = XGBClassifier(**xgb_params)

# LightGBM
lgb_params = dict(
    objective='multiclass',
    num_class=4,
    class_weight=class_weights_dict,
    random_state=42,
)
lgb_clf = LGBMClassifier(**lgb_params)

# CatBoost
cat_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    verbose=100,
    class_weights=class_weights_dict,
    random_state=42,
)
cat_clf = CatBoostClassifier(**cat_params)

In [None]:
# Train the models.
# XGBoost was built without class weights, so each training row is given the
# weight of its class and the result is passed as sample_weight.
sample_weights = y_train.map(class_weights_dict)
xgb_eval_sets = [(X_train, y_train), (X_test, y_test)]
xgb_clf.fit(
    X_train,
    y_train,
    sample_weight=sample_weights,
    eval_set=xgb_eval_sets,
    verbose=True,
)

# LightGBM and CatBoost already carry the class weights from construction.
lgb_clf.fit(X_train, y_train)
cat_clf.fit(X_train, y_train, verbose=False)

In [None]:
# Evaluation metrics: for each fitted booster, predict on the hold-out split
# and print its accuracy followed by the per-class classification report.

# XGBoost
y_pred_xgb = xgb_clf.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
report_xgb = classification_report(y_test, y_pred_xgb)
print("Classification Report:\n", report_xgb)

# LightGBM
y_pred_lgb = lgb_clf.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb:.4f}")
report_lgb = classification_report(y_test, y_pred_lgb)
print("Classification Report:\n", report_lgb)

# CatBoost
y_pred_cat = cat_clf.predict(X_test)
accuracy_cat = accuracy_score(y_test, y_pred_cat)
print(f"CatBoost Accuracy: {accuracy_cat:.4f}")
report_cat = classification_report(y_test, y_pred_cat)
print("Classification Report:\n", report_cat)

In [None]:
# # Hyperparameter Tuning

# # XGBoost

# # Define parameter grid
# xgb_param_grid = {
#     'n_estimators': [100, 300, 500],
#     'max_depth': [3, 6, 9],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0]
# }

# # Initialize GridSearchCV
# xgb_grid = GridSearchCV(
#     estimator=xgb_clf,
#     param_grid=xgb_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# # LightGBM
# lgb_param_grid = {
#     'n_estimators': [100, 300, 500],
#     'num_leaves': [31, 50, 100],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0]
# }


# lgb_grid = GridSearchCV(
#     estimator=lgb_clf,
#     param_grid=lgb_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# lgb_grid.fit(X_train, y_train)

# print("LightGBM Best Parameters:", lgb_grid.best_params_)
# print("LightGBM Best CV Accuracy:", lgb_grid.best_score_)

# y_pred_lgb_best = lgb_grid.best_estimator_.predict(X_test)
# accuracy_lgb_best = accuracy_score(y_test, y_pred_lgb_best)
# print(f"LightGBM Best Accuracy: {accuracy_lgb_best:.4f}")
# print("Classification Report:\n", classification_report(y_test, y_pred_lgb_best))

# # CatBoost

# cat_param_grid = {
#     'depth': [4, 6, 8],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'iterations': [500, 1000],
#     'l2_leaf_reg': [1, 3, 5]
# }

# cat_grid = GridSearchCV(
#     estimator=cat_clf,
#     param_grid=cat_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# cat_grid.fit(X_train, y_train)

# print("CatBoost Best Parameters:", cat_grid.best_params_)
# print("CatBoost Best CV Accuracy:", cat_grid.best_score_)

# y_pred_cat_best = cat_grid.best_estimator_.predict(X_test)
# accuracy_cat_best = accuracy_score(y_test, y_pred_cat_best)
# print(f"CatBoost Best Accuracy: {accuracy_cat_best:.4f}")
# print("Classification Report:\n", classification_report(y_test, y_pred_cat_best))

### Stacking

In [None]:
# Keep only the columns whose fraction of missing values is below
# (1 - threshold), i.e. roughly below 30% missing with threshold = 0.7.
# NOTE(review): X_train / X_test were split from this frame in the Boosting
# section *before* this filter runs, so the stacked model below still sees the
# unfiltered feature set — confirm whether the filter was meant to apply to X.
threshold = 0.7
cols_before = df_train_raw_features.shape[1]
df_train_raw_features = df_train_raw_features.loc[:, df_train_raw_features.isnull().mean() < (1 - threshold)]
cols_after = df_train_raw_features.shape[1]

# Meta-learner fitted on the base models' cross-validated predictions.
# `multi_class` is deprecated since scikit-learn 1.5; with the lbfgs solver the
# default already fits a multinomial model on a multiclass target, so the
# argument is omitted.
meta_clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

# Base learners are the xgb_clf / cat_clf objects configured in the Boosting
# section above.
base_estimators = [
    ('xgb', xgb_clf),
    ('cat', cat_clf)
]

# passthrough=False: the meta-learner only receives the base models'
# predictions, not the raw features.
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_clf,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    passthrough=False,
    n_jobs=-1
)

# NOTE(review): unlike the standalone XGBoost fit, no sample_weight is passed
# here, so the XGBoost base learner is trained unweighted inside the stack —
# confirm this is intended.
stacking_clf.fit(X_train, y_train)

y_pred_stacking = stacking_clf.predict(X_test)

# `classes` was previously undefined (NameError). Take the labels from the
# fitted model and pass them as `labels` too, so `target_names` always has a
# matching length even if a class is absent from y_test.
classes = stacking_clf.classes_
print(
    "stacking classification reporting:\n",
    classification_report(
        y_test,
        y_pred_stacking,
        labels=classes,
        target_names=[f"类别{cls}" for cls in classes]
    )
)


## df_train_raw_stat

### Boosting

In [None]:
# Features and target.
# 'Unnamed: 0' / 'Unnamed: 0.1' are index columns that only exist when the
# frame was round-tripped through CSV; errors='ignore' lets this cell also run
# on the freshly merged in-memory frame, where they are absent.
non_feature_columns = ['id', 'Unnamed: 0', 'sii', 'Unnamed: 0.1']
X = df_train_raw_stat.drop(columns=non_feature_columns, errors='ignore')
y = df_train_raw_stat['sii']

# Stratified 80/20 hold-out split with a fixed seed so the class proportions
# of `sii` are the same in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Balanced class weights computed from the training labels. LightGBM and
# CatBoost receive them at construction; XGBoost gets them as per-row sample
# weights when it is fitted.
label_values = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    'balanced', classes=label_values, y=y_train
)
class_weights_dict = {label: weight for label, weight in zip(label_values, class_weights)}

# XGBoost
xgb_params = dict(
    objective='multi:softprob',
    num_class=4,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_clf = XGBClassifier(**xgb_params)

# LightGBM
lgb_params = dict(
    objective='multiclass',
    num_class=4,
    class_weight=class_weights_dict,
    random_state=42,
)
lgb_clf = LGBMClassifier(**lgb_params)

# CatBoost
cat_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    verbose=100,
    class_weights=class_weights_dict,
    random_state=42,
)
cat_clf = CatBoostClassifier(**cat_params)

In [None]:
# Train the models.
# XGBoost was built without class weights, so each training row is given the
# weight of its class and the result is passed as sample_weight.
sample_weights = y_train.map(class_weights_dict)
xgb_eval_sets = [(X_train, y_train), (X_test, y_test)]
xgb_clf.fit(
    X_train,
    y_train,
    sample_weight=sample_weights,
    eval_set=xgb_eval_sets,
    verbose=True,
)

# LightGBM and CatBoost already carry the class weights from construction.
lgb_clf.fit(X_train, y_train)
cat_clf.fit(X_train, y_train, verbose=False)

In [None]:
# Evaluation metrics: for each fitted booster, predict on the hold-out split
# and print its accuracy followed by the per-class classification report.

# XGBoost
y_pred_xgb = xgb_clf.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
report_xgb = classification_report(y_test, y_pred_xgb)
print("Classification Report:\n", report_xgb)

# LightGBM
y_pred_lgb = lgb_clf.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb:.4f}")
report_lgb = classification_report(y_test, y_pred_lgb)
print("Classification Report:\n", report_lgb)

# CatBoost
y_pred_cat = cat_clf.predict(X_test)
accuracy_cat = accuracy_score(y_test, y_pred_cat)
print(f"CatBoost Accuracy: {accuracy_cat:.4f}")
report_cat = classification_report(y_test, y_pred_cat)
print("Classification Report:\n", report_cat)

In [None]:
# # Hyperparameter Tuning

# # XGBoost

# # Define parameter grid
# xgb_param_grid = {
#     'n_estimators': [100, 300, 500],
#     'max_depth': [3, 6, 9],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0]
# }

# # Initialize GridSearchCV
# xgb_grid = GridSearchCV(
#     estimator=xgb_clf,
#     param_grid=xgb_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# # LightGBM
# lgb_param_grid = {
#     'n_estimators': [100, 300, 500],
#     'num_leaves': [31, 50, 100],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0]
# }


# lgb_grid = GridSearchCV(
#     estimator=lgb_clf,
#     param_grid=lgb_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# lgb_grid.fit(X_train, y_train)

# print("LightGBM Best Parameters:", lgb_grid.best_params_)
# print("LightGBM Best CV Accuracy:", lgb_grid.best_score_)

# y_pred_lgb_best = lgb_grid.best_estimator_.predict(X_test)
# accuracy_lgb_best = accuracy_score(y_test, y_pred_lgb_best)
# print(f"LightGBM Best Accuracy: {accuracy_lgb_best:.4f}")
# print("Classification Report:\n", classification_report(y_test, y_pred_lgb_best))

# # CatBoost

# cat_param_grid = {
#     'depth': [4, 6, 8],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'iterations': [500, 1000],
#     'l2_leaf_reg': [1, 3, 5]
# }

# cat_grid = GridSearchCV(
#     estimator=cat_clf,
#     param_grid=cat_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# cat_grid.fit(X_train, y_train)

# print("CatBoost Best Parameters:", cat_grid.best_params_)
# print("CatBoost Best CV Accuracy:", cat_grid.best_score_)

# y_pred_cat_best = cat_grid.best_estimator_.predict(X_test)
# accuracy_cat_best = accuracy_score(y_test, y_pred_cat_best)
# print(f"CatBoost Best Accuracy: {accuracy_cat_best:.4f}")
# print("Classification Report:\n", classification_report(y_test, y_pred_cat_best))

### Stacking

In [None]:
# Keep only the columns whose fraction of missing values is below
# (1 - threshold), i.e. roughly below 30% missing with threshold = 0.7.
# NOTE(review): X_train / X_test were split from this frame in the Boosting
# section *before* this filter runs, so the stacked model below still sees the
# unfiltered feature set — confirm whether the filter was meant to apply to X.
threshold = 0.7
cols_before = df_train_raw_stat.shape[1]
df_train_raw_stat = df_train_raw_stat.loc[:, df_train_raw_stat.isnull().mean() < (1 - threshold)]
cols_after = df_train_raw_stat.shape[1]

# Meta-learner fitted on the base models' cross-validated predictions.
# `multi_class` is deprecated since scikit-learn 1.5; with the lbfgs solver the
# default already fits a multinomial model on a multiclass target, so the
# argument is omitted.
meta_clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

# Base learners are the xgb_clf / cat_clf objects configured in the Boosting
# section above.
base_estimators = [
    ('xgb', xgb_clf),
    ('cat', cat_clf)
]

# passthrough=False: the meta-learner only receives the base models'
# predictions, not the raw features.
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_clf,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    passthrough=False,
    n_jobs=-1
)

# NOTE(review): unlike the standalone XGBoost fit, no sample_weight is passed
# here, so the XGBoost base learner is trained unweighted inside the stack —
# confirm this is intended.
stacking_clf.fit(X_train, y_train)

y_pred_stacking = stacking_clf.predict(X_test)

# `classes` was previously undefined (NameError). Take the labels from the
# fitted model and pass them as `labels` too, so `target_names` always has a
# matching length even if a class is absent from y_test.
classes = stacking_clf.classes_
print(
    "stacking classification reporting:\n",
    classification_report(
        y_test,
        y_pred_stacking,
        labels=classes,
        target_names=[f"类别{cls}" for cls in classes]
    )
)


## df_train_raw_time_series

### Boosting

In [None]:
# Features and target.
# 'Unnamed: 0' / 'Unnamed: 0.1' are index columns that only exist when the
# frame was round-tripped through CSV; errors='ignore' lets this cell also run
# on the freshly merged in-memory frame, where they are absent.
non_feature_columns = ['id', 'Unnamed: 0', 'sii', 'Unnamed: 0.1']
X = df_train_raw_time_series.drop(columns=non_feature_columns, errors='ignore')
y = df_train_raw_time_series['sii']

# Stratified 80/20 hold-out split with a fixed seed so the class proportions
# of `sii` are the same in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Balanced class weights computed from the training labels. LightGBM and
# CatBoost receive them at construction; XGBoost gets them as per-row sample
# weights when it is fitted.
label_values = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    'balanced', classes=label_values, y=y_train
)
class_weights_dict = {label: weight for label, weight in zip(label_values, class_weights)}

# XGBoost
xgb_params = dict(
    objective='multi:softprob',
    num_class=4,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_clf = XGBClassifier(**xgb_params)

# LightGBM
lgb_params = dict(
    objective='multiclass',
    num_class=4,
    class_weight=class_weights_dict,
    random_state=42,
)
lgb_clf = LGBMClassifier(**lgb_params)

# CatBoost
cat_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    verbose=100,
    class_weights=class_weights_dict,
    random_state=42,
)
cat_clf = CatBoostClassifier(**cat_params)

In [None]:
# Train the models.
# XGBoost was built without class weights, so each training row is given the
# weight of its class and the result is passed as sample_weight.
sample_weights = y_train.map(class_weights_dict)
xgb_eval_sets = [(X_train, y_train), (X_test, y_test)]
xgb_clf.fit(
    X_train,
    y_train,
    sample_weight=sample_weights,
    eval_set=xgb_eval_sets,
    verbose=True,
)

# LightGBM and CatBoost already carry the class weights from construction.
lgb_clf.fit(X_train, y_train)
cat_clf.fit(X_train, y_train, verbose=False)

In [None]:
# Evaluation metrics: for each fitted booster, predict on the hold-out split
# and print its accuracy followed by the per-class classification report.

# XGBoost
y_pred_xgb = xgb_clf.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
report_xgb = classification_report(y_test, y_pred_xgb)
print("Classification Report:\n", report_xgb)

# LightGBM
y_pred_lgb = lgb_clf.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb:.4f}")
report_lgb = classification_report(y_test, y_pred_lgb)
print("Classification Report:\n", report_lgb)

# CatBoost
y_pred_cat = cat_clf.predict(X_test)
accuracy_cat = accuracy_score(y_test, y_pred_cat)
print(f"CatBoost Accuracy: {accuracy_cat:.4f}")
report_cat = classification_report(y_test, y_pred_cat)
print("Classification Report:\n", report_cat)

In [None]:
# # Hyperparameter Tuning

# # XGBoost

# # Define parameter grid
# xgb_param_grid = {
#     'n_estimators': [100, 300, 500],
#     'max_depth': [3, 6, 9],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0]
# }

# # Initialize GridSearchCV
# xgb_grid = GridSearchCV(
#     estimator=xgb_clf,
#     param_grid=xgb_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# # LightGBM
# lgb_param_grid = {
#     'n_estimators': [100, 300, 500],
#     'num_leaves': [31, 50, 100],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0]
# }


# lgb_grid = GridSearchCV(
#     estimator=lgb_clf,
#     param_grid=lgb_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# lgb_grid.fit(X_train, y_train)

# print("LightGBM Best Parameters:", lgb_grid.best_params_)
# print("LightGBM Best CV Accuracy:", lgb_grid.best_score_)

# y_pred_lgb_best = lgb_grid.best_estimator_.predict(X_test)
# accuracy_lgb_best = accuracy_score(y_test, y_pred_lgb_best)
# print(f"LightGBM Best Accuracy: {accuracy_lgb_best:.4f}")
# print("Classification Report:\n", classification_report(y_test, y_pred_lgb_best))

# # CatBoost

# cat_param_grid = {
#     'depth': [4, 6, 8],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'iterations': [500, 1000],
#     'l2_leaf_reg': [1, 3, 5]
# }

# cat_grid = GridSearchCV(
#     estimator=cat_clf,
#     param_grid=cat_param_grid,
#     scoring='accuracy',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# cat_grid.fit(X_train, y_train)

# print("CatBoost Best Parameters:", cat_grid.best_params_)
# print("CatBoost Best CV Accuracy:", cat_grid.best_score_)

# y_pred_cat_best = cat_grid.best_estimator_.predict(X_test)
# accuracy_cat_best = accuracy_score(y_test, y_pred_cat_best)
# print(f"CatBoost Best Accuracy: {accuracy_cat_best:.4f}")
# print("Classification Report:\n", classification_report(y_test, y_pred_cat_best))

### Stacking

In [None]:
# Keep only the columns whose fraction of missing values is below
# (1 - threshold), i.e. roughly below 30% missing with threshold = 0.7.
# NOTE(review): X_train / X_test were split from this frame in the Boosting
# section *before* this filter runs, so the stacked model below still sees the
# unfiltered feature set — confirm whether the filter was meant to apply to X.
threshold = 0.7
cols_before = df_train_raw_time_series.shape[1]
df_train_raw_time_series = df_train_raw_time_series.loc[:, df_train_raw_time_series.isnull().mean() < (1 - threshold)]
cols_after = df_train_raw_time_series.shape[1]

# Meta-learner fitted on the base models' cross-validated predictions.
# `multi_class` is deprecated since scikit-learn 1.5; with the lbfgs solver the
# default already fits a multinomial model on a multiclass target, so the
# argument is omitted.
meta_clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

# Base learners are the xgb_clf / cat_clf objects configured in the Boosting
# section above.
base_estimators = [
    ('xgb', xgb_clf),
    ('cat', cat_clf)
]

# passthrough=False: the meta-learner only receives the base models'
# predictions, not the raw features.
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_clf,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    passthrough=False,
    n_jobs=-1
)

# NOTE(review): unlike the standalone XGBoost fit, no sample_weight is passed
# here, so the XGBoost base learner is trained unweighted inside the stack —
# confirm this is intended.
stacking_clf.fit(X_train, y_train)

y_pred_stacking = stacking_clf.predict(X_test)

# `classes` was previously undefined (NameError). Take the labels from the
# fitted model and pass them as `labels` too, so `target_names` always has a
# matching length even if a class is absent from y_test.
classes = stacking_clf.classes_
print(
    "stacking classification reporting:\n",
    classification_report(
        y_test,
        y_pred_stacking,
        labels=classes,
        target_names=[f"类别{cls}" for cls in classes]
    )
)

## Test Data

After comparing the models, we found that the XX model performed best, so we use it on the test dataset.

In [None]:
# Prepare the frame that the final model will score: load it, strip the
# non-feature columns, then impute, label-encode and scale it.
# (pandas / LabelEncoder are imported in the notebook's top import cell.)
# NOTE(review): this reads the *train* feature file even though the section is
# about the test dataset — confirm whether "df_test_raw_features.csv" was meant.
test_data = pd.read_csv("df_train_raw_features.csv")
# NOTE: `id` shadows the builtin of the same name; the name is kept because the
# result cell further down reads it.
id = test_data['id']

test_data = test_data.drop(columns=['id', 'Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
# NOTE(review): `high_missing_cols`, `imputer` and `scaler` are not defined in
# the visible part of the notebook — they must already exist (fitted on the
# training data) before this cell runs.
test_data = test_data.drop(columns=high_missing_cols)
# The target is only present in labelled files; tolerate its absence so the
# cell also works on a real, unlabelled test file.
test_data = test_data.drop(columns=['sii'], errors='ignore')

test_data_imputed = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)

# NOTE(review): a fresh LabelEncoder is fitted on this frame for every object
# column, so the integer codes are not guaranteed to match the encoding used
# at training time — confirm, or reuse the encoders fitted on the train data.
cat_features = test_data_imputed.select_dtypes(include=['object']).columns
for col in cat_features:
    test_data_imputed[col] = LabelEncoder().fit_transform(test_data_imputed[col])

test_data_scaled = pd.DataFrame(scaler.transform(test_data_imputed), columns=test_data_imputed.columns)
print("Test data cleaned successfully!")


In [None]:
# Predict the sii class for every preprocessed row and store the predictions
# as a new column on `test_data`.
# NOTE(review): `stacking_model` is not defined in the visible part of the
# notebook (the stacking cells above create `stacking_clf`, which was fitted
# on un-imputed, un-scaled features) — confirm which fitted model is intended.
test_preds = stacking_model.predict(test_data_scaled)

test_data['sii_prediction'] = test_preds

In [None]:
# Two-column result table: the ids saved earlier next to the predicted sii
# values; print the first rows as a quick check.
result_columns = {
    'id': id,
    'sii_prediction': test_data['sii_prediction'],
}
result_df = pd.DataFrame(result_columns)

print(result_df.head())

In [None]:
# Write the result table to disk without the index column, then confirm.
predictions_path = "final_predictions.csv"
result_df.to_csv(predictions_path, index=False)
print("Final DataFrame saved to 'final_predictions.csv'")