In [92]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.utils import class_weight

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

import optuna
from optuna.integration import XGBoostPruningCallback

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


# Load data & data cleaning

In [None]:
# tabular data
df_train_csv = pd.read_csv('/Users/brenda/Downloads/child-mind-institute-problematic-internet-use/train.csv')
df_test_csv = pd.read_csv('/Users/brenda/Downloads/child-mind-institute-problematic-internet-use/test.csv')

In [None]:
# train data

# delete all rows with NA for sii column
df_train_csv_filter = df_train_csv[df_train_csv['sii'].notna()]

# Make sure features are same at train dataset and test dataset
common_columns = list(set(df_train_csv_filter.columns) & set(df_test_csv.columns))

common_columns_train = common_columns + ['sii']
df_train_csv_filter = df_train_csv_filter[sorted(common_columns_train)]

df_test_csv = df_test_csv[sorted(common_columns)]

# feature aggregation -- BMI, height, weight
df_train_csv_filter = df_train_csv_filter.drop(columns=['BIA-BIA_BMI', 'Physical-Weight', 'Physical-Height'])

# convert time (mins)
df_train_csv_filter['Fitness_Endurance-Time'] = df_train_csv_filter['Fitness_Endurance-Time_Mins'] + df_train_csv_filter['Fitness_Endurance-Time_Sec']/60
df_train_csv_filter = df_train_csv_filter.drop(columns=['Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec'])

# delete all season columns
season_columns = [col for col in df_train_csv_filter.columns if "Season" in col]
df_train_csv_filter = df_train_csv_filter.drop(columns=season_columns)

# test data

# feature aggregation -- BMI, height, weight
df_test_csv = df_test_csv.drop(columns=['BIA-BIA_BMI', 'Physical-Weight', 'Physical-Height'])

# convert time (mins)
df_test_csv['Fitness_Endurance-Time'] = df_test_csv['Fitness_Endurance-Time_Mins'] + df_test_csv['Fitness_Endurance-Time_Sec']/60
df_test_csv = df_test_csv.drop(columns=['Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec'])

# delete all season columns
season_columns = [col for col in df_test_csv.columns if "Season" in col]
df_test_csv = df_test_csv.drop(columns=season_columns)

In [None]:
# accelerometer (actigraphy) series
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = [fname for fname in os.listdir(dirname) if not fname.startswith('.')]  
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

train_stat = load_time_series("series_train.parquet")
test_stat = load_time_series("series_test.parquet")

# Merge data

In [93]:
df_train_raw_stat=df_train_csv_filter.merge(train_stat,on='id')
df_test_raw_stat=df_test_csv.merge(test_stat,on='id')

# Model

In [94]:
# Features and target
X = df_train_raw_stat.drop(columns=['id', 'Unnamed: 0', 'sii', 'Unnamed: 0.1'], axis=1)
y = df_train_raw_stat['sii']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [95]:
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_dict = dict(zip(np.unique(y_train), class_weights))

sample_weights = y_train.map(class_weights_dict)

# New metrics to evaluate result

In [96]:
# 计算每个元素的 (col - raw)^2 / (rank - 1)
def transform_confusion_matrix(O):
    rows, cols = O.shape
    rank = max(rows, cols)  # 取矩阵的秩
    result = np.zeros_like(O, dtype=float)
    for i in range(rows):
        for j in range(cols):
            numerator = (j - i) ** 2
            denominator = (rank - 1)**2 if rank - 1 != 0 else 1  # 避免除以0
            result[i, j] = numerator / denominator
    return result


def get_E(pred, test, O):
    # 计算唯一值和对应的计数
    unique_test, counts_test = np.unique(test, return_counts=True)
    unique_pred, counts_pred = np.unique(pred, return_counts=True)

    # 找出 test 中有但 pred 中缺失的值
    missing_values = set(unique_test) - set(unique_pred)

    # 将缺失值添加到 unique_pred 和 counts_pred 中，并设置计数为0
    if missing_values:
        missing_values = sorted(list(missing_values))
        unique_pred = np.concatenate([unique_pred, missing_values])
        counts_pred = np.concatenate([counts_pred, [0] * len(missing_values)])

    # 按 unique_pred 重新排序，确保顺序一致
    sorted_indices = np.argsort(unique_pred)
    unique_pred = unique_pred[sorted_indices]
    counts_pred = counts_pred[sorted_indices]

    # 计算 counts_test 和 counts_pred 的外积
    outer_product = np.outer(counts_test, counts_pred)
    return outer_product*np.sum(O)/np.sum(outer_product)



def matric(pred, test, num_classes):
    pred = np.array(pred)
    test = np.array(test)
    
    # Generate the confusion matrix with all expected class labels
    O = confusion_matrix(test, pred, labels=np.arange(num_classes))
    W = transform_confusion_matrix(O)
    E = get_E(pred, test, O)  # Adjust get_E if it needs num_classes
    
    if not (O.shape == W.shape == E.shape):
        raise ValueError("O, W, and E must have the same shape.")
    
    numerator = np.sum(W * O)
    denominator = np.sum(W * E)
    
    if denominator == 0:
        raise ValueError("Denominator is zero, cannot calculate kappa.")
    
    kappa = 1 - (numerator / denominator)
    return kappa


In [98]:
def train_catoost(X_train, y_train, X_test, y_test, sample_weights=None, early_stopping_rounds=10):
    params = {
        'iterations':1000,
        'learning_rate':0.1,
        'depth':6,
        'loss_function':'MultiClass',
        'verbose':100,
        'class_weights':class_weights_dict,
        'random_state':42
    }
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, verbose=False)
    
    # Evaluate the model on the test set using the default accuracy score
    preds = model.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, preds))
    
    # Evaluate using the custom metric
    custom_metric_score = matric(preds, y_test, num_classes=4)
    print(f"Custom Metric (Kappa): {custom_metric_score:.4f}")
    
    return model

# Example usage (ensure sample_weights are defined or pass None if not using)
baseline_model = train_catoost(X_train, y_train, X_test, y_test, sample_weights=None)


Classification Report:
               precision    recall  f1-score   support

         0.0       0.71      0.85      0.77       117
         1.0       0.42      0.26      0.33        53
         2.0       0.33      0.32      0.33        28
         3.0       0.00      0.00      0.00         2

    accuracy                           0.61       200
   macro avg       0.37      0.36      0.36       200
weighted avg       0.57      0.61      0.58       200

Custom Metric (Kappa): 0.4106


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Adjust hyperparameter

In [None]:
# def objective(trial):
#     # Define hyperparameters to tune for CatBoost
#     param = {
#         'iterations': trial.suggest_int('iterations', 100, 1000),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
#         'depth': trial.suggest_int('depth', 4, 10),
#         'loss_function': 'MultiClass',  
#         'random_seed': 42,
#         'verbose': False,
#         'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0)
#     }

#     # Using Stratified K-Fold for maintaining class distribution
#     kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     custom_scores = []  # To hold custom metric scores

#     for train_idx, valid_idx in kf.split(X_train, y_train):
#         X_train_fold, X_valid_fold = X_train.iloc[train_idx], X_train.iloc[valid_idx]
#         y_train_fold, y_valid_fold = y_train.iloc[train_idx], y_train.iloc[valid_idx]

#         # Initialize and fit the model with parameters
#         model = CatBoostClassifier(**param)
#         model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)])

#         # Predict on validation set
#         preds = model.predict(X_valid_fold)

#         # Calculate the custom metric, ensure matric can handle this
#         kappa_score = matric(preds, y_valid_fold, num_classes=4)  # Assuming num_classes parameter is required
#         custom_scores.append(kappa_score)

#         # Optional: Use trial pruning based on custom metric
#         if trial.should_prune():
#             raise optuna.exceptions.TrialPruned()

#     return np.mean(custom_scores)

# # Create and optimize an Optuna study
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

# # Output the best parameters
# print("Best parameters:", study.best_params)


[I 2024-12-19 15:22:45,201] A new study created in memory with name: no-name-ffc7a2c0-41aa-4f60-9600-c51eca8e317c
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0)
[I 2024-12-19 15:24:18,118] Trial 0 finished with value: 0.13540892499275153 and parameters: {'iterations': 138, 'learning_rate': 0.049472861950170754, 'depth': 10, 'l2_leaf_reg': 0.0017416972257690002}. Best is trial 0 with value: 0.13540892499275153.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0)
[I 2024-12-19 15:26:55,050] Trial 1 finished with value: 0.22930321056512115 and parameters: {'iterations': 928, 'learning_rate': 0.2884130764278226, 'depth': 8, 'l2_leaf_reg': 0.19067267539839683}. Best is trial 1 with value: 0.22930321056512115.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.sugge

Best parameters: {'iterations': 1000, 'learning_rate': 0.042699302907617476, 'depth': 4, 'l2_leaf_reg': 9.478264516370832}


In [101]:
# Best parameters from your hyperparameter tuning
best_params = {
        'iterations':1000,
        'learning_rate':0.1,
        'depth':6,
        'loss_function':'MultiClass',
        'verbose':100,
        'class_weights':class_weights_dict,
        'random_state':42
    }

# Initialize the CatBoost Classifier with best parameters
final_model = CatBoostClassifier(**best_params)

# Fit the model on the entire training data
final_model.fit(X_train, y_train)

# Predict on the test set
preds = final_model.predict(X_test)

# Output the classification report and accuracy
print("Classification Report:\n", classification_report(y_test, preds))
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, preds) * 100))

# Correctly calculating the custom metric
custom_metric_score = matric(preds, y_test, num_classes=4)
print(f"Custom Metric (Kappa): {custom_metric_score:.4f}")


0:	learn: 1.3385279	total: 35.3ms	remaining: 35.3s
100:	learn: 0.4073250	total: 1.59s	remaining: 14.1s
200:	learn: 0.2276827	total: 3.03s	remaining: 12s
300:	learn: 0.1502110	total: 6.45s	remaining: 15s
400:	learn: 0.1073305	total: 9.55s	remaining: 14.3s
500:	learn: 0.0793227	total: 12.8s	remaining: 12.8s
600:	learn: 0.0622787	total: 16.1s	remaining: 10.7s
700:	learn: 0.0502381	total: 20.5s	remaining: 8.73s
800:	learn: 0.0417369	total: 25.7s	remaining: 6.37s
900:	learn: 0.0352036	total: 31.6s	remaining: 3.47s
999:	learn: 0.0301700	total: 37.3s	remaining: 0us
Classification Report:
               precision    recall  f1-score   support

         0.0       0.71      0.85      0.77       117
         1.0       0.42      0.26      0.33        53
         2.0       0.33      0.32      0.33        28
         3.0       0.00      0.00      0.00         2

    accuracy                           0.61       200
   macro avg       0.37      0.36      0.36       200
weighted avg       0.57      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# pred

In [99]:
test_data = df_test_raw_stat.drop(columns=['id', 'Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [103]:
test_preds = final_model.predict(test_data)

test_data['sii_prediction'] = test_preds

In [104]:
# combine new DataFrame
result_df = pd.DataFrame({
    'id': id,
    'sii_prediction': test_data['sii_prediction']
})

print(result_df.head())

         id  sii_prediction
0  00115b9f             1.0
1  001f3379             1.0


In [105]:
result_df.to_csv("submission.csv", index=False)
print("Final DataFrame saved to 'submission.csv'")

Final DataFrame saved to 'final_predictions.csv'
