# [IDPP CLEF Challenge](http://brainteaser.dei.unipd.it/challenges/idpp2023/)


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor
import lightgbm

# machine learning
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import r_regression
from sklearn import preprocessing

In [None]:
# Name of the dataset to load; used to build both the data directory and the
# per-table file-name prefixes.
DATASET = "datasetA"
# Directory that holds the training CSV files of the selected dataset.
DATASET_DIR = f"../data/{DATASET}_train"

# Column identifying a patient; used as the join key between tables.
ID_FEAT = "patient_id"

# Never truncate rows when displaying DataFrames / Series.
pd.set_option('display.max_rows', None)

# Load and Merge data

In [None]:
def filenames_in_folder(dir_path):
    """Return the names of the files located directly inside ``dir_path``.

    Sub-directories are not descended into. An empty list is returned when
    ``os.walk`` yields nothing for ``dir_path``.
    """
    # Only the first tuple produced by os.walk (the top-level directory) is used.
    _, _, top_level_files = next(os.walk(dir_path), (None, None, ()))
    return list(top_level_files)

def read_dfs(dir_path):
    """Load every CSV file located directly inside ``dir_path``.

    Args:
        dir_path: directory to scan (sub-directories are ignored).

    Returns:
        dict mapping each file name without its ``.csv`` suffix to the
        DataFrame produced by ``pd.read_csv`` for that file.
    """
    # Filter on ".csv" (with the dot) so the test agrees with removesuffix:
    # a name that merely ends in "csv" is not treated as a CSV file.
    return {
        file_name.removesuffix(".csv"): pd.read_csv(os.path.join(dir_path, file_name))
        for file_name in filenames_in_folder(dir_path)
        if file_name.endswith(".csv")
    }

def merge_csv_in_dir(dfs, dataset):
    """Merge the training tables of ``dataset`` into one wide row per patient.

    The static-variables and outcomes tables are outer-joined on ``ID_FEAT``.
    Each longitudinal table (EDSS, relapses, MS type, evoked potentials, MRI)
    is then pivoted so that the k-th observation of a feature, ordered by the
    table's time column, lands in column ``"<feature>_<k:02d>"``, and is
    outer-joined on ``[ID_FEAT, "centre"]``.

    Args:
        dfs: dict of DataFrames keyed by file name without extension (the
            structure returned by ``read_dfs``).
        dataset: dataset name used as key prefix, e.g. ``"datasetA"``.

    Returns:
        The merged wide DataFrame.

    Raises:
        KeyError: if one of the expected tables is missing from ``dfs``.
    """

    def transpose_cols_to_rows_by_uniques(orig_df, id_feat, other_feature):
        # One row per id; the observations of a patient are spread over the
        # columns "<feature>_01", "<feature>_02", ... (shorter series are padded).
        per_id = orig_df.groupby(id_feat)[other_feature].apply(lambda x: x.values)
        # Use the groupby's own index so rows and labels cannot get out of step.
        transposed_df = pd.DataFrame(per_id.values.tolist(), index=per_id.index)
        transposed_df.columns = [f'{other_feature}_{i:02d}' for i in range(1, len(transposed_df.columns) + 1)]
        return transposed_df

    def transpose_cols_to_rows_by_1st_unique(orig_df, id_feat, other_feature):
        # One row per id holding the first value of the feature for that id.
        return orig_df.groupby(id_feat)[other_feature].apply(lambda x: x.values[0]).to_frame()

    def transpose_df_by_uniques(orig_df, id_feat, time_series_feats, one_occurrence_feats, sort_by_this_feat):
        # Sort by the id column that was passed in (not the module-level
        # constant) so the helper stays consistent with its own parameters.
        orig_df = orig_df.sort_values(by=[id_feat, sort_by_this_feat])
        ts_dfs = [transpose_cols_to_rows_by_uniques(orig_df, id_feat, ts_feat) for ts_feat in time_series_feats]
        oo_dfs = [transpose_cols_to_rows_by_1st_unique(orig_df, id_feat, oo_feat) for oo_feat in one_occurrence_feats]

        out_df = pd.concat([*oo_dfs, *ts_dfs], axis=1)
        out_df.reset_index(names=id_feat, inplace=True)
        return out_df

    merged_df = pd.merge(dfs[f"{dataset}_train-static-vars"], dfs[f"{dataset}_train-outcomes"],
                         on=ID_FEAT, how="outer")

    # (table suffix, time-series features, time column used for ordering),
    # listed in the order in which the tables are merged.
    longitudinal_tables = [
        ("edss",
         ["edss_as_evaluated_by_clinician", "delta_edss_time0"],
         "delta_edss_time0"),
        ("relapses",
         ["delta_relapse_time0"],
         "delta_relapse_time0"),
        ("ms-type",
         ["multiple_sclerosis_type", "delta_observation_time0"],
         "delta_observation_time0"),
        ("evoked-potentials",
         ["altered_potential", "potential_value", "location", "delta_evoked_potential_time0"],
         "delta_evoked_potential_time0"),
        ("mri",
         ["mri_area_label", "lesions_T1", "lesions_T1_gadolinium", "number_of_lesions_T1_gadolinium",
          "new_or_enlarged_lesions_T2", "number_of_new_or_enlarged_lesions_T2", "lesions_T2",
          "number_of_total_lesions_T2", "delta_mri_time0"],
         "delta_mri_time0"),
    ]

    for table, ts_feats, sort_by_this_feat in longitudinal_tables:
        wide_df = transpose_df_by_uniques(dfs[f"{dataset}_train-{table}"], ID_FEAT,
                                          ts_feats, ["centre"], sort_by_this_feat)
        merged_df = pd.merge(merged_df, wide_df, on=[ID_FEAT, "centre"], how="outer")

    return merged_df

In [None]:
# Load all CSV tables of the dataset and merge them into one wide frame.
dfs = read_dfs(DATASET_DIR)
df = merge_csv_in_dir(dfs, DATASET)
# Drop columns that contain no value at all.
df = df.dropna(axis=1, how="all")
print(df.columns.values)
df.head()

In [None]:
# Summary statistics of the merged frame.
df.describe()

In [None]:
# Column names grouped by dtype, followed by the frame's info summary.
print(df.columns.to_series().groupby(df.dtypes).groups)
df.info()

In [None]:
# Percentage of missing values per column, highest first.
missing_values = ((df.isnull().sum() * 100 / len(df)).sort_values(ascending=False))
print("Missing value rate:\n", missing_values.to_string())

In [None]:
# Distribution plots (histogram with KDE overlay) of the two outcome columns.
sns.displot(df["outcome_time"],kde=True, legend=True)
sns.displot(df["outcome_occurred"],kde=True, legend=True)


In [None]:
# corr_mat = df[continuous_values].corr()
# f, ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(corr_mat,  vmax=1, cmap="viridis", square=True)


# Preprocess data

In [None]:
def collapse_ts_feature_cols(df, feature, start_idx, end_idx=None):
    """Collapse a range of numbered time-series columns into one missingness flag.

    Columns named ``"<feature>_<k>"`` (``k`` written in digits, e.g. ``"_03"``)
    whose number lies in ``[start_idx, end_idx)`` -- or is ``>= start_idx``
    when ``end_idx`` is None -- are dropped and replaced by one int column
    that is 1 when all of them are NaN for the row and 0 otherwise.

    Args:
        df: wide DataFrame containing the numbered columns.
        feature: exact feature name preceding the ``_<k>`` suffix.
        start_idx: first column number to collapse (inclusive).
        end_idx: first column number to keep again (exclusive); None means
            "up to the last column".

    Returns:
        A new DataFrame with the selected columns removed and the flag column
        ``"<feature>_<start_idx>+"`` (open range) or
        ``"<feature>_<start_idx>-<end_idx>"`` appended. ``df`` is not modified.
    """
    prefix = f"{feature}_"

    def _position(col_name):
        # Number of a "<feature>_<k>" column, or None for every other column.
        # Requiring an all-digit remainder keeps features that merely share
        # the prefix (e.g. "lesions_T1_gadolinium_01" for "lesions_T1") out.
        if not isinstance(col_name, str) or not col_name.startswith(prefix):
            return None
        suffix = col_name[len(prefix):]
        if suffix.isascii() and suffix.isdigit():
            return int(suffix)
        return None

    if end_idx is not None:
        upper_bound = end_idx
        new_feat_name = f"{feature}_{start_idx}-{end_idx}"
    else:
        upper_bound = float("inf")
        new_feat_name = f"{feature}_{start_idx}+"

    cols_to_collapse = []
    for col in df.columns:
        position = _position(col)
        if position is not None and start_idx <= position < upper_bound:
            cols_to_collapse.append(col)

    # 1 -> none of the collapsed columns holds a value for this row.
    all_missing = df[cols_to_collapse].isna().all(axis=1).astype(int)
    return df.drop(columns=cols_to_collapse).assign(**{new_feat_name: all_missing})

In [None]:
# Arguments for collapse_ts_feature_cols, one tuple per feature:
# (feature, start_idx, end_idx). end_idx is None everywhere, so each entry
# collapses the numbered columns from start_idx onwards.
feats_to_be_collapsed = [("new_or_enlarged_lesions_T2", 5, None),
                         ("number_of_new_or_enlarged_lesions_T2",   5, None),
                         ("altered_potential", 9, None),
                         ("potential_value", 9, None),
                         ("delta_relapse_time0", 3, None),
                         ("mri_area_label", 6, None),
                         ("delta_mri_time0", 6, None),
                         ("lesions_T1", 3, None),
                         ("lesions_T2", 3, None),
                         ("delta_evoked_potential_time0", 9, None),
                         ("lesions_T1_gadolinium", 5, None),
                         ("number_of_lesions_T1_gadolinium", 6, None),
                         ("edss_as_evaluated_by_clinician", 11, None),
                         ("location", 9, None),
                         ("delta_edss_time0", 10, None),
                         ("number_of_total_lesions_T2", 3, None)]

def collapse_cols(df, feats_to_be_collapsed):
    """Apply ``collapse_ts_feature_cols`` once per entry, in list order.

    Each entry of ``feats_to_be_collapsed`` is unpacked as the positional
    arguments that follow ``df``. The frame produced by the last call is
    returned.
    """
    collapsed = df
    for collapse_args in feats_to_be_collapsed:
        collapsed = collapse_ts_feature_cols(collapsed, *collapse_args)
    return collapsed

# Replace the sparse tail of every time series by a single missingness flag.
df = collapse_cols(df, feats_to_be_collapsed)


In [None]:
# Missing-value percentages and dtype grouping after collapsing the columns.
missing_values = ((df.isnull().sum() * 100 / len(df)).sort_values(ascending=False))
print("Missing value rate:\n", missing_values.to_string())
print(df.columns.to_series().groupby(df.dtypes).groups)
df.head()

In [None]:
# Columns shown in the correlation heatmap. The numbered columns are generated
# from (feature, number of kept observations) pairs instead of being spelled
# out one by one; the resulting list has the same content and order as the
# hand-written one it replaces.
heatmap_flag_cols = ['new_or_enlarged_lesions_T2_5+', 'number_of_new_or_enlarged_lesions_T2_5+',
                     'altered_potential_9+', 'potential_value_9+', 'delta_relapse_time0_3+',
                     'mri_area_label_6+', 'delta_mri_time0_6+', 'lesions_T1_3+', 'lesions_T2_3+',
                     'delta_evoked_potential_time0_9+', 'lesions_T1_gadolinium_5+',
                     'number_of_lesions_T1_gadolinium_6+', 'edss_as_evaluated_by_clinician_11+',
                     'location_9+', 'delta_edss_time0_10+', 'number_of_total_lesions_T2_3+']
heatmap_series_spec = [('delta_relapse_time0', 2),
                       ('delta_observation_time0', 2),
                       ('number_of_lesions_T1_gadolinium', 5),
                       ('number_of_new_or_enlarged_lesions_T2', 4),
                       ('delta_mri_time0', 5),
                       ('delta_evoked_potential_time0', 8),
                       ('edss_as_evaluated_by_clinician', 10),
                       ('delta_edss_time0', 9)]
heatmap_series_cols = [f"{feature}_{i:02d}"
                       for feature, count in heatmap_series_spec
                       for i in range(1, count + 1)]

values = ['outcome_occurred', "outcome_time",
          'ms_in_pediatric_age', 'spinal_cord_symptom', 'brainstem_symptom', 'eye_symptom',
          'supratentorial_symptom',
          *heatmap_flag_cols,
          'age_at_onset', 'time_since_onset', 'diagnostic_delay',
          *heatmap_series_cols]
corr_mat = df[values].corr()
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_mat,  vmax=1, cmap="viridis", square=True)




In [None]:
import fastcore
from fastai.tabular.all import RandomSplitter, range_of, TabularPandas, Categorify, FillMissing, Normalize

# Feature columns grouped by dtype. Numbered time-series columns are generated
# from (feature, number of kept observations) pairs; the lists have the same
# content and order as if every name were written out.
numbered_float_spec = [('delta_relapse_time0', 2), ('delta_observation_time0', 2),
                       ('number_of_lesions_T1_gadolinium', 5),
                       ('number_of_new_or_enlarged_lesions_T2', 4),
                       ('delta_mri_time0', 5), ('delta_evoked_potential_time0', 8),
                       ('edss_as_evaluated_by_clinician', 10), ('delta_edss_time0', 9)]
numbered_object_spec = [('multiple_sclerosis_type', 2), ('mri_area_label', 5), ('lesions_T1', 2),
                        ('lesions_T1_gadolinium', 2), ('new_or_enlarged_lesions_T2', 4),
                        ('lesions_T2', 2), ('number_of_total_lesions_T2', 2),
                        ('altered_potential', 8), ('potential_value', 8), ('location', 8)]

cols_typed = {
    "bool": ['ms_in_pediatric_age', 'spinal_cord_symptom', 'brainstem_symptom', 'eye_symptom',
             'supratentorial_symptom'],
    "int32": ['new_or_enlarged_lesions_T2_5+', 'number_of_new_or_enlarged_lesions_T2_5+',
              'altered_potential_9+', 'potential_value_9+', 'delta_relapse_time0_3+',
              'mri_area_label_6+', 'delta_mri_time0_6+', 'lesions_T1_3+', 'lesions_T2_3+',
              'delta_evoked_potential_time0_9+', 'lesions_T1_gadolinium_5+',
              'number_of_lesions_T1_gadolinium_6+', 'edss_as_evaluated_by_clinician_11+',
              'location_9+', 'delta_edss_time0_10+', 'number_of_total_lesions_T2_3+'],
    "int64": ['age_at_onset', 'time_since_onset'],
    "float64": ['diagnostic_delay',
                *[f"{feature}_{i:02d}" for feature, count in numbered_float_spec
                  for i in range(1, count + 1)]],
    "object": ['sex', 'residence_classification', 'ethnicity', 'other_symptoms', 'centre',
               *[f"{feature}_{i:02d}" for feature, count in numbered_object_spec
                 for i in range(1, count + 1)]],
}

cat_names = [*cols_typed["bool"], *cols_typed["object"]]
cont_names = [*cols_typed["int32"], *cols_typed["int64"], *cols_typed["float64"]]

# Fixed seed: the 80/20 train/validation split is the same on every run.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

# Classification target: whether the outcome occurred.
to = TabularPandas(df, procs=[Categorify, FillMissing],
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names='outcome_occurred',
                   splits=splits)

In [None]:
# Processed feature frames and flattened 1-D target arrays of both splits.
X_train, y_train = to.train.xs, to.train.ys.values.ravel()
X_valid, y_valid = to.valid.xs, to.valid.ys.values.ravel()

In [None]:
# Candidate classifiers for the binary "outcome_occurred" target.
log_reg = LogisticRegression(max_iter=1000)
svc = SVC()
knn = KNeighborsClassifier(n_neighbors=5)
gauss = GaussianNB()
perceptron = Perceptron()
linear_svc = LinearSVC()
sgd = SGDClassifier()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=200, n_jobs=-1)

# eval_metric is 'logloss' (binary log loss) to match the binary:logistic
# objective; 'mlogloss' is the multiclass variant.
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                  colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
                  gamma=0, gpu_id=-1, importance_type='gain',
                  interaction_constraints='', learning_rate=0.300000012,
                  max_delta_step=0, max_depth=10, min_child_weight=1,
                  monotone_constraints='()', n_estimators=1000, n_jobs=20,
                  num_parallel_tree=1, objective='binary:logistic', random_state=0,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
                  tree_method='exact',
                  validate_parameters=1, verbosity=None)

# NOTE(review): `silent` and `gpu_id` may not be accepted by recent
# lightgbm / xgboost releases -- confirm against the installed versions.
lgm = lightgbm.LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
                random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Order matters: later cells read models[0] as the logistic regression.
models = [log_reg, svc, knn, gauss, perceptron, linear_svc, sgd, decision_tree, random_forest, xgb, lgm]


In [None]:
# Fit every classifier and record its train / validation accuracy in percent,
# keyed by the estimator's class name.
accuracies = {}

for model in models:
    model.fit(X_train, y_train)
    train_acc = round(100 * model.score(X_train, y_train), 2)
    val_acc = round(100 * model.score(X_valid, y_valid), 2)
    accuracies[type(model).__name__] = {"train_acc": train_acc, "val_acc": val_acc}

# Validation-set class predictions of the LightGBM classifier.
classification_preds = lgm.predict(X_valid)
# One row per model, best validation accuracy first.
acc_df = pd.DataFrame.from_dict(accuracies, orient="index").sort_values("val_acc", ascending=False)
acc_df

In [None]:
# Pair every feature with the coefficient of the first model in `models`
# (the logistic regression) and display the table, largest value first.
coeff_df = pd.DataFrame({"Feature": X_train.columns.values}).assign(
    Correlation=pd.Series(models[0].coef_[0])
)

coeff_df.sort_values("Correlation", ascending=False)


# Regression

In [None]:
# Feature columns grouped by dtype for the regression task. Numbered
# time-series columns are generated from (feature, number of kept
# observations) pairs; the lists have the same content and order as if every
# name were written out.
numbered_float_spec = [('delta_relapse_time0', 2), ('delta_observation_time0', 2),
                       ('number_of_lesions_T1_gadolinium', 5),
                       ('number_of_new_or_enlarged_lesions_T2', 4),
                       ('delta_mri_time0', 5), ('delta_evoked_potential_time0', 8),
                       ('edss_as_evaluated_by_clinician', 10), ('delta_edss_time0', 9)]
numbered_object_spec = [('multiple_sclerosis_type', 2), ('mri_area_label', 5), ('lesions_T1', 2),
                        ('lesions_T1_gadolinium', 2), ('new_or_enlarged_lesions_T2', 4),
                        ('lesions_T2', 2), ('number_of_total_lesions_T2', 2),
                        ('altered_potential', 8), ('potential_value', 8), ('location', 8)]

cols_typed = {
    "bool": ['ms_in_pediatric_age', 'spinal_cord_symptom', 'brainstem_symptom', 'eye_symptom',
             'supratentorial_symptom'],
    "int32": ['new_or_enlarged_lesions_T2_5+', 'number_of_new_or_enlarged_lesions_T2_5+',
              'altered_potential_9+', 'potential_value_9+', 'delta_relapse_time0_3+',
              'mri_area_label_6+', 'delta_mri_time0_6+', 'lesions_T1_3+', 'lesions_T2_3+',
              'delta_evoked_potential_time0_9+', 'lesions_T1_gadolinium_5+',
              'number_of_lesions_T1_gadolinium_6+', 'edss_as_evaluated_by_clinician_11+',
              'location_9+', 'delta_edss_time0_10+', 'number_of_total_lesions_T2_3+'],
    # NOTE(review): "outcome_occurred" is used as an input feature here --
    # confirm it is available at prediction time.
    "int64": ['age_at_onset', 'time_since_onset', "outcome_occurred"],
    # 'outcome_time' stays in the continuous columns so that Normalize records
    # its mean/std (read back via to.means / to.stds); it is removed from the
    # model inputs below.
    "float64": ['diagnostic_delay', 'outcome_time',
                *[f"{feature}_{i:02d}" for feature, count in numbered_float_spec
                  for i in range(1, count + 1)]],
    "object": ['sex', 'residence_classification', 'ethnicity', 'other_symptoms', 'centre',
               *[f"{feature}_{i:02d}" for feature, count in numbered_object_spec
                 for i in range(1, count + 1)]],
}

cat_names = [*cols_typed["bool"], *cols_typed["object"]]
cont_names = [*cols_typed["int32"], *cols_typed["int64"], *cols_typed["float64"]]

# Fixed seed: the 80/20 train/validation split is the same on every run.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

to = TabularPandas(df, procs=[Categorify, FillMissing, Normalize],
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names='outcome_time',
                   splits=splits)

# The target is listed among the continuous columns, so it is part of `xs`;
# drop it (and the missing-indicator FillMissing may have derived from it)
# so the regressors cannot read the answer from their inputs.
target_derived_cols = ['outcome_time', 'outcome_time_na']
X_train = to.train.xs.drop(columns=target_derived_cols, errors="ignore")
y_train = to.train.ys.values.ravel()
X_valid = to.valid.xs.drop(columns=target_derived_cols, errors="ignore")
y_valid = to.valid.ys.values.ravel()

In [None]:
# y_train = df["outcome_time"]
# train_df = df.drop(["outcome_time", "patient_id"], axis=1)

# Validation function
n_folds = 5

def rmsle_cv(model):
    """Cross-validate ``model`` on the training split and return per-fold RMSE.

    Uses a shuffled ``n_folds``-fold split with a fixed random state. Despite
    the name, the value is the root of the mean squared error of whatever
    scale ``y_train`` is in; no logarithm is applied here.

    Args:
        model: unfitted scikit-learn compatible regressor.

    Returns:
        numpy array with one RMSE value per fold.
    """
    # Pass the splitter itself: KFold.get_n_splits() only returns the integer
    # number of folds, which would discard shuffle and random_state.
    kf = KFold(n_folds, shuffle=True, random_state=43)
    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse

## Models

In [None]:
# Linear models are wrapped in a pipeline that applies RobustScaler first.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=1),
)
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),
)

# Kernel ridge regression with a degree-2 polynomial kernel.
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Gradient-boosted tree ensembles.
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)
model_xgb = XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    random_state=7,
    nthread=-1,
)
model_lgb = lightgbm.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

# Display name -> estimator, in evaluation order.
regressors = {
    "Lasso": lasso,
    "ElasticNet": ENet,
    "Kernel Ridge": KRR,
    "Gradient Boosting": GBoost,
    "XGBoost": model_xgb,
    "LGBM": model_lgb,
}

In [None]:
# Cross-validate every regressor and report mean (std) of the per-fold scores.
for model_name, model in regressors.items():
    score = rmsle_cv(model)
    print(f"{model_name} score: {score.mean():.4f} ({score.std():.4f})")


In [None]:
# sns.kdeplot(y_valid)
# print(min(y_valid), max(y_valid), 15/(max(y_valid)-min(y_valid)))
# tmp = 15/(max(y_valid)-min(y_valid))

In [None]:
# Mean / std recorded for "outcome_time" by the TabularPandas object; used
# below to map values back from the normalized scale.
mean, std = to.means["outcome_time"], to.stds["outcome_time"]

In [None]:
# Fit the Lasso pipeline and compare its validation predictions with the
# ground truth after undoing the normalization.
model = regressors["Lasso"]
model.fit(X_train, y_train)

preds = model.predict(X_valid)

truth_rescaled = y_valid*std + mean
preds_rescaled = preds*std + mean
n_valid = len(y_valid)

# "MSE" / "MAE" hold each row's contribution to the total (already divided by
# the number of validation rows), so their column sums are the actual metrics.
pred_df = pd.DataFrame({
    "ground truth": truth_rescaled,
    "preds": preds_rescaled,
    "MSE": ((truth_rescaled - preds_rescaled)**2)/n_valid,
    "MAE": (abs(truth_rescaled - preds_rescaled))/n_valid,
})
total_mse = sum(pred_df['MSE'])
total_mae = sum(pred_df['MAE'])
print(f"Total error:\nMSE: {total_mse}\tMAE: {total_mae}")
pred_df.head(20)


# Scores and output

In [None]:
# TEAM_IDENTIFIER = uwb_T1a_metric_method
# Harrell's C-index: https://lifelines.readthedocs.io/en/latest/lifelines.utils.html
from lifelines.utils.concordance import concordance_index

lasso.fit(X_train, y_train)
regression_preds = lasso.predict(X_valid)

# concordance_index(event_times, predicted_scores, event_observed) measures how
# well the predictions order the *observed* times, so the ground truth goes
# first and the model output second. The event indicator is read from the
# unprocessed frame for the same validation rows.
# NOTE(review): assumes X_valid keeps the row labels of `df` -- confirm.
event_observed = df.loc[X_valid.index, "outcome_occurred"]
c_index = concordance_index(y_valid, regression_preds, event_observed)

c_index




In [None]:
# Repeat the classification experiment on the second dataset.
DATASET = "datasetB"
DATASET_DIR = f"../data/{DATASET}_train"

dfs = read_dfs(DATASET_DIR)
df = merge_csv_in_dir(dfs, DATASET)
df = df.dropna(axis=1, how="all")

df = collapse_cols(df, feats_to_be_collapsed)

# Feature columns grouped by dtype. Numbered time-series columns are generated
# from (feature, number of kept observations) pairs; the lists have the same
# content and order as if every name were written out.
numbered_float_spec = [('delta_relapse_time0', 2), ('delta_observation_time0', 2),
                       ('number_of_lesions_T1_gadolinium', 5),
                       ('number_of_new_or_enlarged_lesions_T2', 4),
                       ('delta_mri_time0', 5), ('delta_evoked_potential_time0', 8),
                       ('edss_as_evaluated_by_clinician', 10), ('delta_edss_time0', 9)]
numbered_object_spec = [('multiple_sclerosis_type', 2), ('mri_area_label', 5), ('lesions_T1', 2),
                        ('lesions_T1_gadolinium', 2), ('new_or_enlarged_lesions_T2', 4),
                        ('lesions_T2', 2), ('number_of_total_lesions_T2', 2),
                        ('altered_potential', 8), ('potential_value', 8), ('location', 8)]

cols_typed = {
    "bool": ['ms_in_pediatric_age', 'spinal_cord_symptom', 'brainstem_symptom', 'eye_symptom',
             'supratentorial_symptom'],
    "int32": ['new_or_enlarged_lesions_T2_5+', 'number_of_new_or_enlarged_lesions_T2_5+',
              'altered_potential_9+', 'potential_value_9+', 'delta_relapse_time0_3+',
              'mri_area_label_6+', 'delta_mri_time0_6+', 'lesions_T1_3+', 'lesions_T2_3+',
              'delta_evoked_potential_time0_9+', 'lesions_T1_gadolinium_5+',
              'number_of_lesions_T1_gadolinium_6+', 'edss_as_evaluated_by_clinician_11+',
              'location_9+', 'delta_edss_time0_10+', 'number_of_total_lesions_T2_3+'],
    "int64": ['age_at_onset', 'time_since_onset'],
    "float64": ['diagnostic_delay',
                *[f"{feature}_{i:02d}" for feature, count in numbered_float_spec
                  for i in range(1, count + 1)]],
    "object": ['sex', 'residence_classification', 'ethnicity', 'other_symptoms', 'centre',
               *[f"{feature}_{i:02d}" for feature, count in numbered_object_spec
                 for i in range(1, count + 1)]],
}

cat_names = [*cols_typed["bool"], *cols_typed["object"]]
cont_names = [*cols_typed["int32"], *cols_typed["int64"], *cols_typed["float64"]]

# Fixed seed: the 80/20 train/validation split is the same on every run.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

to = TabularPandas(df, procs=[Categorify, FillMissing],
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names='outcome_occurred',
                   splits=splits)

X_train, y_train = to.train.xs, to.train.ys.values.ravel()
X_valid, y_valid = to.valid.xs, to.valid.ys.values.ravel()


# Refit every classifier on this dataset and record train / validation
# accuracy in percent, keyed by the estimator's class name.
accuracies = {}

for model in models:
    model.fit(X_train, y_train)
    train_acc = round(model.score(X_train, y_train) * 100, 2)

    val_acc = round(model.score(X_valid, y_valid) * 100, 2)
    accuracies[model.__class__.__name__] = {"train_acc": train_acc, "val_acc": val_acc}


acc_df = pd.DataFrame(accuracies).transpose().sort_values("val_acc", ascending=False)
acc_df