In [36]:
# Run configuration for this notebook.
# NOTE(review): these look like externally injected run parameters — confirm before editing by hand.
data_dir = "../custom_data"  # root folder; later cells read processed/*.parquet and raw/*_labels.parquet from it
run_id = 1  # not passed to any call in the visible cells (save_metrics takes its own run_id argument)
out_dir = "/home/omadbek/projects/run_all/out-notebooks-cta/ml-results"  # absolute, machine-specific path; read by save_metrics

In [37]:
# Model identifier string; not referenced by any other visible cell.
model_id = 'sherlock'

In [38]:
# Set the hash seed in the kernel's environment, then echo it back to confirm the value.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here presumably
# only affects processes spawned afterwards (e.g. parallel workers), not this kernel — confirm.
%env PYTHONHASHSEED=13
%env PYTHONHASHSEED

env: PYTHONHASHSEED=13


'13'

In [39]:
# NOTE(review): `dtype` is not used by any visible cell and comes from a private pandas
# module; this looks like an accidental IDE auto-import — confirm and remove.
from pandas.core.arrays.integer import dtype
# Enable autoreload so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
import itertools

from ast import literal_eval
from collections import Counter
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
#from sklearn.ensemble import ExtraTreesClassifier,
#from sklearn.preprocessing import LabelEncoder
#from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate

from matplotlib import pyplot as plt
import time
import os

### Train

In [41]:
# Training split: feature table plus its labels, with every label lower-cased.
X_train = pd.read_parquet(f"{data_dir}/processed/train.parquet")
y_train = np.array([
    label.lower()
    for label in pd.read_parquet(f"{data_dir}/raw/train_labels.parquet").to_numpy().ravel()
])

### Validation

In [42]:
# Validation split: feature table plus its labels, with every label lower-cased.
X_validation = pd.read_parquet(f"{data_dir}/processed/validation.parquet")
y_validation = np.array([
    label.lower()
    for label in pd.read_parquet(f"{data_dir}/raw/validation_labels.parquet").to_numpy().ravel()
])

### Testing

In [43]:
# Test split: feature table plus its labels, with every label lower-cased.
X_test = pd.read_parquet(f"{data_dir}/processed/test.parquet")
y_test = np.array([
    label.lower()
    for label in pd.read_parquet(f"{data_dir}/raw/test_labels.parquet").to_numpy().ravel()
])

### Concatenate train and validation

In [44]:
# Fold the validation split into the training data so models are fit on train + validation.
# NOTE: this cell is not idempotent — it overwrites X_train / y_train, so re-running it
# without first re-running the Train cell appends the validation rows a second time.
X_train = pd.concat([X_train, X_validation], ignore_index=True)
# Both label arrays were already lower-cased when they were loaded, so a plain
# concatenation is enough (no second pass over every label).
y_train = np.concatenate([y_train, y_validation])
# Features and labels must stay row-aligned for fitting; fail here rather than inside a model.
assert len(X_train) == len(y_train), "feature/label row counts diverged after concatenation"

In [45]:
def save_metrics(name: str, raw_report, y_true, run_id: int, inference_time, output_dir=None):
    """Append one row of flattened evaluation metrics to ``<dir>/<name>-metrics.csv``.

    Args:
        name: Model name; used for the CSV file name and the ``run_name`` column.
        raw_report: Mapping of label -> metrics. Dict values are flattened into
            ``<label>_<metric>`` columns (dashes in the metric name become
            underscores); scalar values such as ``"accuracy"`` are stored under
            the label itself. Presumably the output of
            ``classification_report(..., output_dict=True)`` — confirm with callers.
        y_true: Ground-truth labels; only their count is recorded (``total_entries``).
        run_id: Run number, combined with ``name`` into ``run_name``.
        inference_time: Elapsed seconds; stored as text such as ``"1.23s"``.
        output_dir: Directory that receives the CSV. Defaults to the
            notebook-level ``out_dir``. Created if it does not exist.
    """
    flat = {}
    for label, metrics in raw_report.items():
        if isinstance(metrics, dict):
            for metric_name, val in metrics.items():
                # replace any dashes so the CSV column names are easier to use as identifiers
                clean_metric = metric_name.replace("-", "_")
                flat[f"{label}_{clean_metric}"] = val
        else:
            # scalar entries (e.g. "accuracy") have no per-metric breakdown
            flat[label] = metrics

    flat["total_entries"] = len(y_true)
    flat["run_name"] = f"{name}-run-{run_id}"
    flat["inference_time"] = f"{inference_time:.2f}s"

    row = pd.DataFrame([flat])

    target_dir = out_dir if output_dir is None else output_dir
    if target_dir:
        # to_csv does not create missing directories
        os.makedirs(target_dir, exist_ok=True)
    metrics_csv = os.path.join(target_dir, f"{name}-metrics.csv")

    # A missing or zero-byte file has no header yet, so write one.
    has_rows = os.path.isfile(metrics_csv) and os.path.getsize(metrics_csv) > 0
    if not has_rows:
        row.to_csv(metrics_csv, index=False, float_format="%.4f")
        return

    # Appending without a header is only safe when the new row is laid out
    # exactly like the existing header, so align (or widen) explicitly.
    header = list(pd.read_csv(metrics_csv, nrows=0).columns)
    if set(row.columns) <= set(header):
        # Same or fewer columns: reorder to the header; absent metrics stay empty.
        row.reindex(columns=header).to_csv(
            metrics_csv, mode="a", header=False, index=False, float_format="%.4f"
        )
    else:
        # The row introduces new columns: rewrite the file with the widened header
        # so earlier rows keep their values under the right names.
        previous = pd.read_csv(metrics_csv)
        combined = pd.concat([previous, row], ignore_index=True, sort=False)
        combined.to_csv(metrics_csv, index=False, float_format="%.4f")

## Cross Validation

In [46]:
def mstd(x):
    """Format ``x`` as "mean ± std" with three decimals, using ``np.mean`` and ``np.std``."""
    center = np.mean(x)
    spread = np.std(x)
    return "{:.3f} ± {:.3f}".format(center, spread)

def cross_validation(model, X=None, y=None, n_splits=5, random_state=13):
    """Run shuffled, stratified k-fold cross-validation and print a score summary.

    Args:
        model: Unfitted estimator handed to ``cross_validate``.
        X: Feature table. Defaults to the notebook-level ``X_train``.
        y: Encoded labels. Defaults to the notebook-level ``y_train_enc``.
        n_splits: Number of folds (default 5).
        random_state: Seed for the fold shuffling (default 13).

    Returns:
        The dict produced by ``cross_validate`` (per-fold ``test_f1_macro``,
        ``test_f1_weighted``, ``test_accuracy``, ``fit_time``, ``score_time``),
        so callers can reuse the scores instead of parsing the printed text.
    """
    # Resolve the defaults at call time: the globals only need to exist when
    # the function runs, not when this cell is executed.
    features = X_train if X is None else X
    targets = y_train_enc if y is None else y

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scoring = {
        "f1_macro": "f1_macro",
        "f1_weighted": "f1_weighted",
        "accuracy": "accuracy",
    }

    cv_res = cross_validate(
        estimator=model,
        X=features,
        y=targets,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
        return_estimator=False,
    )

    print(f"[CV {n_splits}-fold] "
      f"macro-F1: {mstd(cv_res['test_f1_macro'])} | "
      f"weighted-F1: {mstd(cv_res['test_f1_weighted'])} | "
      f"acc: {mstd(cv_res['test_accuracy'])}")

    print(f"fit_time per fold (s): {mstd(cv_res['fit_time'])}, score_time per fold (s): {mstd(cv_res['score_time'])}")

    return cv_res

def results(y_test_enc, y_pred_test, le):
    """Print hold-out metrics: macro/weighted F1, accuracy, and a per-class report.

    Args:
        y_test_enc: Integer-encoded ground-truth labels.
        y_pred_test: Integer-encoded predictions for the same rows.
        le: Fitted label encoder; ``le.classes_`` supplies the class names and
            is assumed to be indexed by the integer codes 0..n-1 (LabelEncoder
            behavior) — confirm if another encoder is passed.
    """
    print("\n[Holdout 20% test]")
    print("macro-F1:", f1_score(y_test_enc, y_pred_test, average="macro"))
    print("weighted-F1:", f1_score(y_test_enc, y_pred_test, average="weighted"))
    print("accuracy:", accuracy_score(y_test_enc, y_pred_test))
    print("\nPer-class report:")
    # Pin the report to every encoder class. Without `labels`, the label set is
    # inferred from the data, and classification_report raises when a class
    # known to the encoder appears in neither y_test_enc nor y_pred_test
    # (inferred label count != len(target_names)).
    class_ids = np.arange(len(le.classes_))
    print(classification_report(y_test_enc, y_pred_test, labels=class_ids, target_names=le.classes_))

### Train Voting Classifier using RFC and ETC

In [47]:
# Encode string labels as integers. The encoder learns its classes from the
# train+validation labels only, so transforming y_test raises if the test
# split contains a label that never occurs there.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [48]:
# Soft-voting ensemble of a random forest ('rf') and an extra-trees forest ('et'),
# both configured with the same size, seed and parallelism.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        (tag, forest_cls(n_estimators=100, random_state=13, n_jobs=-1))
        for tag, forest_cls in (('rf', RandomForestClassifier), ('et', ExtraTreesClassifier))
    ],
)


In [49]:
# Cross-validate on train+validation, then refit on all of it and score the test split.
cross_validation(voting_clf)
y_pred_test = voting_clf.fit(X_train, y_train_enc).predict(X_test)
results(y_test_enc, y_pred_test, le)

[CV 5-fold] macro-F1: 0.655 ± 0.076 | weighted-F1: 0.855 ± 0.047 | acc: 0.873 ± 0.040
fit_time per fold (s): 0.541 ± 0.096, score_time per fold (s): 0.270 ± 0.025

[Holdout 20% test]
macro-F1: 0.833132444897151
weighted-F1: 0.9113178400820063
accuracy: 0.9174311926605505

Per-class report:
                 precision    recall  f1-score   support

            age       1.00      0.50      0.67         4
    case_status       0.80      0.80      0.80         5
contact_setting       1.00      0.50      0.67         2
           date       0.92      0.96      0.94        25
         gender       1.00      1.00      1.00         4
             id       1.00      1.00      1.00         9
       location       0.80      1.00      0.89        12
medical_boolean       0.95      0.97      0.96        40
     occupation       1.00      0.50      0.67         2
        outcome       0.67      0.50      0.57         4
       symptoms       1.00      1.00      1.00         2

       accuracy        

# Random Forest Classifier (RFC)

In [50]:
# Random forest on its own: cross-validate on train+validation, refit on all of it,
# then score the test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=13, n_jobs=-1)
cross_validation(rf_model)
y_pred_test_rf = rf_model.fit(X_train, y_train_enc).predict(X_test)
results(y_test_enc, y_pred_test_rf, le)

[CV 5-fold] macro-F1: 0.640 ± 0.063 | weighted-F1: 0.849 ± 0.036 | acc: 0.869 ± 0.033
fit_time per fold (s): 0.251 ± 0.004, score_time per fold (s): 0.137 ± 0.001

[Holdout 20% test]
macro-F1: 0.8369945034333277
weighted-F1: 0.9197056599967546
accuracy: 0.926605504587156

Per-class report:
                 precision    recall  f1-score   support

            age       1.00      1.00      1.00         4
    case_status       0.67      0.80      0.73         5
contact_setting       1.00      0.50      0.67         2
           date       1.00      0.96      0.98        25
         gender       1.00      1.00      1.00         4
             id       0.90      1.00      0.95         9
       location       0.86      1.00      0.92        12
medical_boolean       0.95      0.97      0.96        40
     occupation       1.00      0.50      0.67         2
        outcome       0.50      0.25      0.33         4
       symptoms       1.00      1.00      1.00         2

       accuracy        

# Extra Trees Classifier (ETC)

In [51]:
# Extra-trees forest on its own: cross-validate on train+validation, refit on all of it,
# then score the test split.
etc_model = ExtraTreesClassifier(n_estimators=100, random_state=13, n_jobs=-1)
cross_validation(etc_model)
y_pred_test_etc = etc_model.fit(X_train, y_train_enc).predict(X_test)
results(y_test_enc, y_pred_test_etc, le)

[CV 5-fold] macro-F1: 0.657 ± 0.083 | weighted-F1: 0.857 ± 0.050 | acc: 0.873 ± 0.044
fit_time per fold (s): 0.146 ± 0.003, score_time per fold (s): 0.119 ± 0.002

[Holdout 20% test]
macro-F1: 0.8303085803085803
weighted-F1: 0.9024095170884162
accuracy: 0.908256880733945

Per-class report:
                 precision    recall  f1-score   support

            age       1.00      0.50      0.67         4
    case_status       0.80      0.80      0.80         5
contact_setting       1.00      0.50      0.67         2
           date       0.89      0.96      0.92        25
         gender       1.00      1.00      1.00         4
             id       1.00      1.00      1.00         9
       location       0.80      1.00      0.89        12
medical_boolean       0.95      0.95      0.95        40
     occupation       1.00      0.50      0.67         2
        outcome       0.67      0.50      0.57         4
       symptoms       1.00      1.00      1.00         2

       accuracy        

### Features

In [None]:
# # 1) Get importances
# rf_imp = rf_model.feature_importances_
# et_imp = etc_model.feature_importances_
#
# # 2) Average them (simple mean)
# avg_imp = (rf_imp + et_imp) / 2.0
#
# # 3) Create a DataFrame for easy sorting/plotting
# feat_names = X_train.columns  # or whatever your feature names are
# imp_df = pd.DataFrame({
#     'feature': feat_names,
#     'rf': rf_imp,
#     'et': et_imp,
#     'avg': avg_imp
# }).sort_values('avg', ascending=False)
#
# # 4) View top 10
# print(imp_df.head(10))

###

In [None]:
# # 5) Plot
# plt.figure(figsize=(8, 6))
# plt.barh(imp_df['feature'].head(10)[::-1],
#          imp_df['avg'].head(10)[::-1])
# plt.xlabel("Mean Feature Importance")
# plt.title("Top 10 Features (averaged RF + ET)")
# plt.tight_layout()
# plt.show()

### Predictions custom

In [None]:
# data = pd.Series(
#     [
#         ["2024-02-30", "2020-03-01", "1982-12-30"],
#         ["104805", "330956", "345609"],
#         ["Male", "Female"],
#         ["men", "women"],
#
#     ],
#     name="values"
# )

In [None]:
#data

In [None]:
# from sherlock.features.preprocessing import extract_features
#
# extract_features(
#     "../check.csv",
#     data
# )
# feature_vectors = pd.read_csv("../check.csv", dtype=np.float32)

In [None]:
#feature_vectors

In [None]:
#train_columns_means = pd.DataFrame(feature_vectors.mean()).transpose()
#feature_vectors.fillna(train_columns_means.iloc[0], inplace=True)

In [None]:
#model = rf_clf

# 1) Ensemble predictions on training set
#y_pred_ensemble = voting_clf.predict(feature_vectors)

#print(y_pred_ensemble)

# 2) RandomForest‐only predictions on training set

### ETC Error analysis

In [None]:

# import numpy as np
# import pandas as pd
#
# # 1. Get probabilities and turn them into labels
# probs = et_clf.predict_proba(X_test)               # shape (n_samples, n_classes)
# preds = predicted_labels(probs, le)                # your helper that maps probs→class
#
# # 2. Make a small DataFrame to hold everything
# df = pd.DataFrame({
#     "true":   y_test,
#     "pred":   preds,
# })
# # 3. Attach each class‐probability as its own column
# for idx, cls in enumerate(le.classes_):
#     df[f"prob_{cls}"] = probs[:, idx]
#
# # 4. Flag the mistakes
# df["correct"] = (df["true"] == df["pred"])
#
# # 5. Slice out just the errors
# errors = df[df["correct"] == False]
#
# # 6. (Optionally) sort by how “confidently wrong” it was
# #    i.e. predicted‐prob minus true‐prob
# errors["confidence_gap"] = errors.apply(
#     lambda row: row[f"prob_{row['pred']}"] - row[f"prob_{row['true']}"], axis=1
# )
# errors = errors.sort_values("confidence_gap", ascending=False)
#
# # 7. Inspect
# print(f"Total errors: {len(errors)} / {len(df)}")
# print(errors[[
#                  "true",
#                  "pred",
#                  "confidence_gap",
#              ] + [f"prob_{cls}" for cls in le.classes_]].head(10))