In [1]:
import numpy as np
from typing import List
from tqdm.auto import tqdm
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import det_curve

import sys
sys.path.append("..")
from src.data import Generator
from src.graph.dist import GraphDist

In [2]:
n: List = [25, 100, 500]
d: np.ndarray = np.linspace(0.1, 10, 30)
mc_iter: int = 50

dist_f: List[List[float]] = [[] for _ in range(len(n))]
dist_h: List[List[float]] = [[] for _ in range(len(n))]
results: List = []
targets: List = []
preds: List=  []
for i in tqdm(range(len(n))):
    gen: Generator = Generator(v=3, alpha=1, size=int(n[i]))
    rows : List = []
    for j in range(len(d)):
        rows.append(
            [
                d[j],
                np.mean(
                    [GraphDist(gen.get_f(), d[j]).calc_metric() for _ in range(mc_iter)]
                ),
                1
            ]
        )
        rows.append(
            [
                d[j],
                np.mean(
                    [GraphDist(gen.get_h(), d[j]).calc_metric() for _ in range(mc_iter)]
                ),
                0
            ]
        )
    rows = pd.DataFrame(rows, columns = ["d", "metric", "target"])

    models: dict = {
    "logreg":LogisticRegression(), 
    "decision_tree": DecisionTreeClassifier(), 
    "ctb": CatBoostClassifier(iterations=50, verbose=False)
    }
    iters: int = 7
    X = rows[["d", "metric"]]
    y = rows["target"]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
    for md in models:
        model_results: List = []
        targets.append(y_val.tolist())
        preds.append([])
        for j in range(iters):
            model = models[md].fit(X_train, y_train)
            model_results.append(f1_score(y_val, model.predict(X_val)))
            preds[-1].append(model.predict_proba(X_val)[:, 1].tolist())
        preds[-1] = np.mean(preds[-1], axis=0)
        results.append([md, n[i], f"{round(np.mean(model_results), 3)}±{round(np.std(model_results), 3)}"])

results = pd.DataFrame(results, columns = ["model", "n", "f1_score"])
results

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,model,n,f1_score
0,logreg,25,0.308±0.0
1,decision_tree,25,0.824±0.0
2,ctb,25,0.933±0.0
3,logreg,100,0.4±0.0
4,decision_tree,100,0.778±0.0
5,ctb,100,0.857±0.0
6,logreg,500,0.667±0.0
7,decision_tree,500,0.824±0.0
8,ctb,500,0.933±0.0


In [3]:
results_stat: dict = {}
for i in range(len(n)):
    for j, md in enumerate(models):
        fpr, _, thresholds = det_curve(targets[i*len(models)+j], preds[i*len(models)+j])
        best_f1: float = 0.0 
        for j2, thr in enumerate(thresholds):
            if f1_score(targets[i*len(models)+j], preds[i*len(models)+j]>thr) > best_f1:
                best_f1 = f1_score(targets[i*len(models)+j], preds[i*len(models)+j]>thr)
                results_stat[f"{md}; n={n[i]}"] = 1-fpr[j2]
results_stat

{'logreg; n=25': 0.2857142857142857,
 'decision_tree; n=25': 0.0,
 'ctb; n=25': 0.8571428571428572,
 'logreg; n=100': 0.2857142857142857,
 'decision_tree; n=100': 0.0,
 'ctb; n=100': 0.7142857142857143,
 'logreg; n=500': 0.5714285714285714,
 'decision_tree; n=500': 0.0,
 'ctb; n=500': 0.8571428571428572}