# Hackathon

## Set up notebook

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from tabulate import tabulate
from tqdm.auto import tqdm

import wandb


In [None]:
# Apply seaborn's default theme, then override the default figure size and
# resolution for every plot in this notebook.
sns.set_theme()
plt.rcParams.update({"figure.figsize": (8, 4), "figure.dpi": 100})


### Set up Weights & Biases tracking

To make this interesting, we're tracking everyone's progress via W&B. Don't worry 
Please replace `<Your name here>` with your own name (or a nickname).

If you just want to play around without uploading your runs, set `upload` to `False`.

In [None]:
# Used for identifying runs on the W&B dashboard.
# Replace the placeholder with your own name; the check in the next cell
# raises while the placeholder is still in place.
name = "<Your name here>"

# Whether or not to upload runs to W&B.
# True -> W&B runs are started in "online" mode, False -> "offline" mode.
upload = True

In [None]:
# Refuse to continue until the placeholder has been replaced: `name` becomes
# part of every W&B run name, so a missing or blank value makes runs
# impossible to attribute.
if name == "<Your name here>" or not name.strip():
    raise ValueError(
        "Please set `name` in the cell above to your own name (or a nickname) "
        "before running the rest of the notebook."
    )

## Load Data

In [None]:
df = pd.read_csv("datasets/full.csv")

# The first column is used as the target, every remaining column as a feature.
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()

# Feature names, once as a plain list and once as a NumPy array (the array
# form can be indexed with a list of positions or a boolean mask).
all_feature_names = df.columns[1:].to_list()
feature_names = df.columns[1:].to_numpy()

# Human-readable class names.
labels = ["not buggy", "buggy"]

In [None]:
# Split data into 70% train and 30% test subsets.
# The seed is fixed so that re-running the notebook yields the same split and
# scores of different experiments are always measured on the same test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [None]:
# "balanced" weights for the classes present in the training labels.
# Only the commented-out SVC entry in the classifier list refers to this value.
class_weight = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
)

## Defining our models

The value of features can differ greatly between models, which is why we test across a number of common and easy-to-train ones. We also look at the effect of regularization, as this can also affect the impact of feature selection.

* Decision Tree
* Random Forest
* Logistic Regression
* Logistic Regression (with L1 regularization)
* Logistic Regression (with L2 regularization)
* K Nearest Neighbors (with k=15)


_There is no need to change anything here. These are just helper functions to quickly test our data_

In [None]:
# List of (estimator, display name) pairs. The display name is the estimator's
# class name plus an optional suffix that tells apart several configurations
# of the same class.
# NOTE(review): scikit-learn's LogisticRegression uses penalty="l2" by default,
# so the un-suffixed entry and the "_l2" entry look like the same model — confirm.
classifiers = [
    (estimator, type(estimator).__name__ + suffix)
    for estimator, suffix in (
        (DecisionTreeClassifier(), ""),
        (
            RandomForestClassifier(
                n_estimators=200,
                min_samples_leaf=10,
                max_depth=20,
                class_weight="balanced",
            ),
            "",
        ),
        # (SVC(gamma="auto", probability=True, class_weight=class_weight), ""),
        (LogisticRegression(max_iter=1000, class_weight="balanced"), ""),
        (
            LogisticRegression(
                max_iter=1000,
                penalty="l1",
                solver="liblinear",
                class_weight="balanced",
            ),
            "_l1",
        ),
        (
            LogisticRegression(max_iter=1000, penalty="l2", class_weight="balanced"),
            "_l2",
        ),
        (KNeighborsClassifier(n_neighbors=15), ""),
    )
]

## Setting up our testing code

These functions enable us to easily train our set of classifiers using specific features. They also help us keep track of all experiments

*There's no need to change anything here, either*

In [None]:
def test_classifier(clf, feature_indices):
    """Fit ``clf`` on the selected training features and score it on the test set.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Args:
        clf: Classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
            It is (re)fitted in place.
        feature_indices: Column selector applied to ``X_train`` / ``X_test``
            (e.g. integer positions or a boolean mask). ``None`` or an empty
            selector means "use every feature".

    Returns:
        A ``(scores, y_probas)`` tuple. ``scores`` maps "Accuracy", "F1",
        "Precision" and "Recall" to the value measured on the test set;
        ``y_probas`` is the ``predict_proba`` output for the test set.
    """
    if feature_indices is None or len(feature_indices) == 0:
        feature_indices = list(range(X_train.shape[1]))
    clf.fit(X_train[:, feature_indices], y_train)

    y_probas = clf.predict_proba(X_test[:, feature_indices])
    # The columns of predict_proba follow clf.classes_, so translate the
    # winning column back to its label instead of assuming labels are 0..n-1.
    y_pred = clf.classes_[y_probas.argmax(axis=1)]

    scores = {
        "Accuracy": metrics.accuracy_score(y_test, y_pred),
        "F1": metrics.f1_score(y_test, y_pred),
        "Precision": metrics.precision_score(y_test, y_pred),
        "Recall": metrics.recall_score(y_test, y_pred),
    }
    return scores, y_probas


def test_selection(feature_indices, compare=None, log=True, log_wandb=False):
    """Train every classifier on the given features and print a score table.

    Args:
        feature_indices: Column selector forwarded to ``test_classifier``
            (``None`` or empty selects every feature).
        compare: Optional scores dict of the same shape as the return value.
            When truthy, each table cell also shows, in parentheses, the
            difference between the new score and ``compare``'s score.
        log: Currently unused; kept so existing calls keep working.
        log_wandb: Currently unused; kept so existing calls keep working.

    Returns:
        Dict mapping each classifier name (plus "Average") to a dict of
        metric name -> score.
    """
    scores = {}
    for classifier, clf_name in tqdm(classifiers, smoothing=0):
        scores[clf_name], _ = test_classifier(
            classifier, feature_indices=feature_indices
        )

    # Per-metric mean over all classifiers. The right-hand side is fully
    # evaluated before "Average" is inserted, so it does not include itself.
    metric_names = list(next(iter(scores.values())))
    scores["Average"] = {
        metric: np.mean([clf_scores[metric] for clf_scores in scores.values()])
        for metric in metric_names
    }

    table = []
    for row_name, row_scores in scores.items():
        row = {"Classifier": row_name}
        for metric, score in row_scores.items():
            cell = f"{score:0.4f}"
            if compare:
                cell += f" ({score - compare[row_name][metric]:0.4f})"
            row[metric] = cell
        table.append(row)

    print(tabulate(table, headers="keys"))

    return scores


def test_selector(selector, run_suffix=None, wandb=True, **config_kwargs):
    """Evaluate the features chosen by a fitted selector and optionally log them.

    Args:
        selector: Fitted selector exposing ``get_support`` and
            ``get_feature_names_out``.
        run_suffix: Optional suffix passed to ``log_scores`` as the method suffix.
        wandb: When truthy, the scores are handed to ``log_scores``.
        **config_kwargs: Extra entries forwarded to ``log_scores``.
    """
    support = selector.get_support()
    selected_names = selector.get_feature_names_out(feature_names)
    n_selected = sum(support)
    n_total = len(feature_names)
    print(f"Selected {n_selected}/{n_total} features: \n{selected_names}")

    # Scores are shown relative to the module-level baseline run.
    scores = test_selection(support, compare=baseline)

    if not wandb:
        return

    method_name = selector.__class__.__name__
    log_scores(scores, selected_names, method_name, run_suffix, **config_kwargs)


def log_scores(scores, used_features, method_name, method_suffix=None, **config_kwargs):
    """Record one experiment as a Weights & Biases run.

    Reads the module-level ``name``, ``upload`` and ``feature_names``.

    Args:
        scores: Dict of classifier name -> dict of metric name -> score.
        used_features: The selected features, either all integer positions
            into ``feature_names`` or all feature names.
        method_name: Name of the selection method.
        method_suffix: Optional suffix appended to ``method_name`` with "_".
        **config_kwargs: Extra entries stored in the run's config.

    Raises:
        Exception: If ``used_features`` is neither all integers nor all strings.
    """
    print(used_features)
    # NumPy integer scalars (e.g. from np.arange / np.flatnonzero) are valid
    # positions too, so accept them alongside plain ints.
    if all(isinstance(x, (int, np.integer)) for x in used_features):
        print("Detected features as integer indices")
        used_feature_names = feature_names[used_features]
    elif all(isinstance(x, str) for x in used_features):
        print("Detected features as string indices")
        used_feature_names = used_features
    else:
        raise Exception(
            "Whoops, expected used_features to be a list of names or indices"
        )

    selected = set(used_feature_names)
    used_features_tbl = {feature: feature in selected for feature in feature_names}
    timestamp = datetime.now().strftime("%H:%M:%S")
    method = method_name + ("_" + method_suffix if method_suffix else "")
    wandb.init(
        project="sogeti-hackathon-feature-selection",
        entity="vincentbrouwers",
        name="-".join([name, method, timestamp]),
        anonymous="allow",
        # W&B tags are a list of strings; tag the run with the participant's name.
        tags=[name],
        mode="online" if upload else "offline",
        config=dict(
            features=used_features_tbl,
            name=name,
            method=method,
            method_name=method_name,
            method_suffix=method_suffix,
            **config_kwargs,
        ),
    )

    # Flatten {classifier: {metric: score}} into "<classifier>_<metric>" keys.
    wscores = {
        f"{clf}_{mtr}": score
        for clf, mtrs in scores.items()
        for mtr, score in mtrs.items()
    }
    wandb.log(
        {
            "feature_count": len(used_feature_names),
            "used_features": wandb.Table(
                columns=["feature", "used"],
                # Just temporary until I have an actual method of filtering features
                data=list(used_features_tbl.items()),
            ),
            # "roc": wandb.plot.roc_curve(y_test, y_probas, labels),
            # "pr": wandb.plot.pr_curve(y_test, y_probas, labels),
            **wscores,
        }
    )
    wandb.finish(quiet=True)


## Feature Selection

In [None]:
# Reference scores with every feature enabled; later experiments are reported
# relative to these.
baseline = test_selection(feature_indices=None, compare=None)
log_scores(scores=baseline, used_features=feature_names, method_name="Baseline")

### Filtering methods

#### Variance threshold

A simple method to filter out superfluous features is to remove the ones with a low variance. The idea here is that these features offer relatively little information.

*Note: Variance depends on the magnitude of our values, and our data is not normalized. It's not really possible to set one threshold for all features.*

---

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit the low-variance filter on the training data only.
var = VarianceThreshold(threshold=0.3)
var.fit(X_train)

# The threshold is stored alongside the run so it shows up in the W&B config.
test_selector(var, threshold=var.threshold)

#### Statistical correlation threshold

Another method of supervised feature filtering is to calculate correlation statistics between each feature and the label(s).

We use the chi-squared test to measure the label's (positive) dependence on features and select the best ones with various thresholding methods.

---

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html<br/>
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html

In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, r_regression, mutual_info_classif

# Each run also stores its hyperparameter (k / percentile) in the W&B config,
# mirroring how the variance-threshold run stores its threshold.

# Select (k=)10 most correlated features
selector = SelectKBest(chi2, k=10).fit(X_train, y_train)
test_selector(selector, "chi2", k=selector.k)

# Select the features that hit the 50th percentile (median score or better)
selector = SelectPercentile(chi2, percentile=50).fit(X_train, y_train)
test_selector(selector, "chi2", percentile=selector.percentile)

# # Select (k=)10 most correlated features
# selector = SelectKBest(r_regression, k=10).fit(X_train, y_train)
# test_selector(selector, "r_regression", k=selector.k)

# # Select the features that hit the 50th percentile (median score or better)
# selector = SelectPercentile(r_regression, percentile=50).fit(X_train, y_train)
# test_selector(selector, "r_regression", percentile=selector.percentile)

# # Select (k=)10 most correlated features
# selector = SelectKBest(mutual_info_classif, k=10).fit(X_train, y_train)
# test_selector(selector, "mutual_info_classif", k=selector.k)

# # Select the features that hit the 50th percentile (median score or better)
# selector = SelectPercentile(mutual_info_classif, percentile=50).fit(X_train, y_train)
# test_selector(selector, "mutual_info_classif", percentile=selector.percentile)

In [None]:
# Bare expression: as the last statement of a notebook cell this displays the
# most recently assigned selector.
selector

#### Model-specific feature importance

Some models allow us to directly see the contribution of each feature. This allows us to easily remove the fields that our model extracts the least amount of information from. We always test the effect of our selection methods on multiple model types. Does the type of reference model affect our testing models differently?

---

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

In [None]:
from sklearn.feature_selection import SelectFromModel

plt.figure(figsize=(10, 6))

# Alternative reference model: random forest feature importances.
# clf1 = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
# plt.barh(feature_names, clf1.feature_importances_)
# selector = SelectFromModel(clf1, threshold="1.5 * mean", prefit=True)

# Reference model: logistic regression; plot one coefficient per feature.
clf2 = LogisticRegression(max_iter=1000).fit(X_train, y_train)
plt.barh(feature_names, clf2.coef_.flatten())
selector = SelectFromModel(clf2, threshold="1 * mean", prefit=True)

# Store the threshold expression in the W&B config, mirroring how the
# variance-threshold run stores its threshold.
test_selector(selector, threshold=selector.threshold)

### Wrapper

#### Recursive Feature Elimination (RFE)


RFE works in a similar fashion to the previous method, namely that it uses a model's built-in feature significance values to filter out redundant features. Where RFE differs from the "naive" filtering approach is that it only removes one feature at a time, after which the entire model is retrained. Removal of correlated or inter-dependent features can change the distribution of significance of the remaining features, which this approach accounts for.


---

**Example:**

A simple example of how the result of RFE differs from "naive filtering" is when two features are 100% correlated. Each of them may be fairly meaningful on its own, but when both are present, their contribution has to be shared between them. This gives each of them a lower feature importance.

Let's say we have a model with the following feature importances:

`a=15%`, `b=20%`, `c=50%`, `d=15%`, where `a`, `b`, and `c` are fully independent of each other, but `d` is 100% correlated with `a`.

Naïvely removing the two least contributing features (`a` and `d`) would leave us with `b=35%` and `c=65%`. If we instead first eliminate only `d` (`a` is equally valid), a new model might give us these importances:

`a=30%`, `b=20%`, `c=50%`

This time, `b` will be purged, leaving us with

`a=40%`, `c=60%`

---

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html


In [None]:
from sklearn.feature_selection import RFE

clf = LogisticRegression(max_iter=1000)
selector = RFE(clf, n_features_to_select=5, step=1, verbose=1)
selector = selector.fit(X_train, y_train)

# Store the RFE settings in the W&B config, mirroring how the
# variance-threshold run stores its threshold.
test_selector(
    selector,
    n_features_to_select=selector.n_features_to_select,
    step=selector.step,
)

#### Sequential Feature Selection

This Sequential Feature Selector adds (forward selection) or removes (backward selection) features until the desired amount of features is reached. At each stage, it produces candidate feature sets that include (forward) or exclude (backward) one feature compared to the previous stage. The candidate sets are scored by training new models on them and only the best scoring one is kept.

Forwards and backwards do not have to yield the same feature sets, though none is necessarily better. Their performance can differ depending on the amount and size of models that need to be trained to reach the desired amount of features.

---

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

direction = "forward"
# direction = "backward"

clf = LogisticRegression(max_iter=1000)
selector = SequentialFeatureSelector(
    clf,
    n_features_to_select=5,
    direction=direction,
    scoring="f1",
    n_jobs=-1,
)

selector = selector.fit(X_train, y_train)

# Store the search direction and target size in the W&B config; without them
# a forward and a backward run cannot be told apart afterwards.
test_selector(
    selector,
    direction=direction,
    n_features_to_select=selector.n_features_to_select,
)

### Own selection

Now try it for yourself. Is there another method of feature selection you would like to try? Do you think combining other methods might help?

In [None]:
# Positions (into `feature_names`) of the features to keep: 0 through 22.
# Replace this with your own list of positions to try a custom selection.
used_features = list(range(23))

scores = test_selection(used_features, compare=baseline)

# Uncomment this as soon as you want to upload your run to Weights&Biases.
# log_scores(scores, used_features, "Custom")
