In [None]:
# Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

### Task 1
1. Load the two datasets.
2. Select a classifier, train a model on dataset A. 
3. Evaluate the generalization error.

In [2]:
# Loading the datasets
X_data_A = pd.read_csv(r"features_dataset_A.csv", header=None, delimiter=';').to_numpy()
y_data_A = pd.read_csv(r"targets_dataset_A.csv", header=None).to_numpy()

X_data_B = pd.read_csv(r"features_dataset_B.csv", header=None, delimiter=';').to_numpy()
y_data_B = pd.read_csv(r"targets_dataset_B.csv", header=None).to_numpy()

# 80/20 train test split
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_data_A, y_data_A, test_size=0.2, random_state=42)

In [9]:
# Training a standard random forest classsifier
rand_forest_model = DecisionTreeClassifier(random_state=42)

# K fold cross validation with
k=100
rand_forest_mean_cross_val = np.mean(cross_val_score(rand_forest_model, X_train_A, y_train_A, cv=k, scoring='roc_auc'))

# Fitting the model and making predictions
rand_forest_model_A = rand_forest_model.fit(X_train_A, y_train_A)
predictions_A = rand_forest_model_A.predict(X_test_A)

# Getting the R2 score on the predictions
rand_forest_rocauc = roc_auc_score(y_true=y_test_A, y_score=predictions_A)

# Printing the values
print(f"*** Evaluation metrics for Random forest classifier ***\n")
print(f"    ROC/AUC = {rand_forest_rocauc:.4f}")
print(f"    Mean cross validation with ROC/AUC scores and k={k} folds = {rand_forest_mean_cross_val:.4f}\n")
print(f"Based on this very basic evaluation, we can see that the model has a good fit since the ROC/AUC scoreas are close to 1 and do not differ much.")

*** Evaluation metrics for Random forest classifier ***

    ROC/AUC = 0.8546
    Mean cross validation with ROC/AUC scores and k=100 folds = 0.8697

Based on this very basic evaluation, we can see that the model has a good fit since the ROC/AUC scoreas are close to 1 and do not differ much.


### Task 2
1. Test the model on the dataset B (production dataset).
2. How well is the trained classifier performing on the production dataset? Is it better or worse than on the generalization error 
of dataset A?

In [4]:
# code here

### Task 3
1. Measure how the features are changing over time (concept drift) with the Kolmogorov-Smirnov Test to see if the distribution of the features remains constant or if they are changing over time. 
    - Your reference distribution can be the 250 first values of dataset A, called FEATURE_SAMPLES_DS_A. 
    - First try to do a K-S Test between FEATURE_SAMPLES_DS_A and for indeces 0 to 250 of the production dataset B. 
    - Do they having the same distribution? Can we reject the null hypothesis?
2. Do a K-S Test between FEATURE_SAMPLES_DS_A and for indeces 250 to 500 of the production dataset B. 
    - Are they having the same distribution? 
    - Can we reject the null hypothesis?
3. Try other windows in the production dataset B. 
    - Estimate where the concept drift starts (at which index in dataset B)? 
    - Which type of concept shift occurs: abrupt gradual or incremental?

In [5]:
# code here

### Task 4
1. Retrain the model with parts of dataset B occurring after the start of the concept drift (which could be estimated in the third task). 
    - How is the generalization error now? Is it improved?