In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import shap
from sklearn.inspection import permutation_importance
import numpy as np
from scipy.stats import mode
from lime.lime_tabular import LimeTabularExplainer

In [None]:
# Lift pandas' display truncation so frames are rendered with every row and
# every column (None means "no limit" for both options).
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

In [None]:
# Load the two yearly datasets from the working directory.
df_2017 = pd.read_csv("2017.csv")
df_2019 = pd.read_csv("2019.csv")

# Fail fast if the target column is missing instead of erroring later in
# prepare_data.
assert "Label" in df_2017.columns, 'column "Label" missing from 2017.csv'
assert "Label" in df_2019.columns, 'column "Label" missing from 2019.csv'

# Every column except the target is a feature. Selecting by name (rather than
# "all but the last column") keeps the target out of the features even if the
# CSV column order changes.
feature_cols = [col for col in df_2017.columns if col != "Label"]

# Both years must expose the same feature columns, otherwise a model trained
# on one year cannot be applied to the other.
assert set(feature_cols) <= set(df_2019.columns), "2019.csv lacks some 2017 feature columns"

In [None]:
def prepare_data(df, feature_cols):
    """Split a frame into model inputs and the ``Label`` target.

    Args:
        df: DataFrame containing the columns in ``feature_cols`` and a
            ``Label`` column.
        feature_cols: Names of the columns to use as features.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``df[feature_cols]`` and ``y`` is
        ``df["Label"]``.
    """
    features, target = df[feature_cols], df["Label"]
    return features, target

# Separate features from the target for both years.
X_2017, y_2017 = prepare_data(df_2017, feature_cols)
X_2019, y_2019 = prepare_data(df_2019, feature_cols)

# Scaling: the scaler is fitted on 2017 (the training year) only and then
# applied unchanged to 2019.
scaler = StandardScaler()
X_2017_scaled = scaler.fit_transform(X_2017)
X_2019_scaled = scaler.transform(X_2019)

# Draw a stratified 0.1% sample of 2019 for fine-tuning: test_size=0.999 leaves
# 0.1% on the "train" side of the split; the other 99.9% is discarded here.
X_ftune, _, y_ftune, _ = train_test_split(X_2019, y_2019, test_size=0.999, stratify=y_2019, random_state=42)
X_ftune_scaled = scaler.transform(X_ftune)

# Model definitions
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    # `use_label_encoder` is no longer passed: the installed XGBoost ignores it
    # and prints 'Parameters: { "use_label_encoder" } are not used.' on each fit.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

In [None]:
# Rows of 2019 that were NOT drawn into the fine-tuning sample. X_ftune is a
# row subset of X_2019 and keeps its index labels, so the index identifies the
# sampled rows. Scoring only on the remaining rows avoids evaluating the
# fine-tuned model on data it was trained on.
eval_mask = ~X_2019.index.isin(X_ftune.index)
y_2019_eval = y_2019[eval_mask]

for name, model in models.items():
    print(f"\n=== {name} ===")

    # Baseline: train on 2017, predict 2019. Predictions stay full-length so
    # they still line up with y_2019; only the report is restricted to the
    # held-out rows so that both reports are comparable.
    model.fit(X_2017_scaled, y_2017)
    y_pred_baseline = model.predict(X_2019_scaled)
    print(">>> Bez fine-tuningu:")
    print(classification_report(y_2019_eval, y_pred_baseline[eval_mask]))

    # "Fine-tuning" on the 0.1% sample of 2019.
    # NOTE(review): none of the models is configured for warm starting, so this
    # second fit() retrains from scratch on the sample alone and discards what
    # was learned from 2017. The numbers below therefore describe "trained on
    # 0.1% of 2019", not a continuation of the 2017 model.
    model.fit(X_ftune_scaled, y_ftune)
    y_pred_finetuned = model.predict(X_2019_scaled)
    print(">>> Po fine-tuningu (0.1% zbioru 2019):")
    print(classification_report(y_2019_eval, y_pred_finetuned[eval_mask]))


=== Logistic Regression ===
>>> Bez fine-tuningu:
              precision    recall  f1-score   support

           0       0.49      0.99      0.66     28017
           1       0.85      0.05      0.09     30000

    accuracy                           0.50     58017
   macro avg       0.67      0.52      0.37     58017
weighted avg       0.68      0.50      0.36     58017

>>> Po fine-tuningu (0.1% zbioru 2019):
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     28017
           1       0.98      0.94      0.96     30000

    accuracy                           0.96     58017
   macro avg       0.96      0.96      0.96     58017
weighted avg       0.96      0.96      0.96     58017


=== Random Forest ===
>>> Bez fine-tuningu:
              precision    recall  f1-score   support

           0       0.48      1.00      0.65     28017
           1       0.22      0.00      0.00     30000

    accuracy                           0.48   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


>>> Bez fine-tuningu:
              precision    recall  f1-score   support

           0       0.48      1.00      0.65     28017
           1       0.12      0.00      0.00     30000

    accuracy                           0.48     58017
   macro avg       0.30      0.50      0.33     58017
weighted avg       0.29      0.48      0.31     58017



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


>>> Po fine-tuningu (0.1% zbioru 2019):
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     28017
           1       0.98      0.97      0.98     30000

    accuracy                           0.97     58017
   macro avg       0.97      0.98      0.97     58017
weighted avg       0.98      0.97      0.97     58017



In [None]:
def prepare_data(df, feature_cols):
    """Return the feature matrix and the ``Label`` target of ``df``.

    Args:
        df: DataFrame containing the columns in ``feature_cols`` and a
            ``Label`` column.
        feature_cols: Names of the columns to use as features.

    Returns:
        A ``(X, y)`` tuple: ``df[feature_cols]`` and ``df["Label"]``.
    """
    # NOTE(review): this repeats the prepare_data already defined in an earlier
    # cell of this notebook; the later definition silently shadows the first.
    return df[feature_cols], df["Label"]

# Separate features from the target for both years.
X_2017, y_2017 = prepare_data(df_2017, feature_cols)
X_2019, y_2019 = prepare_data(df_2019, feature_cols)

# Scaling: in this direction 2019 is the training year, so the scaler is
# fitted on 2019 only and then applied unchanged to 2017.
scaler = StandardScaler()
X_2019_scaled = scaler.fit_transform(X_2019)
X_2017_scaled = scaler.transform(X_2017)

# Draw a stratified 0.1% sample of 2017 for fine-tuning: test_size=0.999 leaves
# 0.1% on the "train" side of the split; the other 99.9% is discarded here.
X_ftune, _, y_ftune, _ = train_test_split(X_2017, y_2017, test_size=0.999, stratify=y_2017, random_state=42)
X_ftune_scaled = scaler.transform(X_ftune)

# Model definitions
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    # `use_label_encoder` is no longer passed: the installed XGBoost ignores it
    # and prints 'Parameters: { "use_label_encoder" } are not used.' on each fit.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# Rows of 2017 that were NOT drawn into the fine-tuning sample. X_ftune is a
# row subset of X_2017 and keeps its index labels, so the index identifies the
# sampled rows. Scoring only on the remaining rows avoids evaluating the
# fine-tuned model on data it was trained on.
eval_mask = ~X_2017.index.isin(X_ftune.index)
y_2017_eval = y_2017[eval_mask]

# Iterate over the models
for name, model in models.items():
    print(f"\n=== {name} ===")

    # Baseline: train on 2019, predict 2017. Predictions stay full-length so
    # they still line up with y_2017; only the report is restricted to the
    # held-out rows so that both reports are comparable.
    model.fit(X_2019_scaled, y_2019)
    y_pred_baseline = model.predict(X_2017_scaled)
    print(">>> Bez fine-tuningu:")
    print(classification_report(y_2017_eval, y_pred_baseline[eval_mask]))

    # "Fine-tuning" on the 0.1% sample of 2017.
    # NOTE(review): none of the models is configured for warm starting, so this
    # second fit() retrains from scratch on the sample alone and discards what
    # was learned from 2019. The numbers below therefore describe "trained on
    # 0.1% of 2017", not a continuation of the 2019 model.
    model.fit(X_ftune_scaled, y_ftune)
    y_pred_finetuned = model.predict(X_2017_scaled)
    print(">>> Po fine-tuningu (0.1% zbioru 2017):")
    print(classification_report(y_2017_eval, y_pred_finetuned[eval_mask]))


=== Logistic Regression ===
>>> Bez fine-tuningu:
              precision    recall  f1-score   support

           0       0.47      0.74      0.57     97718
           1       0.65      0.36      0.47    128027

    accuracy                           0.53    225745
   macro avg       0.56      0.55      0.52    225745
weighted avg       0.57      0.53      0.51    225745

>>> Po fine-tuningu (0.1% zbioru 2017):
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     97718
           1       0.96      1.00      0.98    128027

    accuracy                           0.97    225745
   macro avg       0.98      0.97      0.97    225745
weighted avg       0.98      0.97      0.97    225745


=== Random Forest ===
>>> Bez fine-tuningu:
              precision    recall  f1-score   support

           0       0.43      1.00      0.60     97718
           1       0.18      0.00      0.00    128027

    accuracy                           0.43   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


>>> Bez fine-tuningu:
              precision    recall  f1-score   support

           0       0.49      0.77      0.60     97718
           1       0.68      0.38      0.49    128027

    accuracy                           0.55    225745
   macro avg       0.58      0.58      0.54    225745
weighted avg       0.60      0.55      0.54    225745



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


>>> Po fine-tuningu (0.1% zbioru 2017):
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     97718
           1       0.99      1.00      0.99    128027

    accuracy                           0.99    225745
   macro avg       0.99      0.99      0.99    225745
weighted avg       0.99      0.99      0.99    225745

