In [None]:
import mlflow
import pandas as pd
import matplotlib.pyplot as plt
import json
import warnings
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, classification_report
from mlflow.models import infer_signature
from sklearn.dummy import DummyClassifier
import xgboost as xgb
from xgboost import plot_importance
from sklearn.linear_model import LogisticRegression
from challenge.constants import MLFLOW_EXPERIMENT_NAME, MLFLOW_TRACKING_URI, RANDOM_STATE
from challenge.pipeline.etl import etl_pipeline, calculate_target, get_min_diff

warnings.filterwarnings('ignore')

In [None]:
# Point the MLflow client at the tracking server configured in challenge.constants.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

In [None]:
# Select the experiment that every run in this notebook is logged under.
# If selecting it fails, create the experiment and then activate it explicitly:
# create_experiment only returns the new experiment's id, it does not make it
# the active experiment, so without the second call runs would be logged to
# whatever experiment MLflow uses by default.
try:
    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
except Exception as e:
    print(e)
    experiment_id = mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
    mlflow.set_experiment(experiment_id=experiment_id)

## 0. Load Data

In [None]:
# Read the raw dataset (path is relative to the notebook's folder) and
# print a summary of its columns, dtypes and non-null counts.
data = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')
data.info()

## 4. Training

### 4.a. Data Split (Training and Validation)

In [None]:
# Derive the row-wise 'min_diff' column with the project helper, then the
# 'delay' target column. Both are added to `data` in place.
data['min_diff'] = data.apply(func=get_min_diff, axis='columns')
data['delay'] = calculate_target(data, 'delay')

In [None]:
# Preview the first rows, now including the derived columns.
data.head(n=5)

In [None]:
# Columns kept for modelling; the list also carries the 'delay' target.
cols_features = [
    'OPERA',
    'MES',
    'TIPOVUELO',
    'SIGLADES',
    'DIANOM',
    'delay',
]

In [None]:
# Reproducibly shuffled copy of the selected columns.
# NOTE(review): `training_data` is not read again in the visible cells — confirm it is still needed.
training_data = shuffle(data.loc[:, cols_features], random_state=RANDOM_STATE)

In [None]:
# Collect the distinct values observed in each selected column.
unique_vals_per_feature = {
    name: list(data[name].unique()) for name in cols_features
}

# Cast the MES and delay values to plain Python ints before serialising
# (presumably they are numpy integers, which json cannot encode — confirm).
for numeric_col in ("MES", "delay"):
    unique_vals_per_feature[numeric_col] = [
        int(value) for value in unique_vals_per_feature[numeric_col]
    ]

# Persist the value catalogue as JSON.
with open("../data/interim/unique_features.json", 'w') as out_file:
    json.dump(unique_vals_per_feature, out_file)

In [None]:
# Run the project ETL helper, which returns the model inputs and the target as a pair.
# NOTE(review): presumably 'delay' names the target column — confirm against challenge.pipeline.etl.
features, target = etl_pipeline(data, 'delay')

In [None]:
# Preview the feature matrix produced by the ETL step.
features.head(n=5)

In [None]:
# Preview the target produced by the ETL step.
target.head(n=5)

In [None]:
# Stratified hold-out split: 33% of the rows go to the test set, and the
# class proportions of `target` are preserved on both sides.
split_kwargs = dict(test_size=0.33, random_state=RANDOM_STATE, stratify=target)
x_train, x_test, y_train, y_test = train_test_split(features, target, **split_kwargs)

In [None]:
# Sanity check: report the (rows, columns) shape of each side of the split.
print(f"train shape: {x_train.shape} | test shape: {x_test.shape}")

In [None]:
# Class distribution of the training target, in percent.
# `normalize=True` is spelled out: the previous positional '%' only worked
# because any non-empty string is truthy.
y_train['delay'].value_counts(normalize=True) * 100

In [None]:
# Class distribution of the test target, in percent.
# `normalize=True` is spelled out: the previous positional '%' only worked
# because any non-empty string is truthy.
y_test['delay'].value_counts(normalize=True) * 100

In [None]:
# Persist the names and order of the training columns as JSON.
feature_columns_payload = {"feature_columns": list(x_train.columns)}
with open("../data/interim/feature_columns.json", 'w') as out_file:
    json.dump(feature_columns_payload, out_file)

### 4.b. Model Selection

In [None]:
# Infer the model input/output schema from the test split; it is attached to every logged model.
signature = infer_signature(model_input=x_test, model_output=y_test)

In [None]:
# Evaluation frame for mlflow.evaluate: a copy of the test features with the
# target attached under the "label" column.
eval_data = x_test.assign(label=y_test)
eval_data.head()

In [None]:
# Training frame for mlflow.evaluate: a copy of the training features with the
# target attached under the "label" column.
train_data = x_train.assign(label=y_train)
train_data.head()

In [None]:
# Keyword arguments shared by every mlflow.evaluate call on the test frame;
# metrics are logged with the "evaluation_" prefix.
eval_config = dict(
    targets="label",
    model_type="classifier",
    evaluator_config=dict(
        explainability_algorithm='permutation',
        metric_prefix='evaluation_',
    ),
)

In [None]:
# Keyword arguments shared by every mlflow.evaluate call on the training frame;
# metrics are logged with the "training_" prefix.
train_config = dict(
    targets="label",
    model_type="classifier",
    evaluator_config=dict(
        explainability_algorithm='permutation',
        metric_prefix='training_',
    ),
)

#### 4.b.0 Baseline Model

In [None]:
# Fit and log a uniform-random DummyClassifier; its URI is passed as the
# baseline to every candidate evaluation below.
desc = run_name = "Baseline Model"
with mlflow.start_run(description=desc, run_name=run_name):
    baseline_model = DummyClassifier(strategy="uniform", random_state=RANDOM_STATE)
    baseline_model.fit(x_train, y_train)

    baseline_model_info = mlflow.sklearn.log_model(
        baseline_model, "model", signature=signature
    )
    baseline_model_uri = baseline_model_info.model_uri

mlflow.end_run()

#### 4.b.i. XGBoost

In [None]:
# Train an XGBoost classifier on all features, log it, and evaluate it against
# the baseline on the test frame first and the training frame second.
desc = "XGBoost"
with mlflow.start_run(description=desc):
    xgb_model = xgb.XGBClassifier(learning_rate=0.01, random_state=RANDOM_STATE)
    xgb_model.fit(x_train, y_train)

    logged_model = mlflow.sklearn.log_model(
        xgb_model, "candidate_model", signature=signature
    )
    candidate_model_uri = logged_model.model_uri

    for dataset, config in ((eval_data, eval_config), (train_data, train_config)):
        mlflow.evaluate(
            candidate_model_uri,
            dataset,
            baseline_model=baseline_model_uri,
            **config
        )
mlflow.end_run()

In [None]:
# Predict on the test split and map each value to 0/1 with a 0.5 cut-off.
# NOTE(review): XGBClassifier.predict presumably already returns class labels,
# which would make the cut-off a no-op — confirm before removing it.
xgboost_y_preds = [int(raw_pred > 0.5) for raw_pred in xgb_model.predict(x_test)]

In [None]:
# Confusion matrix of the XGBoost predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=xgboost_y_preds)

In [None]:
# Per-class precision / recall / f1 of the XGBoost model, shown as a table.
pd.DataFrame(
    classification_report(y_true=y_test, y_pred=xgboost_y_preds, output_dict=True)
)

#### 4.b.ii. Logistic Regression

In [None]:
# Train a logistic regression on all features, log it, and evaluate it against
# the baseline on the test frame first and the training frame second.
desc = "Logistic Regression"
with mlflow.start_run(description=desc):
    reg_model = LogisticRegression(n_jobs=-1, random_state=RANDOM_STATE)
    reg_model.fit(x_train, y_train)

    logged_model = mlflow.sklearn.log_model(
        reg_model, "candidate_model", signature=signature
    )
    candidate_model_uri = logged_model.model_uri

    for dataset, config in ((eval_data, eval_config), (train_data, train_config)):
        mlflow.evaluate(
            candidate_model_uri,
            dataset,
            baseline_model=baseline_model_uri,
            **config
        )
mlflow.end_run()

In [None]:
# Logistic regression predictions for the test split.
reg_y_preds = reg_model.predict(X=x_test)

In [None]:
# Confusion matrix of the logistic regression predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=reg_y_preds)

In [None]:
# Per-class precision / recall / f1 of the logistic regression, shown as a table.
pd.DataFrame(
    classification_report(y_true=y_test, y_pred=reg_y_preds, output_dict=True)
)

## 5. Data Analysis: Third Sight

### Feature Importance

In [None]:
# Plot the feature importance of the first XGBoost model.
# The axes is created here and handed to plot_importance explicitly: without
# `ax`, plot_importance draws on a brand-new figure of its own, which left the
# 10x5 figure empty and the requested size unused.
fig, ax = plt.subplots(figsize=(10, 5))
plot_importance(xgb_model, ax=ax)
plt.show()

In [None]:
# Feature subset used for the second round of training.
# NOTE(review): presumably transcribed by hand from the importance plot above —
# re-check the list whenever the data or the model changes.
top_10_features = [
    'OPERA_Latin American Wings',
    'MES_7',
    'MES_10',
    'OPERA_Grupo LATAM',
    'MES_12',
    'TIPOVUELO_I',
    'MES_4',
    'MES_11',
    'OPERA_Sky Airline',
    'OPERA_Copa Air',
]

### Data Balance

In [None]:
# Count the training rows of each class and derive the negative/positive ratio
# used for class balancing.
# The counts are taken from the 'delay' column: `y_train` is a DataFrame, and
# masking a DataFrame with a boolean DataFrame keeps every row (non-matching
# cells become NaN), so len(y_train[y_train == 0]) always returned len(y_train)
# and the ratio was always 1.0.
train_labels = y_train['delay']
n_y0 = int((train_labels == 0).sum())
n_y1 = int((train_labels == 1).sum())
scale = n_y0/n_y1
print(scale)

## 6. Training with Improvement

### 6.a. Data Split

In [None]:
# Same stratified 33% hold-out split as before, restricted to the top-10 feature columns.
split_kwargs = dict(test_size=0.33, random_state=RANDOM_STATE, stratify=target)
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    features[top_10_features], target, **split_kwargs
)

### 6.b. Model Selection

In [None]:
# Re-infer the model schema for the reduced feature set (rebinds `signature`).
signature = infer_signature(model_input=x_test2, model_output=y_test2)

In [None]:
# Rebuild the evaluation frame from the reduced test features plus the
# "label" column (rebinds `eval_data`).
eval_data = x_test2.assign(label=y_test2)
eval_data.head()

In [None]:
# Rebuild the training frame from the reduced training features plus the
# "label" column (rebinds `train_data`).
train_data = x_train2.assign(label=y_train2)
train_data.head()

#### 6.b.0 Baseline Model

In [None]:
# Fit and log a uniform-random DummyClassifier on the reduced feature set; it
# replaces `baseline_model_uri` for the evaluations that follow.
desc = run_name = "Baseline Model Top 10 features"
with mlflow.start_run(description=desc, run_name=run_name):
    baseline_model = DummyClassifier(strategy="uniform", random_state=RANDOM_STATE)
    baseline_model.fit(x_train2, y_train2)

    baseline_model_info = mlflow.sklearn.log_model(
        baseline_model, "model", signature=signature
    )
    baseline_model_uri = baseline_model_info.model_uri

mlflow.end_run()

#### 6.b.i. XGBoost with Feature Importance and with Balance

In [None]:
# XGBoost on the top-10 features, with the positive class re-weighted by
# `scale`; logged and evaluated against the baseline on test, then train.
desc = "XGBoost with Feature Importance and with Balance"
with mlflow.start_run(description=desc):
    xgb_model_2 = xgb.XGBClassifier(
        scale_pos_weight=scale,
        learning_rate=0.01,
        random_state=RANDOM_STATE,
    )
    xgb_model_2.fit(x_train2, y_train2)

    logged_model = mlflow.sklearn.log_model(
        xgb_model_2, "candidate_model", signature=signature
    )
    candidate_model_uri = logged_model.model_uri

    for dataset, config in ((eval_data, eval_config), (train_data, train_config)):
        mlflow.evaluate(
            candidate_model_uri,
            dataset,
            baseline_model=baseline_model_uri,
            **config
        )
mlflow.end_run()

In [None]:
# Test-split predictions of the balanced top-10 XGBoost model.
xgboost_y_preds_2 = xgb_model_2.predict(X=x_test2)

In [None]:
# Confusion matrix of the balanced top-10 XGBoost model on the test split.
confusion_matrix(y_true=y_test2, y_pred=xgboost_y_preds_2)

In [None]:
# Per-class precision / recall / f1 of the balanced top-10 XGBoost model, shown as a table.
pd.DataFrame(
    classification_report(y_true=y_test2, y_pred=xgboost_y_preds_2, output_dict=True)
)

#### 6.b.ii. XGBoost with Feature Importance but without Balance

In [None]:
# XGBoost on the top-10 features without class re-weighting; logged and
# evaluated against the baseline on test, then train.
desc = "XGBoost with Feature Importance but without Balance"
with mlflow.start_run(description=desc):
    xgb_model_3 = xgb.XGBClassifier(learning_rate=0.01, random_state=RANDOM_STATE)
    xgb_model_3.fit(x_train2, y_train2)

    logged_model = mlflow.sklearn.log_model(
        xgb_model_3, "candidate_model", signature=signature
    )
    candidate_model_uri = logged_model.model_uri

    for dataset, config in ((eval_data, eval_config), (train_data, train_config)):
        mlflow.evaluate(
            candidate_model_uri,
            dataset,
            baseline_model=baseline_model_uri,
            **config
        )
mlflow.end_run()

In [None]:
# Test-split predictions of the unbalanced top-10 XGBoost model.
xgboost_y_preds_3 = xgb_model_3.predict(X=x_test2)

In [None]:
# Confusion matrix of the unbalanced top-10 XGBoost model on the test split.
confusion_matrix(y_true=y_test2, y_pred=xgboost_y_preds_3)

In [None]:
# Per-class precision / recall / f1 of the unbalanced top-10 XGBoost model, shown as a table.
pd.DataFrame(
    classification_report(y_true=y_test2, y_pred=xgboost_y_preds_3, output_dict=True)
)

#### 6.b.iii. Logistic Regression with Feature Importance and with Balance

In [None]:
# Logistic regression on the top-10 features with explicit class weights:
# class 1 is weighted by n_y0/len(y_train) and class 0 by n_y1/len(y_train).
# Logged and evaluated against the baseline on test, then train.
desc = "Logistic Regression with Feature Importante and with Balance"
with mlflow.start_run(description=desc):
    class_weights = {1: n_y0/len(y_train), 0: n_y1/len(y_train)}
    reg_model_2 = LogisticRegression(
        class_weight=class_weights,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    reg_model_2.fit(x_train2, y_train2)

    logged_model = mlflow.sklearn.log_model(
        reg_model_2, "candidate_model", signature=signature
    )
    candidate_model_uri = logged_model.model_uri

    for dataset, config in ((eval_data, eval_config), (train_data, train_config)):
        mlflow.evaluate(
            candidate_model_uri,
            dataset,
            baseline_model=baseline_model_uri,
            **config
        )
mlflow.end_run()

In [None]:
# Test-split predictions of the class-weighted top-10 logistic regression.
reg_y_preds_2 = reg_model_2.predict(X=x_test2)

In [None]:
# Confusion matrix of the class-weighted top-10 logistic regression on the test split.
confusion_matrix(y_true=y_test2, y_pred=reg_y_preds_2)

In [None]:
# Per-class precision / recall / f1 of the class-weighted top-10 logistic regression, shown as a table.
pd.DataFrame(
    classification_report(y_true=y_test2, y_pred=reg_y_preds_2, output_dict=True)
)

#### 6.b.iv. Logistic Regression with Feature Importance but without Balance

In [None]:
# Logistic regression on the top-10 features without class weights; logged and
# evaluated against the baseline on test, then train.
desc = "Logistic Regression with Feature Importante but without Balance"
with mlflow.start_run(description=desc):
    reg_model_3 = LogisticRegression(n_jobs=-1, random_state=RANDOM_STATE)
    reg_model_3.fit(x_train2, y_train2)

    logged_model = mlflow.sklearn.log_model(
        reg_model_3, "candidate_model", signature=signature
    )
    candidate_model_uri = logged_model.model_uri

    for dataset, config in ((eval_data, eval_config), (train_data, train_config)):
        mlflow.evaluate(
            candidate_model_uri,
            dataset,
            baseline_model=baseline_model_uri,
            **config
        )
mlflow.end_run()

In [None]:
# Test-split predictions of the unweighted top-10 logistic regression.
reg_y_preds_3 = reg_model_3.predict(X=x_test2)

In [None]:
# Confusion matrix of the unweighted top-10 logistic regression on the test split.
confusion_matrix(y_true=y_test2, y_pred=reg_y_preds_3)

In [None]:
# Per-class precision / recall / f1 of the unweighted top-10 logistic regression, shown as a table.
pd.DataFrame(
    classification_report(y_true=y_test2, y_pred=reg_y_preds_3, output_dict=True)
)

## 7. Data Science Conclusions

Looking at the results of the six trained models, we can conclude:
- There is no noticeable difference in results between XGBoost and LogisticRegression.
- Reducing the features to the 10 most important ones does not decrease the model's performance.
- Balancing the classes improves the model's performance, since it increases the recall of class "1".

**Therefore, the model taken to production should be one trained with the top 10 features and class balancing — but which one?**