## Train classifiers

If training data has been extracted from the source datasets by running the `00a-extract_training_data.ipynb` and `00b-extract_grid_data.ipynb` notebooks, set the `use_extracted_data` variable below to `True` to use this dataset instead of the pre-prepared training data from the [Zenodo repository](https://zenodo.org/record/8157691).

In [4]:
# Selects the training data source used later in this notebook:
#   True  -> read "training_data.csv" from the "extracted_data" directory
#            (the output of the extraction notebooks mentioned above)
#   False -> read it from the "prepared_data" directory, after
#            check_prepared_data has been called on that directory
use_extracted_data = True


In [1]:
import os
import time
import warnings
from datetime import timedelta

import pandas as pd
from joblib import dump
from pulearn.bagging import BaggingPuClassifier
from sklearn.base import clone
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC

from lib.check_files import check_prepared_data
from lib.pu import (
    BASE_MODELS,
    PU_PARAMS,
    UNUSED_COLUMNS,
    downsample_unlabelled,
)

# Suppress FutureWarning for some versions of Scikit-learn
# NOTE: both settings below filter on the warning category only, so
# FutureWarning raised by any library (not just Scikit-learn) is hidden.
# The %env magic sets PYTHONWARNINGS in the kernel's environment; presumably
# this is so that Python worker processes started later (e.g. for parallel
# training) apply the same filter -- TODO confirm
%env PYTHONWARNINGS=ignore::FutureWarning

# Ignore FutureWarning within the current kernel process
warnings.simplefilter("ignore", FutureWarning)




In [2]:
# Seed shared by every stochastic component so that runs are repeatable
random_seed = 1234

# Number of parallel jobs for training; the N_JOBS environment variable
# takes precedence over the default of 8
n_jobs = int(os.environ.get("N_JOBS", 8))

# Keyword arguments for each model component
imputer_params = dict(random_state=random_seed, add_indicator=False)
rf_params = dict(random_state=random_seed)

# PU_PARAMS is applied last, so any key it shares with the two entries
# set here takes the PU_PARAMS value
pu_params = dict(n_jobs=n_jobs, random_state=random_seed)
pu_params.update(PU_PARAMS)

svm_params = dict(kernel="rbf", probability=True, random_state=random_seed)


In [5]:
# Input: directory holding the training data
data_dir = "extracted_data" if use_extracted_data else "prepared_data"
if not use_extracted_data:
    # Only the pre-prepared data directory is passed to check_prepared_data
    check_prepared_data(data_dir, verbose=True)
data_filename = os.path.join(data_dir, "training_data.csv")

# Output: one sub-directory per classifier type under outputs/Americas
output_dir = os.path.join("outputs", "Americas")
pu_dir = os.path.join(output_dir, "PU")
svm_dir = os.path.join(output_dir, "SVM")
for directory in (output_dir, pu_dir, svm_dir):
    os.makedirs(directory, exist_ok=True)

pu_filename = os.path.join(pu_dir, "classifier.joblib")
svm_filename = os.path.join(svm_dir, "classifier.joblib")


In [6]:
# Load the training data, keeping only samples from the Americas
data = pd.read_csv(data_filename).loc[
    lambda df: df["region"].isin({"North America", "South America"})
]

# Set the negatives aside and downsample the remaining samples
# (equal number of positive and unlabelled samples)
negatives = data[data["label"].eq("negative")]
non_negatives = data[data["label"].ne("negative")]
non_negatives_downsampled = downsample_unlabelled(
    non_negatives,
    random_state=random_seed,
)

# `combined` keeps every column (used for per-region grouping);
# `cleaned` has the columns listed in UNUSED_COLUMNS removed
combined = pd.concat([non_negatives_downsampled, negatives])
cleaned = combined.drop(columns=list(UNUSED_COLUMNS), errors="ignore")
del non_negatives, non_negatives_downsampled

print(combined.groupby(["region", "label"]).size())


region         label     
North America  negative       57
               positive      170
               unlabelled    158
South America  negative      979
               positive      126
               unlabelled    138
dtype: int64


### Train the PU classifier

#### All data

In [8]:
# Wrangle training data: positive vs. unlabelled samples only
train_pu = cleaned[cleaned["label"].isin({"positive", "unlabelled"})]
x_pu = train_pu.drop(columns="label")
# Encode labels with an explicit mapping and cast. Series.replace relied on
# an implicit object -> integer downcast that pandas has deprecated (the
# corresponding FutureWarning is silenced in this notebook).
y_pu = train_pu["label"].map({"positive": 1, "unlabelled": 0}).astype("int64")

# Use a random forest as the base classifier
base_model = clone(BASE_MODELS["randomforest"])
base_model.set_params(**rf_params)

# Impute missing values and scale before training
# (an earlier form of this call passed `base_estimator=` instead of
# `estimator=`; presumably required by older pulearn releases)
pu_model = make_pipeline(
    IterativeImputer(**imputer_params),
    RobustScaler(),
    BaggingPuClassifier(estimator=base_model, **pu_params),
)
pu_model.set_output(transform="pandas")

# Train model
t0 = time.time()
pu_model.fit(x_pu, y_pu)

# Save to file (the reported duration includes the time taken to save)
dump(pu_model, pu_filename, compress=True)
duration = timedelta(seconds=time.time() - t0)
print(f"Model training time: {duration}")


Model training time: 0:00:03.587187


#### Separate regions

In [9]:
# Train and save one PU classifier per region
for region, subset in combined.groupby("region"):
    region = str(region)
    # Regions with fewer than 50 positive samples are not trained
    if (subset["label"] == "positive").sum() < 50:
        print(f"Skipping region: {region}")
        continue
    print(f"Region: {region}")
    # File-name-friendly region name, e.g. "North America" -> "north_america"
    r = "_".join(region.lower().split())
    output_subset = os.path.join(
        pu_dir,
        f"classifier_{r}.joblib",
    )

    subset = subset[subset["label"].isin({"positive", "unlabelled"})]
    subset = subset.drop(columns=list(UNUSED_COLUMNS), errors="ignore")
    print(subset.groupby("label").size())
    x_pu_subset = subset.drop(columns="label")
    # Explicit mapping and cast instead of Series.replace, whose implicit
    # object -> integer downcast is deprecated in pandas
    y_pu_subset = (
        subset["label"].map({"positive": 1, "unlabelled": 0}).astype("int64")
    )

    # Fresh, unfitted copy of the pipeline with the same hyperparameters
    pu_model_subset = clone(pu_model)
    t0 = time.time()
    pu_model_subset.fit(x_pu_subset, y_pu_subset)
    # Save the regional model (not the all-data `pu_model`) to the regional file
    dump(pu_model_subset, output_subset, compress=True)
    duration = timedelta(seconds=time.time() - t0)
    print(f"Model training time: {duration}")

    print("")


Region: North America
label
positive      170
unlabelled    158
dtype: int64
Model training time: 0:00:02.289679

Region: South America
label
positive      126
unlabelled    138
dtype: int64
Model training time: 0:00:02.378863



### Train the SVM classifier

#### All data

In [10]:
# Wrangle training data: positive vs. negative samples only
train_svm = cleaned[cleaned["label"].isin({"positive", "negative"})]
x_svm = train_svm.drop(columns="label")
# Encode labels with an explicit mapping and cast. Series.replace relied on
# an implicit object -> integer downcast that pandas has deprecated (the
# corresponding FutureWarning is silenced in this notebook).
y_svm = train_svm["label"].map({"positive": 1, "negative": 0}).astype("int64")

# Impute missing values and scale before training
svm_model = make_pipeline(
    IterativeImputer(**imputer_params),
    RobustScaler(),
    SVC(**svm_params),
)
svm_model.set_output(transform="pandas")

# Train model
t0 = time.time()
svm_model.fit(x_svm, y_svm)

# Save to file (the reported duration includes the time taken to save)
dump(svm_model, svm_filename, compress=True)
duration = timedelta(seconds=time.time() - t0)
print(f"Model training time: {duration}")


Model training time: 0:00:00.200124


#### Separate regions

In [11]:
# Train and save one SVM classifier per region
for region, subset in combined.groupby("region"):
    region = str(region)
    # Regions with fewer than 50 positive samples are not trained
    if (subset["label"] == "positive").sum() < 50:
        print(f"Skipping region: {region}")
        continue
    print(f"Region: {region}")
    # File-name-friendly region name, e.g. "North America" -> "north_america"
    r = "_".join(region.lower().split())
    output_subset = os.path.join(
        svm_dir,
        f"classifier_{r}.joblib",
    )

    subset = subset[subset["label"].isin({"positive", "negative"})]
    subset = subset.drop(columns=list(UNUSED_COLUMNS), errors="ignore")
    print(subset.groupby("label").size())
    x_svm_subset = subset.drop(columns="label")
    # Explicit mapping and cast instead of Series.replace, whose implicit
    # object -> integer downcast is deprecated in pandas
    y_svm_subset = (
        subset["label"].map({"positive": 1, "negative": 0}).astype("int64")
    )

    # Fresh, unfitted copy of the pipeline with the same hyperparameters
    svm_model_subset = clone(svm_model)
    t0 = time.time()
    svm_model_subset.fit(x_svm_subset, y_svm_subset)
    dump(svm_model_subset, output_subset, compress=True)
    duration = timedelta(seconds=time.time() - t0)
    print(f"Model training time: {duration}")

    print("")


Region: North America
label
negative     57
positive    170
dtype: int64
Model training time: 0:00:00.036983

Region: South America
label
negative    979
positive    126
dtype: int64
Model training time: 0:00:00.071553

