# Sample code from the README.rst

## Binary oversampling

In [1]:
import smote_variants as sv
import imbalanced_databases as imbd

# Load the iris0 dataset and pull out its 'data' and 'target' entries.
dataset = imbd.load_iris0()
X = dataset['data']
y = dataset['target']

# Oversampler created with its default parameters.
oversampler = sv.distance_SMOTE()

# Run the oversampling; X_samp and y_samp hold the oversampled dataset.
X_samp, y_samp = oversampler.sample(X, y)

2022-08-16 15:45:41.383916: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-16 15:45:41.392465: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-16 15:45:41.392491: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-08-16 15:45:49,269:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distan

## Multiclass oversampling

In [2]:
import smote_variants as sv
import sklearn.datasets as datasets

# Load the wine dataset from sklearn.datasets and pull out its 'data'
# and 'target' entries.
dataset = datasets.load_wine()
X = dataset['data']
y = dataset['target']

# The multiclass wrapper receives the oversampler by name together with
# the parameters to use for it (here only a fixed random_state).
oversampler = sv.MulticlassOversampling(
    oversampler='distance_SMOTE',
    oversampler_params={'random_state': 5},
)

# Run the oversampling; X_samp and y_samp hold the oversampled dataset.
X_samp, y_samp = oversampler.sample(X, y)

2022-08-16 15:45:49,586:INFO:MulticlassOversampling: Running multiclass oversampling with strategy eq_1_vs_many_successive
2022-08-16 15:45:49,587:INFO:MulticlassOversampling: Sampling minority class with label: 0
2022-08-16 15:45:49,589:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': 5, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:45:49,592:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:45:49,593:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:45:49,601:INFO:distance_SMOTE: simplex sampling with n_dim 2
2022-08-16 15:45:49,603:INFO:MulticlassOversampling: Sampling minority class with label: 2
2022-08-16 15:45:49,605:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.244

## Selection of the best oversampler

In [3]:
import os.path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import smote_variants as sv
import sklearn.datasets as datasets

# Repack the breast cancer data into a plain dict with 'data', 'target'
# and 'name' entries; this dict is what is passed to model_selection.
dataset = datasets.load_breast_cancer()
dataset = {
    'data': dataset['data'],
    'target': dataset['target'],
    'name': 'breast_cancer',
}

# Classifier candidates, written as (module, class name, parameter dict).
classifiers = [
    ('sklearn.neighbors', 'KNeighborsClassifier', {}),
    ('sklearn.tree', 'DecisionTreeClassifier', {}),
]

# Oversampler candidates (n_quickest=2) and the parameter combinations
# generated for them (n_max_comb=2).
oversamplers = sv.queries.get_all_oversamplers(n_quickest=2)
os_params = sv.queries.generate_parameter_combinations(
    oversamplers,
    n_max_comb=2,
)

# samp_obj and cl_obj are the oversampling and classifier objects which
# give the best performance together.
samp_obj, cl_obj = sv.evaluation.model_selection(
    dataset=dataset,
    oversamplers=os_params,
    classifiers=classifiers,
    validator_params={'n_splits': 2, 'n_repeats': 1},
    n_jobs=5,
)

# Train the best techniques using the entire dataset.
X_samp, y_samp = samp_obj.sample(dataset['data'], dataset['target'])
cl_obj.fit(X_samp, y_samp)

2022-08-16 15:45:53.103528: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-16 15:45:53.111998: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-16 15:45:53.112817: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-08-16 15:45:53.170340: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-16 15:45:53.1843

## Integration with sklearn pipelines

In [4]:
import smote_variants as sv
import imblearn.datasets as imb_datasets

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Take the 'libras_move' entry of the imblearn datasets and pull out its
# 'data' and 'target' entries.
libras = imb_datasets.fetch_datasets()['libras_move']
X = libras['data']
y = libras['target']

# Oversampler and classifier specifications, written as
# (module, class name, parameter dict).
oversampler = (
    'smote_variants',
    'MulticlassOversampling',
    {'oversampler': 'distance_SMOTE', 'oversampler_params': {}},
)
classifier = ('sklearn.neighbors', 'KNeighborsClassifier', {})

# Pipeline: scaling first, then oversampling and classification combined
# in the last step.
model = Pipeline([
    ('scale', StandardScaler()),
    ('clf', sv.classifiers.OversamplingClassifier(oversampler, classifier)),
])

model.fit(X, y)

2022-08-16 15:51:40,282:INFO:MulticlassOversampling: Running multiclass oversampling with strategy eq_1_vs_many_successive
2022-08-16 15:51:40,310:INFO:MulticlassOversampling: Sampling minority class with label: 1
2022-08-16 15:51:40,318:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:40,320:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:40,323:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:40,446:INFO:distance_SMOTE: simplex sampling with n_dim 2


## Integration with sklearn grid search

In [5]:
import smote_variants as sv
import imblearn.datasets as imb_datasets

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Take the 'libras_move' entry of the imblearn datasets and pull out its
# 'data' and 'target' entries.
libras = imb_datasets.fetch_datasets()['libras_move']
X = libras['data']
y = libras['target']

# Oversampler and classifier specifications, written as
# (module, class name, parameter dict).
oversampler = (
    'smote_variants',
    'MulticlassOversampling',
    {'oversampler': 'distance_SMOTE', 'oversampler_params': {}},
)
classifier = ('sklearn.neighbors', 'KNeighborsClassifier', {})

# Pipeline with oversampling and classification as the last step.
model = Pipeline([
    ('scale', StandardScaler()),
    ('clf', sv.classifiers.OversamplingClassifier(oversampler, classifier)),
])

# Grid over the 'oversampler' parameter of the 'clf' step: distance_SMOTE
# with three different proportions.
param_grid = {
    'clf__oversampler': [
        ('smote_variants', 'distance_SMOTE', {'proportion': proportion})
        for proportion in (0.5, 1.0, 1.5)
    ],
}

# Specify the grid search used for model selection.
grid = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=3,
    n_jobs=1,
    verbose=2,
    scoring='accuracy',
)

# Run the grid search on the full dataset.
grid.fit(X, y)

2022-08-16 15:51:42,019:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:42,021:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:42,024:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:42,105:INFO:distance_SMOTE: simplex sampling with n_dim 2


Fitting 3 folds for each of 3 candidates, totalling 9 fits


2022-08-16 15:51:42,355:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:42,360:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:42,362:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:42,434:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 0.5}); total time=   0.3s


2022-08-16 15:51:42,714:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:42,716:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:42,721:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:42,895:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 0.5}); total time=   0.4s


2022-08-16 15:51:43,099:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:43,100:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:43,101:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:43,175:INFO:distance_SMOTE: simplex sampling with n_dim 2
2022-08-16 15:51:43,267:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:43,269:INFO:NearestNeighborsWithMetricTensor: NN fitting wit

[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 0.5}); total time=   0.4s
[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.0}); total time=   0.2s


2022-08-16 15:51:43,414:INFO:distance_SMOTE: simplex sampling with n_dim 2
2022-08-16 15:51:43,529:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:43,530:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:43,531:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:43,651:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.0}); total time=   0.3s


2022-08-16 15:51:43,890:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:43,891:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:43,892:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:43,906:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.0}); total time=   0.4s


2022-08-16 15:51:44,122:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:44,123:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:44,124:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:44,144:INFO:distance_SMOTE: simplex sampling with n_dim 2
2022-08-16 15:51:44,230:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:44,231:INFO:NearestNeighborsWithMetricTensor: NN fitting wit

[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.5}); total time=   0.2s
[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.5}); total time=   0.1s


2022-08-16 15:51:44,364:INFO:distance_SMOTE: simplex sampling with n_dim 2
2022-08-16 15:51:44,498:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'uniform', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'distance_SMOTE'}")
2022-08-16 15:51:44,499:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2022-08-16 15:51:44,500:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2022-08-16 15:51:44,645:INFO:distance_SMOTE: simplex sampling with n_dim 2


[CV] END clf__oversampler=('smote_variants', 'distance_SMOTE', {'proportion': 1.5}); total time=   0.3s
