# Techniques for Unbalanced datasets
(by Mario Martin)

In [6]:
#Let's generate an unbalanced dataset:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

rng = np.random.RandomState(0)
n_samples_1 = 100000
n_samples_2 = 8000
X = np.r_[1.5 * rng.randn(n_samples_1, 2),
          0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y = np.array([0] * (n_samples_1) + [1] * (n_samples_2))

X, y = datasets.make_classification(n_classes=2, class_sep=0.1, weights=[0.9, 0.1],
                           n_informative=3, n_redundant=0, flip_y=0,
                           n_features=5, n_clusters_per_class=2,
                           n_samples=5000, random_state=10)


# Let's separate data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,test_size=0.5,stratify=y)

# Proportion of examples in the smaller class (class 1) is the following

print("Percentage of examples of the smaller class (class 1): {0:.2f}%".format(100*np.sum(y==1)/(np.sum(y==0)+np.sum(y==1))))

print('train: ',100*np.sum(y_train==1)/(np.sum(y_train==0)+np.sum(y_train==1)), '\ntest: ',100*np.sum(y_test==1)/(np.sum(y_test==0)+np.sum(y_train==1)))

Percentage of examples of the smaller class (class 1): 10.00%
train:  10.0 
test:  10.0


In [2]:
# Let's optimize k-nn for this dataset
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import sklearn.model_selection as cv    # Pel Cross-validation
import sklearn.neighbors as nb          # Per fer servir el knn

# Maximizing Accuracy
params = {'n_neighbors':list(range(1,30,2)), 'weights':('uniform','distance')}
knc = nb.KNeighborsClassifier()
clf = GridSearchCV(knc, param_grid=params,cv=10,n_jobs=-1)  # If cv is integer, by default is Stratifyed 
clf.fit(X_train, y_train)
print("Best Params=",clf.best_params_, "\nAccuracy on 10-fold cross-validation=", clf.best_score_)
parval=clf.best_params_
knc = nb.KNeighborsClassifier(n_neighbors=parval['n_neighbors'],weights=parval['weights'])
knc.fit(X_train, y_train)
pred=knc.predict(X_test)

print("Accuracy on test set:",accuracy_score(y_test, pred))

Best Params= {'n_neighbors': 9, 'weights': 'distance'} 
Accuracy on 10-fold cross-validation= 0.9168
Accuracy on test set: 0.8988


Good accuracy, but in these cases we have to observe the confusion matrix and focus not on accuracy but on recall and precision measures of the smaller class (or the f-measure that is the geometrical combination of both).


In [3]:
print("confusion matrix on test set:\n",confusion_matrix(y_test, pred))
print("\n ",classification_report(y_test, pred))

confusion matrix on test set:
 [[2220   10]
 [ 243   27]]

               precision    recall  f1-score   support

          0       0.90      1.00      0.95      2230
          1       0.73      0.10      0.18       270

avg / total       0.88      0.90      0.86      2500



Oops. f1-score of 0.18 is very low. Notice that recall for class 1 is only 0.1 (in second row of confussion matrix, we can see that we only catch 27 of 243+27 cases, that is, 0.1 of recall)... Too bad.

This effect is expected In higly unbalanced datasets. In order to improve these values, there are several techniques that can be applied.


## 1- Changing performance function

When optimizing parameters, try to focus on the optimization these measures (f-score, recall or precision) instead of optimizing accuracy.


In [4]:
# Let's optimize f1_score by creationg an scoring function "f_scorer"

from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score, make_scorer

# Maximizing f1_score for class 1
f_scorer = make_scorer(f1_score,pos_label=1)

params = {'n_neighbors':list(range(1,30,2)), 'weights':('uniform','distance')}
knc = nb.KNeighborsClassifier()
clf = GridSearchCV(knc, param_grid=params,cv=10,n_jobs=-1,scoring=f_scorer)  # If cv is integer, by default is Stratifyed 
clf.fit(X_train, y_train)
print("Best Params=",clf.best_params_, "\nF-score on 10-fold crossvalidation=", clf.best_score_)
parval=clf.best_params_
knc = nb.KNeighborsClassifier(n_neighbors=parval['n_neighbors'],weights=parval['weights'])
knc.fit(X_train, y_train)
pred=knc.predict(X_test)
print("\nConfusion matrix on test set:\n",confusion_matrix(y_test, pred))
print("\nAccuracy on test set:",accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

Best Params= {'n_neighbors': 1, 'weights': 'uniform'} 
F-score on 10-fold crossvalidation= 0.375360380407

Confusion matrix on test set:
 [[2098  132]
 [ 176   94]]

Accuracy on test set: 0.8768
             precision    recall  f1-score   support

          0       0.92      0.94      0.93      2230
          1       0.42      0.35      0.38       270

avg / total       0.87      0.88      0.87      2500



Now we have f1-score for class 1 of 0.38 instead of 0.18. Nice improvement.

This method can be applied when we have parameters to adjust... but no in Naive Bayes, for instance, because it has no parameters to adjust. 


## 2- Finding good threshold function (recomended approach)

When running a classifier, instead of returning hard decisions about belonging to a class or the other, return the probability of belonging to the minority class. Probabilities can be estimated for most algorithms implemented in sklearn. Once we have the probabilities we will adjust the threshold on these probabilities to decide where belongs each element. The threshold will be set so it maximizes the f-measure. 


In [5]:
# Let's try Naive Bayes on hard decisions

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print(classification_report(y_test, pred))


             precision    recall  f1-score   support

          0       0.90      1.00      0.95      2230
          1       0.85      0.12      0.21       270

avg / total       0.90      0.90      0.87      2500



We have an f1-score of only 0.21. Let's try to impove it by selecting a good threshold for probability values. 

In [6]:
from sklearn.model_selection import StratifiedKFold

def filterp(th,ProbClass1):
    """ Given a treshold "th" and a set of probabilies of belonging to class 1 "ProbClass1", return predictions """ 
    y=np.zeros(ProbClass1.shape[0])
    for i,v in enumerate(ProbClass1):
        if ProbClass1[i]>th:
            y[i]=1
    return y  

clf = GaussianNB()
lth=[]

# We do a 10 fold crossvalidation with 10 iterations
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X, y):
    X_train2, X_test2 = X[train_index], X[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]

    # Train with the training data of the iteration 
    clf.fit(X_train2, y_train2)
    # Obtaining probablity predictions for test data of the iterarion
    probs = clf.predict_proba(X_test2)
    # Collect probabilities of belonging to class 1
    ProbClass1 = probs[:,1]
    # Sort probabilities and generate pairs (threshold, f1-for-that-threshold) 
    res = np.array([[th,f1_score(y_test2,filterp(th,ProbClass1),pos_label=1)] for th in np.sort(ProbClass1)])

    # Uncomment the following lines if you want to plot at each iteration how f1-score evolves increasing the threshold 
    #plt.plot(res[:,0],res[:,1])
    #plt.show()

    # Find the threshold that has maximum value of f1-score
    maxF = np.max(res[:,1])
    optimal_th = res[res[:,1]==maxF,0]
    
    # Store the optimal threshold found for the current iteration
    lth.append(optimal_th)

# Compute the average threshold for all 10 iterations    
thdef = np.mean(lth)
print("Selected threshold in 10-fold cross validation:", thdef)
print()

# Train a classifier with the whole training data 
clf.fit(X_train, y_train)
# Obtain probabilities for data on test set
probs = clf.predict_proba(X_test)
# Generate predictions using probabilities and threshold found on 10 folds cross-validation
pred = filterp(thdef,probs[:,1])
# Print results with this prediction vector
print(classification_report(y_test, pred))

# Ignore warnings explaining that in some iterations f1 score is 0

  'precision', 'predicted', average, warn_for)


Selected threshold in 10-fold cross validation: 0.209974976216

             precision    recall  f1-score   support

          0       0.94      0.95      0.95      2230
          1       0.55      0.50      0.53       270

avg / total       0.90      0.90      0.90      2500



We increased from an f1 of Naive Bayes of 0.21 to 0.53 by adjusting the probability threshold.

The same trick can be done for any algorithm implemented in python that has method "predict_proba" implemented. It also can be used when the fuction you want to optimize is not f-score of one class but any other you want. the only thing you have to do is change the calls to f1_score by your function. 

## 3- Sampling approach:

a) Oversampling of the minority class, 

b) undersampling of the majority class and,

c) artificial generation of examples for the minoiry class

To do that, you have to previously keep a test set with the original proportion of data to obtain the performance measure (because them have to be computed on a dataset with the true distribution of examples on each class. 

In [7]:
# The asiest way to deal with unbalanced datasets with sampling procedures is to use the imblearn package in python
#   http://contrib.scikit-learn.org/imbalanced-learn/stable/index.html
# Instalation mcan be done with pip or conda (if you use conda, this is recomended). Instalation is done with one
# of the following commands:
#    conda install -c glemaitre imbalanced-learn
#    pip install -U imbalanced-learn

In [8]:
# Solution doing oversampling of the smaller class

from imblearn.over_sampling import RandomOverSampler
from imblearn import pipeline as pl

RANDOM_STATE=42

pipeline = pl.make_pipeline(GaussianNB())
# Train the classifier with balancing
pipeline.fit(X_train, y_train)
# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)
print("\n** Results for Naive Bayes")
print(classification_report(y_test, y_pred_bal))
print("Confusion matrix on test set:\n",confusion_matrix(y_test, y_pred_bal))

pipeline = pl.make_pipeline(RandomOverSampler(random_state=RANDOM_STATE), GaussianNB())
# Train the classifier with balancing
pipeline.fit(X_train, y_train)
# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)
print("\n** Results OVERSAMPLING randomly")
print(classification_report(y_test, y_pred_bal))
print("Confusion matrix on test set:\n",confusion_matrix(y_test, y_pred_bal))



** Results for Naive Bayes
             precision    recall  f1-score   support

          0       0.90      1.00      0.95      2230
          1       0.85      0.12      0.21       270

avg / total       0.90      0.90      0.87      2500

Confusion matrix on test set:
 [[2224    6]
 [ 237   33]]

** Results OVERSAMPLING randomly
             precision    recall  f1-score   support

          0       0.98      0.71      0.82      2230
          1       0.27      0.89      0.41       270

avg / total       0.90      0.73      0.78      2500

Confusion matrix on test set:
 [[1585  645]
 [  31  239]]


In [9]:
# Solution doing undersampling of the larger class. There are several method implemented. 
# I recommend any of the following ones:

from imblearn.under_sampling import RandomUnderSampler,CondensedNearestNeighbour,InstanceHardnessThreshold

pipeline = pl.make_pipeline(RandomUnderSampler(random_state=RANDOM_STATE), GaussianNB())
# Train the classifier with balancing
pipeline.fit(X_train, y_train)
# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)
print("\n** Results for UNDERSAMPLING with method Random")
print(classification_report(y_test, y_pred_bal))
print("Confusion matrix on test set:\n",confusion_matrix(y_test, y_pred_bal))

pipeline = pl.make_pipeline(InstanceHardnessThreshold(random_state=RANDOM_STATE), GaussianNB())
# Train the classifier with balancing
pipeline.fit(X_train, y_train)
# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)
print("\n** Results for UNDERSAMPLING with method Instance Hardness threshold")
print(classification_report(y_test, y_pred_bal))
print("Confusion matrix on test set:\n",confusion_matrix(y_test, y_pred_bal))
    
pipeline = pl.make_pipeline(CondensedNearestNeighbour(random_state=RANDOM_STATE), GaussianNB())
# Train the classifier with balancing
pipeline.fit(X_train, y_train)
# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)
print("\n** Results for UNDERSAMPLING with method Condensed Nearest Neighbour")
print(classification_report(y_test, y_pred_bal))
print("Confusion matrix on test set:\n",confusion_matrix(y_test, y_pred_bal))



** Results for UNDERSAMPLING with method Random
             precision    recall  f1-score   support

          0       0.98      0.71      0.83      2230
          1       0.27      0.89      0.42       270

avg / total       0.91      0.73      0.78      2500

Confusion matrix on test set:
 [[1592  638]
 [  30  240]]

** Results for UNDERSAMPLING with method Instance Hardness threshold
             precision    recall  f1-score   support

          0       0.91      0.99      0.95      2230
          1       0.79      0.22      0.35       270

avg / total       0.90      0.91      0.89      2500

Confusion matrix on test set:
 [[2214   16]
 [ 210   60]]

** Results for UNDERSAMPLING with method Condensed Nearest Neighbour
             precision    recall  f1-score   support

          0       0.93      0.97      0.95      2230
          1       0.66      0.40      0.49       270

avg / total       0.90      0.91      0.90      2500

Confusion matrix on test set:
 [[2174   56]
 [ 163

In [10]:
# Solution creating artificial examples of the smaller class
from imblearn import over_sampling as os

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE), GaussianNB())

# Train the classifier with balancing
pipeline.fit(X_train, y_train)
# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)
print(classification_report(y_test, y_pred_bal))
print("\n Confusion matrix on test set:\n",confusion_matrix(y_test, y_pred_bal))


             precision    recall  f1-score   support

          0       0.97      0.73      0.83      2230
          1       0.27      0.83      0.41       270

avg / total       0.90      0.74      0.79      2500


 Confusion matrix on test set:
 [[1630  600]
 [  46  224]]


## Final notes
There are some algorithm, for instance SVM, where you can introduce the weigth of each class in the calssification class. See the following example:

In [11]:
# Especial case for SVM

import matplotlib.pyplot as plt
from sklearn import svm

Cs = np.logspace(-2, 2, num=5, base=10.0)
print("Tested Cs", Cs)
param_grid = {'C': Cs}


# fit the model and get the separating hyperplane
grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=10)
grid_search.fit(X_train, y_train)
parval=grid_search.best_params_
print("Best C =", parval['C'])
clf = svm.SVC(kernel='rbf',C=parval['C'])
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
print("\n** Results for Plain SVM")
print(classification_report(y_test, pred))
print("Confusion matrix on test set:\n",confusion_matrix(y_test, pred))
 

# fit the model and get the separating hyperplane using weighted classes
grid_search = GridSearchCV(svm.SVC(kernel='rbf', class_weight={1: 10}), param_grid, cv=10)
grid_search.fit(X_train, y_train)
parval=grid_search.best_params_
print("Best C =", parval['C'])
clf = svm.SVC(kernel='rbf',C=parval['C'], class_weight={1: 10})
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
print("\n** Results for Plain SVM with ratio for class 1 set to 10")
print(classification_report(y_test, pred))
print("Confusion matrix on test set:\n",confusion_matrix(y_test, pred))
 

Tested Cs [  1.00000000e-02   1.00000000e-01   1.00000000e+00   1.00000000e+01
   1.00000000e+02]
Best C = 100.0

** Results for Plain SVM
             precision    recall  f1-score   support

          0       0.92      0.99      0.95      2230
          1       0.72      0.30      0.42       270

avg / total       0.90      0.91      0.89      2500

Confusion matrix on test set:
 [[2198   32]
 [ 189   81]]
Best C = 100.0

** Results for Plain SVM with ratio for class 1 set to 10
             precision    recall  f1-score   support

          0       0.96      0.83      0.89      2230
          1       0.34      0.73      0.46       270

avg / total       0.89      0.82      0.84      2500

Confusion matrix on test set:
 [[1844  386]
 [  74  196]]


Results with SVM are not very good because we haven't adjusted the gamma parameter, only C parameter is adjusted....  but the idea is there.