Dummy classifiers are used to establish baseline performance. The [dummy classifier](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) in the sklearn API supports several prediction strategies, which are demonstrated below.

In [1]:
from sklearn.datasets import make_classification

# Single seed reused by the data generator, the train/test split and the classifiers.
random_state = 25

# Synthetic binary classification problem: 1000 samples, two informative features,
# class weights of 0.9 / 0.1 (imbalanced) and flip_y=0 (no randomly flipped labels).
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    class_sep=2,
    flip_y=0,
    weights=[0.9, 0.1],
    random_state=random_state,
)

In [2]:
import pandas as pd

# Tabular view of the dataset: the two feature columns plus the class label.
# Note: the column named "y" holds the second feature, not the target array `y`.
df = pd.DataFrame({"x": x[:, 0], "y": x[:, 1], "label": y})

# Number of examples per class in the full (unsplit) dataset.
label_counts = df["label"].value_counts()
print("Total number of examples \n", label_counts)

Total number of examples 
 0    900
1    100
Name: label, dtype: int64


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=random_state,
)

### Most_frequent -> Predict the most frequently occurring value of the target variable

In [4]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent class seen in the training labels.
# `fit` returns the estimator itself, so the bare name on the last line
# displays the fitted classifier as the cell output.
dummy_classifier = DummyClassifier(
    strategy="most_frequent",
    random_state=random_state,
).fit(x_train, y_train)
dummy_classifier

DummyClassifier(constant=None, random_state=25, strategy='most_frequent')

In [5]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the majority-class baseline; since every prediction is the
# majority class, this equals the share of that class in the test labels.
y_pred = dummy_classifier.predict(x_test)
accuracy_score(y_test, y_pred)

0.892

### Stratified -> Predict according to the class distribution in the training data. <br>
For example, if 90% of the training data belongs to one class, the model randomly predicts that class 9/10 of the time and the other class 1/10 of the time.

In [6]:
from sklearn.dummy import DummyClassifier

# Baseline that draws each prediction at random, following the class
# proportions observed in the training labels.
# `fit` returns the estimator itself, so the bare name on the last line
# displays the fitted classifier as the cell output.
dummy_classifier = DummyClassifier(
    strategy="stratified",
    random_state=random_state,
).fit(x_train, y_train)
dummy_classifier

DummyClassifier(constant=None, random_state=25, strategy='stratified')

In [7]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the stratified random baseline.
y_pred = dummy_classifier.predict(x_test)
accuracy_score(y_test, y_pred)

0.82

### Uniform -> Predict the target variable uniformly at random

In [8]:
from sklearn.dummy import DummyClassifier

# Baseline that picks every class with equal probability, regardless of
# how often each class occurs in the training labels.
# `fit` returns the estimator itself, so the bare name on the last line
# displays the fitted classifier as the cell output.
dummy_classifier = DummyClassifier(
    strategy="uniform",
    random_state=random_state,
).fit(x_train, y_train)
dummy_classifier

DummyClassifier(constant=None, random_state=25, strategy='uniform')

In [9]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the uniform random baseline.
y_pred = dummy_classifier.predict(x_test)
accuracy_score(y_test, y_pred)

0.508

### Constant -> Always predict the given constant value for the target variable <br>
Useful as a baseline when trying to maximize recall, F1 score, etc.

In [10]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the supplied constant label, here class 1
# (the minority class of this dataset).
# `fit` returns the estimator itself, so the bare name on the last line
# displays the fitted classifier as the cell output.
dummy_classifier = DummyClassifier(
    strategy="constant",
    random_state=random_state,
    constant=1,
).fit(x_train, y_train)
dummy_classifier

DummyClassifier(constant=1, random_state=25, strategy='constant')

In [11]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the always-predict-1 baseline; since every prediction is
# class 1, this equals the share of class 1 in the test labels.
y_pred = dummy_classifier.predict(x_test)
accuracy_score(y_test, y_pred)

0.108

In [12]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the always-predict-1 baseline.
# scikit-learn emits a warning here because label 0 never appears in the
# predictions, so its precision is undefined and is reported as 0.
y_pred = dummy_classifier.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       223
           1       0.11      1.00      0.19        27

   micro avg       0.11      0.11      0.11       250
   macro avg       0.05      0.50      0.10       250
weighted avg       0.01      0.11      0.02       250



  'precision', 'predicted', average, warn_for)
