# Upsampling and Downsampling

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_classification
from sklearn.utils import resample

In [3]:
X, y = make_classification(n_samples=1000, n_classes=2, n_features=5, weights=[0.3,0.7])

In [4]:
X

array([[ 1.09904447e+00,  1.42784558e-02,  4.13266974e-01,
         1.34395220e+00,  5.90586710e-01],
       [ 7.70759572e-01, -1.13435423e+00, -4.07866812e-04,
         9.85860716e-01,  1.26690002e-01],
       [ 1.65603661e-01,  2.51911522e-01,  1.03270671e+00,
         5.75673770e-02,  1.05025157e+00],
       ...,
       [-6.75634239e-01,  1.44236694e+00, -1.91230562e-01,
        -8.35573572e-01, -3.00831211e-01],
       [ 1.20546623e+00, -9.46675685e-01, -1.21641628e+00,
         1.72346585e+00, -1.00614263e+00],
       [-2.00011861e+00, -2.80610289e-01,  1.63991080e+00,
        -2.80307528e+00,  1.29460002e+00]])

In [5]:
y

array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,

In [9]:
df = pd.DataFrame(X, columns=[f"col_{i+1}" for i in range(X.shape[1])])

In [10]:
df["target"] = y

In [11]:
df.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,target
0,1.099044,0.014278,0.413267,1.343952,0.590587,1
1,0.77076,-1.134354,-0.000408,0.985861,0.12669,0
2,0.165604,0.251912,1.032707,0.057567,1.050252,0
3,1.933004,0.246654,-2.522761,2.849093,-2.180171,1
4,1.127747,1.275576,-1.531355,1.6711,-1.33092,1


In [12]:
df["target"].value_counts()

target
1    696
0    304
Name: count, dtype: int64

In [13]:
zero_class = df[df["target"]==0]
one_class = df[df["target"]==1]

In [14]:
zero_class.shape, one_class.shape

((304, 6), (696, 6))

# Upsampling

In [15]:
upsample_zero = resample(zero_class, n_samples=one_class.shape[0], replace=True)

In [16]:
upsample_zero.shape

(696, 6)

In [17]:
upsampled_df = pd.concat([one_class, upsample_zero])

In [18]:
upsampled_df.shape

(1392, 6)

# Downsampling

In [20]:
downsample_one = resample(one_class, n_samples=zero_class.shape[0], replace=False)

In [21]:
downsample_one.shape

(304, 6)

In [22]:
downsample_df = pd.concat([zero_class, downsample_one])

In [23]:
downsample_df.shape

(608, 6)

# Stacking Technique with Upsampling and Downsampling

In [24]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [25]:
base_models = [
    ("rf", RandomForestClassifier(n_estimators=30, max_depth=4)),
    ("svm",SVC()),
    ("knn", KNeighborsClassifier(n_neighbors=11))
]

In [26]:
meta_model = LogisticRegression()

In [27]:
stack_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model, n_jobs=-1, cv=10)

In [28]:
stack_classifier.fit(X, y)

In [29]:
stack_classifier.score(X, y)

0.943