In [18]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import ClusterCentroids

from IPython.display import HTML

# Show which competition files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

['training_set.csv', 'test_set.csv', 'data_note.pdf', 'test_set_metadata.csv', 'sample_submission.csv', 'test_set_sample.csv', 'training_set_metadata.csv']


In [2]:
def get_data(path2data, path2metada, isTest = False):
    """Read a data CSV and its metadata CSV and join them on ``object_id``.

    Parameters
    ----------
    path2data : str
        Path to the data CSV; it must contain an ``object_id`` column.
    path2metada : str
        Path to the metadata CSV; it must contain an ``object_id`` column.
    isTest : bool, default False
        When True the data is treated as unlabeled and no target is returned.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the merged frame except ``target``.
    y : pandas.DataFrame or None
        Single-column frame holding ``target``; ``None`` when ``isTest`` is True.
    """
    data = pd.read_csv(path2data)
    metadata = pd.read_csv(path2metada)
    # A single join key is sufficient; the key used to be listed twice.
    merged = pd.merge(data, metadata, on='object_id')

    is_target = merged.columns == 'target'
    # X is assigned on every path so that isTest=True no longer fails with an
    # unbound local variable at the return statement.
    X = merged.loc[:, ~is_target]
    y = None if isTest else merged.loc[:, is_target]
    return X, y

In [5]:
# Build the training features and labels from the training CSV and its metadata.
X, y = get_data(
    path2data="../input/training_set.csv",
    path2metada="../input/training_set_metadata.csv",
)

In [6]:
# Preview the first five rows of the merged feature frame.
X.head(n=5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017
1,615,59750.4306,1,-816.434326,5.55337,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017
3,615,59750.445,4,-388.984985,11.395031,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017
4,615,59752.407,2,-681.858887,4.041204,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017


In [7]:
# Report the dimensions of the feature frame and the target frame.
shape_report = 'X shape {} y shape {}'.format(X.shape, y.shape)
print(shape_report)

X shape (1421705, 16) y shape (1421705, 1)


In [8]:
# Count missing values per column.
missing_per_column = X.isna().sum()
missing_per_column

object_id                  0
mjd                        0
passband                   0
flux                       0
flux_err                   0
detected                   0
ra                         0
decl                       0
gal_l                      0
gal_b                      0
ddf                        0
hostgal_specz              0
hostgal_photoz             0
hostgal_photoz_err         0
distmod               400574
mwebv                      0
dtype: int64

In [9]:
# 'distmod' is the only column with missing values, so it is dropped rather
# than imputed. errors='ignore' keeps the cell re-runnable once it is gone.
X = X.drop(['distmod'], axis=1, errors='ignore')
# Flatten the single-column target frame to a Series; skipped on a re-run,
# when y has already been converted.
if isinstance(y, pd.DataFrame):
    y = y['target']

**SMOTE (Synthetic Minority Oversampling Technique)**

In [17]:
# Embed the YouTube video inline in the notebook output.
smote_video_iframe = '<iframe width="760" height="315" src="https://www.youtube.com/embed/FheTDyCwRdE?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>'
HTML(smote_video_iframe)

In [10]:
# Oversample only the minority class. `sampling_strategy` / `fit_resample`
# replace the deprecated `ratio` / `fit_sample` API (imbalanced-learn >= 0.4),
# and the fixed seed makes the synthetic samples reproducible.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)

**SMOTE and Tomek techniques (Oversampling and Undersampling)**

In [None]:
# Combined over- and under-sampling. `sampling_strategy` / `fit_resample`
# replace the deprecated `ratio` / `fit_sample` API (imbalanced-learn >= 0.4),
# and the fixed seed makes the resampled set reproducible.
smt = SMOTETomek(sampling_strategy='auto', random_state=42)
X_smt, y_smt = smt.fit_resample(X, y)

In [None]:
# Distinct target classes, ordered from most to least frequent.
class_counts = pd.Series(y).value_counts()
class_counts.index

In [None]:
# Compare the class frequencies of the original labels with those produced by
# each resampler: original on the full-width top row, resampled sets below.
fig = plt.figure(figsize=(20, 15))
panels = [
    ((2, 2, (1, 2)), y, 'Original'),
    ((2, 2, 3), y_sm, 'SMOTE'),
    ((2, 2, 4), y_smt, 'SMOTETomek'),
]
for position, labels, title in panels:
    ax = fig.add_subplot(*position)
    # sort_index keeps the classes in the same left-to-right order in every
    # panel; plain value_counts orders by frequency, which differs per panel.
    pd.Series(labels).value_counts().sort_index().plot(kind="bar", ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('target')
    ax.set_ylabel('number of rows')
plt.tight_layout()
plt.show()

**References**
- https://www3.nd.edu/~dial/publications/hoens2013imbalanced.pdf
- SMOTE: Synthetic Minority Over-sampling Technique (Chawla et al.): https://arxiv.org/pdf/1106.1813.pdf
- imbalanced-learn documentation: https://imbalanced-learn.org/en/stable/index.html
 