# Data Balancing - Undersampling class 0 and oversampling class 1

In [5]:
from collections import Counter
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import pandas as pd

## 1. Load the Data

In [160]:
# Folder that holds the input CSVs and receives the balanced output:
#   ../../Data/all_patients/starting_ratio_<starting_ratio>/<percent_ratio>
starting_ratio = '1'
percent_ratio = '10_percent/'  # the original cell also kept '' as an alternative value
base_data_folder = '../../Data/all_patients/starting_ratio_{}/{}'.format(
    starting_ratio, percent_ratio
)

In [161]:
# Read the training features and labels; in both files the first CSV column
# is used as the row index and the first row as the header.
X_train, y_train = (
    pd.read_csv(base_data_folder + file_name, index_col=0, header=0)
    for file_name in ('X_train_total.csv', 'y_train_total.csv')
)
X_train.shape, y_train.shape

((92330, 16), (92330, 1))

## 2. Undersampling

In [162]:
# Target class ratio, passed as `sampling_strategy` to both resamplers below
# (for a float, imbalanced-learn reads it as minority count / majority count
# after resampling).
# NOTE(review): the recorded outputs further down show 525 vs 525 samples,
# i.e. a 1:1 balance, which does not match 0.5 — they were presumably produced
# with ratio = 1; confirm before relying on them.
ratio = 0.5

In [163]:
# Shape of the label frame restricted to the Class == 1 rows (before resampling).
y_train.loc[y_train['Class'].eq(1)].shape

(525, 1)

In [164]:
# Randomly discard majority-class rows until the class ratio equals `ratio`.
# The labels are handed over as a plain 1-D array taken from the 'Class' column.
train_labels = y_train['Class'].values
rus = RandomUnderSampler(random_state=42, sampling_strategy=ratio)
X_resampled, y_resampled = rus.fit_resample(X_train, train_labels)
print('Resampled dataset shape %s' % Counter(y_resampled))

Resampled dataset shape Counter({0: 525, 1: 525})


In [165]:
# Display the (rows, columns) shape of the undersampled feature matrix.
X_resampled.shape

(1050, 16)

In [166]:
# Display the shape of the undersampled label vector.
y_resampled.shape

(1050,)

## 3. Oversampling

In [167]:
# Oversample the minority class with ADASYN on top of the undersampled data.
# NOTE(review): the undersampling step already brought the classes to `ratio`,
# and the same value is reused here, so ADASYN has nothing left to generate
# (the recorded counts are unchanged, 525/525). Some imbalanced-learn versions
# may raise a ValueError in this situation — confirm, and consider a larger
# target ratio for this step.
adasyn = ADASYN(sampling_strategy=ratio, random_state=42)
X_resampled_total, y_resampled_total = adasyn.fit_resample(X_resampled, y_resampled)
class_counts = Counter(y_resampled_total)
print('Resampled dataset shape %s' % class_counts)

Resampled dataset shape Counter({0: 525, 1: 525})


In [168]:
# Gather the resampled features into one frame and append the labels as the
# last column, named "Class".
total_patients = pd.DataFrame(X_resampled_total).assign(Class=y_resampled_total)

## 4. Dataset merge and Data preparation for disk writing

In [169]:
# Combine the resampled features and labels into one frame, with the labels
# stored in the "Class" column.
# NOTE(review): this repeats the work of the previous cell; it rebuilds the
# same frame from X_resampled_total / y_resampled_total and could be dropped.
total_patients = pd.DataFrame(X_resampled_total)
total_patients["Class"] = y_resampled_total

In [170]:
# The 16 feature names, in column order; the label column 'Class' is appended last.
# Spellings are kept exactly as they are because they end up as CSV headers.
_feature_names = [
    'sex',
    'age',
    'weight',
    'height',
    'HIPX',
    'menopause',
    'HRT',
    'smoking',
    'ReumatoidArthritis',
    'SecondaryOsteoporsis',
    'Alcohol',
    'VitaminD',
    'calcium',
    'dose_walk',
    'dose_moderate',
    'dose_vigorous',
]
columns_names = _feature_names + ['Class']

In [171]:
# Rename the columns positionally to the names in `columns_names`.
# NOTE(review): this assumes the resampled features are in exactly the order
# listed there — verify against the header of X_train_total.csv.
total_patients.columns = columns_names

In [172]:
# Shuffle all rows (frac=1) and renumber the index from 0.
# The shuffle is seeded with the same value (42) that the resamplers use, so
# re-running the notebook writes identical CSV files instead of a new row
# order each time.
total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

In [173]:
# Features: every column except the last one, which holds the 'Class' label.
X_train_resampled = total_patients.iloc[:, :-1]
X_train_resampled

Unnamed: 0,sex,age,weight,height,HIPX,menopause,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous
0,-1.029482,-0.343985,-0.120857,-1.166701,-0.037838,1.436838,2.362657,0.760768,-0.088950,7.450683,-0.788571,-0.355935,-1.300353,0.208233,-0.611945,-0.538945
1,0.971362,-1.088910,-0.369506,0.737192,-0.037838,-0.695973,-0.423252,2.330176,-0.088950,-0.134216,-0.168304,-0.537271,0.088455,-0.415141,0.135155,1.312374
2,-1.029482,0.649249,-0.949686,-0.785922,-0.037838,1.436838,2.362657,0.760768,-0.088950,-0.134216,-0.239257,-0.044036,-0.559472,0.208233,-0.487428,0.571847
3,0.971362,-0.964756,0.331812,0.954780,-0.037838,-0.695973,-0.423252,0.760768,-0.088950,-0.134216,-0.206756,-0.714981,-0.396517,-0.493062,-0.088975,0.849545
4,0.971362,0.773403,0.707973,0.954780,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,0.430906,-0.359561,-0.220138,0.208233,0.135155,0.942110
5,-1.029482,0.400940,-1.612750,-1.547479,-0.037838,1.436838,-0.423252,0.760768,-0.088950,-0.134216,0.645138,-0.758502,-1.125845,1.143293,2.525877,-0.292102
6,-1.029482,0.773403,-0.382257,0.084429,-0.037838,1.436838,2.362657,0.760768,-0.088950,-0.134216,-0.237884,-0.442976,-0.907502,-0.025532,0.433996,-0.477234
7,-1.029482,0.152632,-0.847677,-0.568335,-0.037838,1.436838,-0.423252,-0.808640,-0.088950,7.450683,-0.788571,-0.551778,1.076777,-0.660037,2.525877,0.078161
8,0.971362,-0.592293,0.472076,-0.133159,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,-0.788571,2.142881,-0.730415,3.280573,-0.313105,0.386715
9,0.971362,-0.840602,0.338188,1.063574,-0.037838,-0.695973,-0.423252,-0.808640,-0.088950,-0.134216,-0.431059,-0.450230,-0.373209,0.208233,-0.611945,0.571847


In [174]:
# Labels: the 'Class' column on its own, kept as a one-column DataFrame.
y_train_resampled = total_patients['Class'].to_frame(name='Class')
y_train_resampled

Unnamed: 0,Class
0,1
1,1
2,0
3,0
4,0
5,1
6,1
7,1
8,0
9,0


## 5. Save to file

In [175]:
import os

# Pick the output sub-folder: any ratio below 1 is filed under 'ratio_05/',
# everything else under 'ratio_1/'.
if ratio < 1:
    ratio_folder = 'ratio_05/'
else:
    ratio_folder = 'ratio_1/'

output_folder = base_data_folder + ratio_folder
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs(output_folder, exist_ok=True)

# Write the combined frame plus the separate feature / label files.
# The row index is written as the first CSV column (to_csv default).
total_patients.to_csv(output_folder + 'total_patients_trainset_balanced.csv')
X_train_resampled.to_csv(output_folder + 'X_train.csv')
y_train_resampled.to_csv(output_folder + 'y_train.csv')

In [None]:
# Disabled batch version of the cells above, kept as a bare string literal so
# it never executes. If re-enabled it would loop over the '10_percent' ...
# '50_percent' folders and, for each of them, over the ratios 0.5 and 1,
# repeating the undersample -> ADASYN -> rename -> shuffle -> save steps.
# NOTE(review): the inner loop reuses the name `i` of the outer loop; the
# folder path is computed before the inner loop starts, so this is confusing
# rather than wrong. The shuffle is unseeded and the output folders are
# assumed to exist already.
'''
for i in range (1,6):
    starting_ratio = '1'
    percent_ratio = str(i)+'0'
    base_data_folder = '../../Data/all_patients/starting_ratio_'+starting_ratio+'/'+percent_ratio+'_percent/'
    
    X_train = pd.read_csv(base_data_folder+'X_train_total.csv', index_col=0, header=0)
    y_train = pd.read_csv(base_data_folder+'y_train_total.csv', index_col=0, header=0)
    X_train.shape, y_train.shape
    
    for i in [0.5,1]:
        
        ratio = i

        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train['Class'].values)

        adasyn = ADASYN(random_state=42, sampling_strategy=ratio)
        X_resampled_total, y_resampled_total = adasyn.fit_resample(X_resampled, y_resampled)

        total_patients = pd.DataFrame(X_resampled_total)
        total_patients["Class"] = y_resampled_total

        columns_names = ['sex', 'age', 'weight', 'height', 'HIPX', 'menopause', 'HRT', 'smoking',
           'ReumatoidArthritis', 'SecondaryOsteoporsis', 'Alcohol', 'VitaminD',
           'calcium', 'dose_walk', 'dose_moderate', 'dose_vigorous','Class']
        total_patients.columns = columns_names
        total_patients = total_patients.sample(frac=1).reset_index(drop=True)

        X_train_resampled = total_patients.iloc[:,:total_patients.shape[1]-1]

        y_train_resampled = pd.DataFrame(total_patients['Class'], columns=['Class'])

        if ratio < 1:
            ratio_folder = 'ratio_05/'
        else:
            ratio_folder = 'ratio_1/'
        total_patients.to_csv(base_data_folder+ratio_folder+'total_patients_trainset_balanced.csv')
        X_train_resampled.to_csv(base_data_folder+ratio_folder+'X_train.csv')
        y_train_resampled.to_csv(base_data_folder+ratio_folder+'y_train.csv')
'''