# Data Balancing - Random undersampling class 0 and oversampling class 1

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

## 1. Load the Data

In [24]:
# Optional sub-folders selecting a pre-split variant of the dataset.
# Leave both empty to read the files directly under the dataset root.
starting_ratio = ''  # e.g. 'starting_ratio_05/'
percent_ratio = ''   # e.g. '10_percent/'

# Folder the training CSVs are read from; the save cell also builds its
# output path from it.
base_data_folder = ''.join(
    ('../../Data/female_patients_no_menopause/', starting_ratio, percent_ratio)
)

In [25]:
# Load the feature table and the label table; in both files the first CSV
# column is used as the index and the first row as the header.
X_train, y_train = (
    pd.read_csv(base_data_folder + csv_name, index_col=0, header=0)
    for csv_name in ('X_train_total.csv', 'y_train_total.csv')
)
# Display both shapes to confirm the two tables have the same number of rows.
X_train.shape, y_train.shape

((14694, 13), (14694, 1))

## 2. Undersampling

In [45]:
# Target class ratio, passed as `sampling_strategy` to both the undersampler
# and the oversampler below. The save cell also uses it (`ratio < 1`) to pick
# the output folder name.
ratio = 1

In [46]:
# Shape of the label rows whose Class equals 1; the first entry is the number
# of such samples available before resampling.
y_train.loc[y_train['Class'].eq(1)].shape

(16, 1)

In [47]:
# Random undersampling with the configured target ratio; the seed is fixed so
# the same rows are selected on every run.
rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy=ratio,
)
X_resampled, y_resampled = rus.fit_resample(
    X_train,
    y_train['Class'].to_numpy(),
)
# Report how many samples of each class remain.
print('Resampled dataset shape %s' % (Counter(y_resampled),))

Resampled dataset shape Counter({0: 16, 1: 16})


In [48]:
# Inspect the dimensions of the undersampled feature matrix.
X_resampled.shape

(32, 13)

In [49]:
# Inspect the length of the undersampled label vector; it should equal the
# row count of X_resampled.
y_resampled.shape

(32,)

## 3. Oversampling

In [50]:
from imblearn.over_sampling import ADASYN

In [51]:
# ADASYN oversampling with the same target ratio and seed as the undersampler.
# NOTE(review): the recorded output shows the same class counts before and
# after this step, so with the same `ratio` it appears to add no samples here.
# Confirm that this is intended.
adasyn = ADASYN(
    sampling_strategy=ratio,
    random_state=42,
)
X_resampled_total, y_resampled_total = adasyn.fit_resample(
    X_resampled,
    y_resampled,
)
# Report the final number of samples per class.
print('Resampled dataset shape %s' % (Counter(y_resampled_total),))

Resampled dataset shape Counter({0: 16, 1: 16})


In [52]:
# Combine the resampled features and labels into one table, with the labels
# stored in a "Class" column.
total_patients = pd.DataFrame(X_resampled_total).assign(Class=y_resampled_total)

## 4. Dataset merge and Data preparation for disk writing

In [53]:
# Combine the resampled features and labels into one table, with the labels
# stored in a "Class" column.
# NOTE(review): this cell repeats the preceding cell verbatim.
total_patients = pd.DataFrame(X_resampled_total).assign(Class=y_resampled_total)

In [54]:
# Output column schema: 13 feature columns followed by the "Class" label.
# The spellings are kept exactly as they are, since these strings become the
# CSV headers.
columns_names = [
    'age',
    'weight',
    'height',
    'HIPX',
    'smoking',
    'ReumatoidArthritis',
    'SecondaryOsteoporsis',
    'Alcohol',
    'VitaminD',
    'calcium',
    'dose_walk',
    'dose_moderate',
    'dose_vigorous',
    'Class',
]

In [55]:
# Relabel the columns positionally with the fixed schema in `columns_names`;
# the table must have exactly as many columns as there are names.
total_patients.columns = columns_names

In [56]:
# Shuffle all rows (frac=1) and renumber the index from 0.
# The shuffle is seeded with the same value as the samplers' random_state so
# that the saved training set is identical on every run.
total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

In [57]:
# Feature matrix: every column except the last one, which holds the label.
X_train_resampled = total_patients.iloc[:, :-1]
X_train_resampled

Unnamed: 0,age,weight,height,HIPX,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous
0,-1.08891,-0.018847,-0.568335,-0.037838,2.330176,-0.08895,-0.134216,-0.788571,0.888033,-0.953512,-0.281561,-0.611945,-0.538945
1,-0.468139,-0.898681,-1.329892,-0.037838,0.760768,-0.08895,-0.134216,0.237273,-0.131078,-0.986199,-0.259297,-0.238395,-0.415524
2,-1.709681,1.096886,-0.459541,-0.037838,0.760768,-0.08895,-0.134216,-0.465849,-0.308787,-0.084309,-0.459667,-0.462525,-0.415524
3,-1.709681,-0.446013,-0.024365,-0.037838,2.330176,-0.08895,-0.134216,0.930781,0.108287,-0.042546,0.675763,1.479936,0.849545
4,-1.08891,-1.682882,-1.112304,-0.037838,-0.80864,-0.08895,-0.134216,0.140227,-0.207239,-0.586825,-0.682301,-0.587042,-0.538945
5,-1.213064,-0.675535,-0.677128,-0.037838,2.330176,-0.08895,-0.134216,3.325332,-0.384949,0.658671,-0.259297,-0.487428,-0.538945
6,-0.964756,-0.560774,-1.00351,-0.037838,0.760768,-0.08895,-0.134216,-0.788571,-1.008746,-0.513134,0.208233,-0.462525,-0.168681
7,-0.964756,0.038534,-0.677128,-0.037838,-0.80864,-0.08895,-0.134216,0.747677,2.984282,-0.404177,-0.415141,-0.362912,0.294149
8,-1.709681,-1.045321,-0.024365,-0.037838,0.760768,-0.08895,-0.134216,-0.2919,0.006738,-0.07814,0.208233,1.479936,-0.106971
9,-0.592293,-1.950658,-2.309037,-0.037838,0.760768,-0.08895,-0.134216,-0.071717,-0.914451,1.239176,-0.548721,-0.512332,-0.415524


In [58]:
# Label table: the "Class" column kept as a one-column DataFrame rather than a
# Series.
y_train_resampled = total_patients[['Class']]
y_train_resampled

Unnamed: 0,Class
0,1
1,0
2,0
3,1
4,0
5,1
6,0
7,0
8,1
9,0


## 5. Save to file

In [59]:
# Choose the output location from the run configuration and the target ratio,
# then write the balanced training set to disk.
#
# NOTE: `base_data_folder +=` below modifies the shared path variable, so
# re-running this cell without re-running the configuration cell appends the
# sub-folder a second time.
if starting_ratio == '':
    # No starting-ratio sub-path was configured: the files go directly into a
    # `starting_ratio_*` folder, with no extra per-ratio level. `ratio_folder`
    # was previously left undefined on this path, which made the writes below
    # fail with NameError on a fresh kernel.
    base_data_folder += 'starting_ratio_05/' if ratio < 1 else 'starting_ratio_1/'
    ratio_folder = ''
else:
    # base_data_folder already includes the configured starting_ratio and
    # percent_ratio sub-path; the files go into a `ratio_*` folder beneath it.
    ratio_folder = 'ratio_05/' if ratio < 1 else 'ratio_1/'

output_folder = base_data_folder + ratio_folder

# Full table (features + Class), then features and labels as separate files.
total_patients.to_csv(output_folder + 'total_patients_trainset_balanced.csv')
X_train_resampled.to_csv(output_folder + 'X_train.csv')
y_train_resampled.to_csv(output_folder + 'y_train.csv')

In [62]:
# Disabled batch variant of the pipeline above, kept inside a string literal
# so it is not executed. For each of the 10..50 percent folders under
# starting_ratio_05, and for each target ratio in [0.5, 1], it repeats:
# load -> undersample -> ADASYN -> shuffle -> split -> save.
# NOTE(review): both loops use `i` as their loop variable, so the inner loop
# rebinds the outer one. This is harmless as written, because percent_ratio
# and base_data_folder are computed before the inner loop starts, but rename
# one of them before re-enabling this code.
'''
for i in range (1,6):
    starting_ratio = '05'
    percent_ratio = str(i)+'0'
    base_data_folder = '../../Data/female_patients_no_menopause/starting_ratio_'+starting_ratio+'/'+percent_ratio+'_percent/'
    
    X_train = pd.read_csv(base_data_folder+'X_train_total.csv', index_col=0, header=0)
    y_train = pd.read_csv(base_data_folder+'y_train_total.csv', index_col=0, header=0)
    X_train.shape, y_train.shape
    
    for i in [0.5,1]:
        
        ratio = i

        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train['Class'].values)

        adasyn = ADASYN(random_state=42, sampling_strategy=ratio)
        X_resampled_total, y_resampled_total = adasyn.fit_resample(X_resampled, y_resampled)

        total_patients = pd.DataFrame(X_resampled_total)
        total_patients["Class"] = y_resampled_total

        columns_names = ['age', 'weight', 'height', 'HIPX', 'smoking',
           'ReumatoidArthritis', 'SecondaryOsteoporsis', 'Alcohol', 'VitaminD',
           'calcium', 'dose_walk', 'dose_moderate', 'dose_vigorous','Class']
        total_patients.columns = columns_names
        total_patients = total_patients.sample(frac=1).reset_index(drop=True)

        X_train_resampled = total_patients.iloc[:,:total_patients.shape[1]-1]

        y_train_resampled = pd.DataFrame(total_patients['Class'], columns=['Class'])

        if ratio < 1:
            ratio_folder = 'ratio_05/'
        else:
            ratio_folder = 'ratio_1/'
        total_patients.to_csv(base_data_folder+ratio_folder+'total_patients_trainset_balanced.csv')
        X_train_resampled.to_csv(base_data_folder+ratio_folder+'X_train.csv')
        y_train_resampled.to_csv(base_data_folder+ratio_folder+'y_train.csv')
'''