# Data Balancing - Random undersampling of class 0 and oversampling of class 1

In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler

## 1. Load the Data

In [19]:
# Optional sub-folders appended to the data root; both are empty here, so the
# root folder itself is used.
starting_ratio = ''  # alternative: 'starting_ratio_05/'
percent_ratio = ''  # alternative: '10_percent/'
base_data_folder = ''.join(('../../Data/other_patients/', starting_ratio, percent_ratio))

In [20]:
# Read the training features and labels; the first CSV column becomes the index
# and the first row supplies the column names.
features_csv = base_data_folder + 'X_train_total.csv'
labels_csv = base_data_folder + 'y_train_total.csv'
X_train = pd.read_csv(features_csv, header=0, index_col=0)
y_train = pd.read_csv(labels_csv, header=0, index_col=0)
(X_train.shape, y_train.shape)

((62201, 14), (62201, 1))

## 2. Undersampling

In [21]:
# Target class ratio, passed as `sampling_strategy` to both resamplers below.
ratio = 1

In [22]:
# Shape of the label frame restricted to rows where Class == 1.
is_class_1 = y_train['Class'] == 1
y_train.loc[is_class_1].shape

(94, 1)

In [23]:
# Randomly under-sample with the configured `ratio`, using a fixed seed so the
# selection is repeatable, then report the resulting class counts.
rus = RandomUnderSampler(random_state=42, sampling_strategy=ratio)
class_labels = y_train['Class'].values
X_resampled, y_resampled = rus.fit_resample(X_train, class_labels)
print('Resampled dataset shape %s' % (Counter(y_resampled),))

Resampled dataset shape Counter({0: 94, 1: 94})


In [24]:
# Sanity check: dimensions of the feature matrix after under-sampling.
X_resampled.shape

(188, 14)

In [25]:
# Sanity check: dimensions of the label array after under-sampling.
y_resampled.shape

(188,)

## 3. Oversampling

In [26]:
# Over-sample with ADASYN using the same `ratio` and a fixed seed, then report
# the resulting class counts.
# NOTE(review): the recorded run shows identical class counts before and after
# this step (94/94), i.e. no synthetic samples were added — confirm that using
# the same ratio for both samplers is intended.
adasyn = ADASYN(sampling_strategy=ratio, random_state=42)
X_resampled_total, y_resampled_total = adasyn.fit_resample(X_resampled, y_resampled)
print('Resampled dataset shape %s' % (Counter(y_resampled_total),))

Resampled dataset shape Counter({0: 94, 1: 94})


In [27]:
# Wrap the resampled features in a DataFrame and attach the labels as a 'Class' column.
total_patients = pd.DataFrame(X_resampled_total)
total_patients["Class"] = y_resampled_total

## 4. Dataset merge and data preparation for disk writing

In [28]:
# NOTE(review): this cell repeats the previous code cell verbatim, so it looks
# redundant — confirm and keep only one copy.
# Wrap the resampled features in a DataFrame and attach the labels as a 'Class' column.
total_patients = pd.DataFrame(X_resampled_total)
total_patients["Class"] = y_resampled_total

In [29]:
# Explicit column labels for the merged frame: the 14 feature names followed by
# the target column.
# NOTE(review): assumes this order matches the columns of X_train_total.csv — confirm.
feature_names = [
    'sex', 'age', 'weight', 'height',
    'HIPX', 'smoking', 'ReumatoidArthritis', 'SecondaryOsteoporsis',
    'Alcohol', 'VitaminD', 'calcium',
    'dose_walk', 'dose_moderate', 'dose_vigorous',
]
columns_names = feature_names + ['Class']

In [30]:
# Label the merged frame with the explicit names (14 features + 'Class').
total_patients.columns = columns_names

In [31]:
# Shuffle the rows so the classes are interleaved, then renumber the index.
# The shuffle is seeded (same seed as the samplers) so the files written later
# are identical across runs; an unseeded sample() gives a new order every time.
total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

In [32]:
# Features are every column except the last one ('Class').
X_train_resampled = total_patients.iloc[:, :-1]
X_train_resampled

Unnamed: 0,sex,age,weight,height,HIPX,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous
0,0.971362,-1.709681,-0.854052,-0.024365,-0.037838,-0.808640,-0.08895,-0.134216,-0.193023,-0.239879,-0.374245,-0.593247,-0.562138,-0.230392
1,-1.029482,-1.957990,-0.458764,0.084429,-0.037838,-0.808640,-0.08895,-0.134216,-0.788571,-0.863677,-0.382562,-0.392877,-0.611945,-0.538945
2,0.971362,1.394174,-0.235618,-0.241953,-0.037838,0.760768,-0.08895,-0.134216,-0.788571,1.062116,-0.968755,-0.504194,0.433996,1.404940
3,0.971362,-0.095676,1.613310,1.172367,-0.037838,0.760768,-0.08895,-0.134216,4.155254,-0.511884,-0.957885,-0.548721,-0.611945,-0.538945
4,0.971362,1.270020,-0.739291,0.954780,-0.037838,-0.808640,-0.08895,-0.134216,-0.513914,1.892637,1.939660,1.143293,0.172510,0.325004
5,0.971362,0.897557,0.070412,1.389955,-0.037838,2.330176,-0.08895,-0.134216,3.144058,0.046632,2.522314,-0.392877,-0.562138,-0.538945
6,0.971362,0.028478,2.888433,1.172367,-0.037838,0.760768,-0.08895,-0.134216,-0.785367,0.064766,0.140077,-0.548721,1.479936,-0.446379
7,0.971362,-0.840602,0.873739,0.845986,-0.037838,-0.808640,-0.08895,-0.134216,-0.614622,-0.653326,0.557070,-0.259297,0.135155,0.201583
8,0.971362,-0.343985,0.542207,0.845986,-0.037838,0.760768,-0.08895,-0.134216,0.104979,-0.254386,-0.485932,-0.660037,-0.387815,1.682638
9,0.971362,1.270020,-0.025222,-0.133159,-0.037838,-0.808640,-0.08895,-0.134216,-0.605467,-0.417589,1.356955,-0.103454,0.184962,0.016451


In [33]:
# Labels as a single-column DataFrame named 'Class'.
y_train_resampled = total_patients['Class'].to_frame(name='Class')
y_train_resampled

Unnamed: 0,Class
0,1
1,0
2,1
3,0
4,1
5,1
6,1
7,0
8,1
9,0


## 5. Save to file

In [34]:
# Choose the output sub-folder from the configuration, then write the balanced
# training set (full frame, features only, labels only) as CSV files.
#
# The destination is built in `output_folder` rather than by appending to
# `base_data_folder`: appending in place made this cell non-idempotent (each
# re-run stacked another sub-folder) and silently changed the input path used
# by the loading cell.
if starting_ratio == '':
    # No starting-ratio sub-folder configured: the ratio selects 'starting_ratio_*/'.
    ratio_folder = ''
    output_folder = base_data_folder + ('starting_ratio_05/' if ratio < 1 else 'starting_ratio_1/')
else:
    # Already inside a starting-ratio folder: the ratio selects 'ratio_*/'.
    ratio_folder = 'ratio_05/' if ratio < 1 else 'ratio_1/'
    output_folder = base_data_folder + ratio_folder

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_folder, exist_ok=True)

total_patients.to_csv(output_folder + 'total_patients_trainset_balanced.csv')
X_train_resampled.to_csv(output_folder + 'X_train.csv')
y_train_resampled.to_csv(output_folder + 'y_train.csv')

In [2]:
# Batch run: for every percent sub-folder ('10_percent' .. '50_percent') under
# 'starting_ratio_05', rebuild the balanced training set at ratio 0.5 and at
# ratio 1 and write each result into its own 'ratio_*/' sub-folder.

# Column labels are the same for every iteration, so define them once.
columns_names = ['sex', 'age', 'weight', 'height', 'HIPX', 'smoking',
                 'ReumatoidArthritis', 'SecondaryOsteoporsis', 'Alcohol', 'VitaminD',
                 'calcium', 'dose_walk', 'dose_moderate', 'dose_vigorous', 'Class']

for j in range(1, 6):
    print(j)  # progress indicator
    starting_ratio = '05'
    percent_ratio = str(j) + '0'
    base_data_folder = ('../../Data/other_patients/starting_ratio_' + starting_ratio
                        + '/' + percent_ratio + '_percent/')

    X_train = pd.read_csv(base_data_folder + 'X_train_total.csv', index_col=0, header=0)
    y_train = pd.read_csv(base_data_folder + 'y_train_total.csv', index_col=0, header=0)

    for ratio in (0.5, 1):
        # Under-sample, then over-sample, both with the same target ratio and seed.
        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train['Class'].values)

        adasyn = ADASYN(random_state=42, sampling_strategy=ratio)
        X_resampled_total, y_resampled_total = adasyn.fit_resample(X_resampled, y_resampled)

        # Merge features and labels into one labelled frame.
        total_patients = pd.DataFrame(X_resampled_total)
        total_patients["Class"] = y_resampled_total
        total_patients.columns = columns_names

        # Seeded shuffle so the written files are reproducible across runs.
        total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

        X_train_resampled = total_patients.iloc[:, :total_patients.shape[1] - 1]
        y_train_resampled = pd.DataFrame(total_patients['Class'], columns=['Class'])

        ratio_folder = 'ratio_05/' if ratio < 1 else 'ratio_1/'
        output_folder = base_data_folder + ratio_folder
        # to_csv does not create missing directories, so make sure the target exists.
        os.makedirs(output_folder, exist_ok=True)

        total_patients.to_csv(output_folder + 'total_patients_trainset_balanced.csv')
        X_train_resampled.to_csv(output_folder + 'X_train.csv')
        y_train_resampled.to_csv(output_folder + 'y_train.csv')

1
2
3
4
5
