# Data Balancing - Random undersampling of class 0 and oversampling of class 1

In [20]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

## 1. Load the Data

In [2]:
# Optional sub-folder segments of the data path; both are left empty here so
# the files are read directly from the male-patients folder.
starting_ratio = ''  # alternative: 'starting_ratio_05/'
percent_ratio = ''  # alternative: '10_percent/'
base_data_folder = f'../../Data/male_patients/{starting_ratio}{percent_ratio}'

In [3]:
# Read the training features and labels from the configured data folder.
# Both files use their first column as the row index and their first row as
# the header.
csv_options = {'index_col': 0, 'header': 0}
X_train = pd.read_csv(base_data_folder + 'X_train_total.csv', **csv_options)
y_train = pd.read_csv(base_data_folder + 'y_train_total.csv', **csv_options)
(X_train.shape, y_train.shape)

((47505, 13), (47505, 1))

## 2. Undersampling

In [4]:
# Value passed as `sampling_strategy` to both RandomUnderSampler and ADASYN
# in the cells below; the save step also uses `ratio < 1` to pick the output
# folder name.
ratio = 1

In [5]:
# Show how many training labels belong to class 1 before any resampling.
is_class_one = y_train['Class'] == 1
y_train[is_class_one].shape

(79, 1)

In [6]:
# Randomly under-sample the training set with a fixed seed, then report the
# resulting number of samples per class.
rus = RandomUnderSampler(random_state=42, sampling_strategy=ratio)
train_labels = y_train['Class'].values
X_resampled, y_resampled = rus.fit_resample(X_train, train_labels)
class_counts = Counter(y_resampled)
print('Resampled dataset shape %s' % class_counts)

Resampled dataset shape Counter({0: 79, 1: 79})


In [7]:
# Display the dimensions of the under-sampled feature set.
X_resampled.shape

(158, 13)

In [8]:
# Display the dimensions of the under-sampled label array.
y_resampled.shape

(158,)

## 3. Oversampling

In [9]:
from imblearn.over_sampling import ADASYN

In [10]:
# Apply ADASYN over-sampling (fixed seed) to the under-sampled data and
# report the resulting number of samples per class.
# NOTE(review): this step reuses the same `ratio` as the under-sampler, and
# the recorded output shows the same class counts before and after it
# (79 vs 79) -- confirm that no synthetic samples are expected here.
adasyn = ADASYN(sampling_strategy=ratio, random_state=42)
oversampled = adasyn.fit_resample(X_resampled, y_resampled)
X_resampled_total, y_resampled_total = oversampled
total_class_counts = Counter(y_resampled_total)
print('Resampled dataset shape %s' % total_class_counts)

Resampled dataset shape Counter({0: 79, 1: 79})


In [11]:
# Wrap the resampled features in a DataFrame and attach the resampled labels
# as a "Class" column, giving one combined table.
total_patients = pd.DataFrame(X_resampled_total)
total_patients["Class"] = y_resampled_total

## 4. Dataset merge and Data preparation for disk writing

In [12]:
# Build the combined table: resampled features plus a "Class" label column.
# NOTE(review): these two statements are identical to the cell just before
# this section -- one of the two cells looks redundant; confirm and remove.
total_patients = pd.DataFrame(X_resampled_total)
total_patients["Class"] = y_resampled_total

In [13]:
# Column labels for the combined table: the 13 feature names followed by the
# label column.
feature_names = [
    'age',
    'weight',
    'height',
    'HIPX',
    'smoking',
    'ReumatoidArthritis',
    'SecondaryOsteoporsis',
    'Alcohol',
    'VitaminD',
    'calcium',
    'dose_walk',
    'dose_moderate',
    'dose_vigorous',
]
columns_names = feature_names + ['Class']

In [14]:
# Relabel every column of the combined table with the 14 names defined above
# (13 features followed by "Class").
total_patients.columns = columns_names

In [15]:
# Shuffle all rows and renumber the index from 0. The fixed seed makes the
# shuffle reproducible across runs, in line with the seeded samplers used
# earlier in this notebook (random_state=42).
total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
# Feature matrix: every column of the combined table except the last one,
# which holds the "Class" label.
X_train_resampled = total_patients.iloc[:, :-1]
X_train_resampled

Unnamed: 0,age,weight,height,HIPX,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous
0,1.021712,0.293558,1.825131,-0.037838,-0.808640,-0.08895,-0.134216,0.638729,1.120144,-0.099982,-0.192507,-0.462525,-0.538945
1,-1.088910,0.013031,0.030032,-0.037838,0.760768,-0.08895,-0.134216,0.626828,-0.591672,-0.337893,-0.682301,-0.574590,-0.538945
2,-0.343985,0.503954,0.302016,-0.037838,-0.808640,-0.08895,-0.134216,-0.341796,0.859019,-0.151250,0.275023,-0.163685,-0.353813
3,-1.709681,1.186144,0.519604,-0.037838,-0.808640,-0.08895,-0.134216,-0.540464,0.978701,-0.295549,0.208233,2.376457,-0.415524
4,0.400940,1.568681,1.389955,-0.037838,0.760768,-0.08895,-0.134216,-0.788571,-0.457483,1.440101,-0.259297,-0.313105,-0.168681
5,1.642483,-0.363130,0.193223,-0.037838,-0.808640,-0.08895,-0.134216,-0.788571,1.352254,0.450187,0.074653,1.479936,-0.477234
6,-1.833836,0.427446,0.845986,-0.037838,2.330176,-0.08895,-0.134216,-0.788571,-0.566285,-0.760676,-0.392877,-0.611945,-0.538945
7,-1.833836,-0.943311,0.084429,-0.037838,0.760768,-0.08895,-0.134216,-0.193023,-0.704101,1.514880,-0.548721,0.060445,0.016451
8,-0.964756,0.548583,1.498749,-0.037838,0.760768,-0.08895,-0.134216,0.998072,2.204535,0.665446,0.208233,-0.537235,-0.446379
9,0.152632,0.899242,0.193223,-0.037838,0.760768,-0.08895,-0.134216,0.104979,-0.501004,0.060874,-0.615511,-0.362912,-0.168681


In [17]:
# Label table: a single-column DataFrame holding only "Class".
y_train_resampled = total_patients[['Class']]
y_train_resampled

Unnamed: 0,Class
0,1
1,0
2,0
3,1
4,0
5,1
6,1
7,0
8,0
9,0


## 5. Save to file

In [18]:
import os

# Choose the output directory from the sampling ratio:
#   * no starting-ratio folder configured -> '<base>/starting_ratio_05|1/'
#   * starting-ratio folder configured    -> '<base>/ratio_05|1/'
# The path is built in a separate variable instead of appending to
# `base_data_folder`, so running this cell more than once no longer nests the
# sub-folder repeatedly.
ratio_suffix = '05/' if ratio < 1 else '1/'
if starting_ratio == '':
    ratio_folder = ''
    output_folder = base_data_folder + 'starting_ratio_' + ratio_suffix
else:
    ratio_folder = 'ratio_' + ratio_suffix
    output_folder = base_data_folder + ratio_folder

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_folder, exist_ok=True)

# Write the combined table plus the separate feature and label tables.
total_patients.to_csv(output_folder + 'total_patients_trainset_balanced.csv')
X_train_resampled.to_csv(output_folder + 'X_train.csv')
y_train_resampled.to_csv(output_folder + 'y_train.csv')

In [23]:
import os

# Batch version of the pipeline above: for each of the 10..50 percent datasets
# under 'starting_ratio_1/', under-sample, over-sample, shuffle and save the
# balanced training set once per sampling ratio (0.5 and 1).

# The column labels are the same for every dataset, so define them once.
columns_names = ['age', 'weight', 'height', 'HIPX', 'smoking',
                 'ReumatoidArthritis', 'SecondaryOsteoporsis', 'Alcohol', 'VitaminD',
                 'calcium', 'dose_walk', 'dose_moderate', 'dose_vigorous', 'Class']

for percent_step in range(1, 6):
    starting_ratio = '1'
    percent_ratio = str(percent_step) + '0'
    base_data_folder = '../../Data/male_patients/starting_ratio_'+starting_ratio+'/'+percent_ratio+'_percent/'

    X_train = pd.read_csv(base_data_folder+'X_train_total.csv', index_col=0, header=0)
    y_train = pd.read_csv(base_data_folder+'y_train_total.csv', index_col=0, header=0)

    # A dedicated loop variable avoids shadowing the outer dataset index.
    for ratio in [0.5, 1]:

        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train['Class'].values)

        adasyn = ADASYN(random_state=42, sampling_strategy=ratio)
        X_resampled_total, y_resampled_total = adasyn.fit_resample(X_resampled, y_resampled)

        # Combine features and labels into one table with named columns.
        total_patients = pd.DataFrame(X_resampled_total)
        total_patients["Class"] = y_resampled_total
        total_patients.columns = columns_names

        # Seeded shuffle so the saved row order is reproducible, like the
        # seeded samplers above.
        total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

        # Split back into features (all but the last column) and labels.
        X_train_resampled = total_patients.iloc[:,:total_patients.shape[1]-1]
        y_train_resampled = pd.DataFrame(total_patients['Class'], columns=['Class'])

        if ratio < 1:
            ratio_folder = 'ratio_05/'
        else:
            ratio_folder = 'ratio_1/'

        # to_csv does not create missing directories, so make sure the
        # per-ratio output folder exists before writing.
        output_folder = base_data_folder + ratio_folder
        os.makedirs(output_folder, exist_ok=True)

        total_patients.to_csv(output_folder+'total_patients_trainset_balanced.csv')
        X_train_resampled.to_csv(output_folder+'X_train.csv')
        y_train_resampled.to_csv(output_folder+'y_train.csv')