# Data Balancing - Random undersampling of class 0 and ADASYN oversampling of class 1

In [5]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler

## 1. Load the Data

In [34]:
# Optional sub-folder components that select a dataset variant; both are left
# empty here so the data is read from the dataset root.
#   starting_ratio alternative: 'starting_ratio_05/'
#   percent_ratio alternative:  '10_percent/'
starting_ratio = ''
percent_ratio = ''
# Input folder = dataset root + the two optional components above.
base_data_folder = ''.join(('../../Data/women_menopause/', starting_ratio, percent_ratio))

In [35]:
# Read the training features and labels from base_data_folder; the first CSV
# column is used as the index and the first row as the header for both files.
X_train, y_train = (
    pd.read_csv(base_data_folder + csv_name, index_col=0, header=0)
    for csv_name in ('X_train_total.csv', 'y_train_total.csv')
)
# Display both shapes as the cell output.
(X_train.shape, y_train.shape)

((30129, 14), (30129, 1))

## 2. Undersampling

In [36]:
# Target class ratio, passed as `sampling_strategy` to both the under-sampler
# and the ADASYN over-sampler below. The save cell maps `ratio < 1` to the
# "*_05" output folders and any other value to the "*_1" folders.
ratio = 1

In [37]:
# Shape of the label frame restricted to Class == 1 rows, i.e. how many
# positive samples the training set contains.
y_train.loc[y_train['Class'].eq(1)].shape

(64, 1)

In [38]:
# Randomly under-sample the training set with a fixed seed. The labels are
# passed as a flat array taken from the "Class" column.
rus = RandomUnderSampler(random_state=42, sampling_strategy=ratio)
X_resampled, y_resampled = rus.fit_resample(
    X_train,
    y_train['Class'].to_numpy(),
)
# Report the per-class counts after under-sampling.
print('Resampled dataset shape %s' % (Counter(y_resampled),))

Resampled dataset shape Counter({0: 64, 1: 64})


In [39]:
# Sanity check: shape of the under-sampled feature matrix.
X_resampled.shape

(128, 14)

In [40]:
# Sanity check: shape of the under-sampled labels (1-D, one entry per row of X_resampled).
y_resampled.shape

(128,)

## 3. Oversampling

In [41]:
from imblearn.over_sampling import ADASYN

In [42]:
# Over-sample the under-sampled data with ADASYN, using the same seed and the
# same target ratio as the under-sampling step.
# NOTE(review): the recorded output shows identical class counts before and
# after this step for ratio = 1, so no synthetic samples appear to be added in
# that configuration — confirm this is intended.
adasyn = ADASYN(sampling_strategy=ratio, random_state=42)
X_resampled_total, y_resampled_total = adasyn.fit_resample(
    X_resampled,
    y_resampled,
)
# Report the per-class counts after over-sampling.
print('Resampled dataset shape %s' % (Counter(y_resampled_total),))

Resampled dataset shape Counter({0: 64, 1: 64})


In [43]:
# Wrap the resampled features in a DataFrame and attach the resampled labels
# as an extra "Class" column, giving one table with features + label.
total_patients = pd.DataFrame(X_resampled_total)
total_patients["Class"] = y_resampled_total

## 4. Dataset merge and Data preparation for disk writing

In [44]:
# Build the merged features + label table.
# NOTE(review): these two statements are identical to the preceding cell
# (In [43]); one of the two cells looks redundant.
total_patients = pd.DataFrame(X_resampled_total)
total_patients["Class"] = y_resampled_total

In [45]:
# Output column labels: the 14 feature columns followed by the label column.
# The spellings are kept exactly as-is because they become the CSV headers.
columns_names = [
    'age', 'weight', 'height',
    'HIPX', 'HRT', 'smoking',
    'ReumatoidArthritis', 'SecondaryOsteoporsis',
    'Alcohol', 'VitaminD', 'calcium',
    'dose_walk', 'dose_moderate', 'dose_vigorous',
    'Class',
]

In [46]:
# Relabel the columns positionally: the i-th existing column gets the i-th
# name, so the list must match the frame's column count and order
# (14 features followed by "Class").
# NOTE(review): any column names already on the frame are overwritten without
# being checked against this list — confirm the CSV column order matches.
total_patients.columns = columns_names

In [47]:
# Shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
# The shuffle is seeded with the same value as the samplers above so the row
# order of the saved CSV files is reproducible across runs.
total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

In [48]:
# Feature matrix = every column except the last one (the "Class" label).
X_train_resampled = total_patients.iloc[:, :-1]
# Display the features as the cell output.
X_train_resampled

Unnamed: 0,age,weight,height,HIPX,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous
0,0.400940,-1.478862,-2.200243,-0.037838,2.362657,0.760768,-0.088950,7.450683,-0.788571,-0.381322,1.279422,1.610823,2.525877,0.571847
1,0.152632,-0.713789,-0.894716,-0.037838,-0.423252,2.330176,-0.088950,-0.134216,-0.788571,-0.682340,0.898351,-0.392877,-0.587042,-0.538945
2,1.145866,-0.771169,0.084429,-0.037838,2.362657,0.760768,-0.088950,-0.134216,0.104979,-0.664207,-1.277071,0.441998,0.284575,-0.076115
3,0.773403,0.605963,-0.677128,-0.037838,2.362657,-0.808640,-0.088950,7.450683,-0.091858,-0.725861,-0.484946,-0.559852,-0.611945,-0.538945
4,0.525095,-0.681911,-1.438686,26.428635,-0.423252,-0.808640,-0.088950,-0.134216,-0.788571,-0.885437,-1.607430,-0.326087,2.525877,-0.353813
5,0.400940,-0.477891,-0.024365,-0.037838,-0.423252,0.760768,-0.088950,-0.134216,-0.788571,-0.432096,-0.236798,1.143293,-0.088975,0.201583
6,0.400940,-0.095354,-1.221098,-0.037838,-0.423252,-0.808640,-0.088950,-0.134216,-0.276336,-0.950718,-1.466241,-0.415141,-0.499880,-0.538945
7,0.276786,0.095914,-1.221098,-0.037838,2.362657,0.760768,11.242216,-0.134216,0.502316,-0.986985,1.418260,-0.726827,-0.537235,-0.538945
8,0.525095,-1.268467,0.084429,-0.037838,2.362657,-0.808640,-0.088950,-0.134216,-0.463560,0.336770,0.103294,-0.025532,-0.362912,0.571847
9,0.400940,-1.612750,-1.547479,-0.037838,-0.423252,0.760768,-0.088950,-0.134216,0.645138,-0.758502,-1.125845,1.143293,2.525877,-0.292102


In [49]:
# Label frame: a single-column DataFrame holding only "Class".
y_train_resampled = total_patients[['Class']]
# Display the labels as the cell output.
y_train_resampled

Unnamed: 0,Class
0,1
1,1
2,1
3,1
4,1
5,1
6,0
7,0
8,0
9,1


## 5. Save to file

In [50]:
import os

# Choose the output sub-folder from the run configuration:
#   * no starting ratio configured -> "starting_ratio_05/" or "starting_ratio_1/"
#   * starting ratio configured    -> "ratio_05/" or "ratio_1/"
# In both cases `ratio < 1` selects the "_05" variant.
if starting_ratio == '':
    ratio_folder = 'starting_ratio_05/' if ratio < 1 else 'starting_ratio_1/'
else:
    ratio_folder = 'ratio_05/' if ratio < 1 else 'ratio_1/'

# Build the destination without modifying base_data_folder, so re-running this
# cell writes to the same place instead of appending the sub-folder again.
output_folder = base_data_folder + ratio_folder
# Create the destination if it is missing; to_csv does not create directories.
os.makedirs(output_folder, exist_ok=True)

# Write the merged table plus the separate feature and label files.
total_patients.to_csv(output_folder + 'total_patients_trainset_balanced.csv')
X_train_resampled.to_csv(output_folder + 'X_train.csv')
y_train_resampled.to_csv(output_folder + 'y_train.csv')

In [7]:
import os

# Batch version of the pipeline above: for each "<N>0_percent" dataset under
# starting_ratio_1 (N = 1..5) and each target ratio (0.5 and 1), under-sample,
# over-sample with ADASYN, shuffle, and write the balanced training set to the
# matching "ratio_05/" or "ratio_1/" sub-folder.

# Output column labels: 14 features followed by the label. They are applied
# positionally and do not change between iterations, so define them once.
columns_names = ['age', 'weight', 'height', 'HIPX', 'HRT', 'smoking',
                 'ReumatoidArthritis', 'SecondaryOsteoporsis', 'Alcohol', 'VitaminD',
                 'calcium', 'dose_walk', 'dose_moderate', 'dose_vigorous', 'Class']

for j in range(1, 6):
    print(j)  # progress indicator
    starting_ratio = '1'
    percent_ratio = str(j) + '0'
    base_data_folder = ('../../Data/women_menopause/starting_ratio_' + starting_ratio
                        + '/' + percent_ratio + '_percent/')

    # First CSV column is the index, first row is the header.
    X_train = pd.read_csv(base_data_folder + 'X_train_total.csv', index_col=0, header=0)
    y_train = pd.read_csv(base_data_folder + 'y_train_total.csv', index_col=0, header=0)

    for ratio in [0.5, 1]:
        # Under-sample, then over-sample, both with the same target ratio and seed.
        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train['Class'].values)

        adasyn = ADASYN(random_state=42, sampling_strategy=ratio)
        X_resampled_total, y_resampled_total = adasyn.fit_resample(X_resampled, y_resampled)

        # Merge features and labels into one table and relabel the columns.
        total_patients = pd.DataFrame(X_resampled_total)
        total_patients["Class"] = y_resampled_total
        total_patients.columns = columns_names

        # Seeded shuffle so the saved row order is reproducible, like the
        # seeded samplers above.
        total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

        # Split back into features (all but the last column) and labels.
        X_train_resampled = total_patients.iloc[:, :total_patients.shape[1] - 1]
        y_train_resampled = pd.DataFrame(total_patients['Class'], columns=['Class'])

        ratio_folder = 'ratio_05/' if ratio < 1 else 'ratio_1/'
        output_folder = base_data_folder + ratio_folder
        # to_csv does not create directories; make sure the target exists.
        os.makedirs(output_folder, exist_ok=True)

        total_patients.to_csv(output_folder + 'total_patients_trainset_balanced.csv')
        X_train_resampled.to_csv(output_folder + 'X_train.csv')
        y_train_resampled.to_csv(output_folder + 'y_train.csv')

1
2
3
4
5
