# Data Balancing - Random undersampling of class 0 and ADASYN oversampling of class 1

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

## 1. Load the Data

In [2]:
# Sub-folder appended to '../../Data/women_menopause/' when reading the input
# CSVs and when writing the balanced output. Keep the trailing slash: the
# paths are built by plain string concatenation.
ratio_folder = "ratio05/"
# Leave empty for the first run
# NOTE(review): presumably this means ratio_folder = "" on the first run - confirm.

In [3]:
# Both input files sit in the same ratio-specific folder, so build that
# prefix once and reuse it.
train_dir = '../../Data/women_menopause/' + ratio_folder

# The first CSV column is used as the index and the first row as the header.
X_train = pd.read_csv(train_dir + 'X_train_total.csv', index_col=0, header=0)
y_train = pd.read_csv(train_dir + 'y_train_total.csv', index_col=0, header=0)

# Show both shapes side by side to check that the row counts agree.
X_train.shape, y_train.shape

((30129, 14), (30129, 1))

## 2. Undersampling

In [4]:
# Value passed as `sampling_strategy` to both RandomUnderSampler and ADASYN
# in the cells below. With 1, the printed class counts come out equal.
ratio = 1

In [5]:
# Shape of the label rows whose Class is 1, i.e. how many class-1 samples
# are available before any resampling.
y_train.loc[y_train['Class'].eq(1)].shape

(1977, 1)

In [6]:
# Random undersampling driven by `ratio`; the fixed seed makes the selected
# rows repeatable. Labels are passed as a plain array taken from the 'Class'
# column.
rus = RandomUnderSampler(random_state=42, sampling_strategy=ratio)
X_resampled, y_resampled = rus.fit_resample(
    X_train,
    y_train['Class'].values,
)

# Report the per-class counts after undersampling.
undersampled_counts = Counter(y_resampled)
print('Resampled dataset shape %s' % undersampled_counts)

Resampled dataset shape Counter({0: 1977, 1: 1977})


In [7]:
# Shape of the undersampled features; the row count should match the sum of
# the class counts printed by the undersampling cell.
X_resampled.shape

(3954, 14)

In [8]:
# Shape of the undersampled labels; its length should equal the number of
# rows in X_resampled.
y_resampled.shape

(3954,)

## 3. Oversampling

In [9]:
from imblearn.over_sampling import ADASYN

In [10]:
# ADASYN oversampling driven by the same `ratio`, seeded for repeatability.
# NOTE(review): the input was already resampled with this ratio in the
# undersampling step and the counts printed here match the earlier ones, so
# this step appears to add no rows for ratio = 1 - confirm that is intended.
adasyn = ADASYN(sampling_strategy=ratio, random_state=42)
X_resampled_total, y_resampled_total = adasyn.fit_resample(
    X_resampled,
    y_resampled,
)

# Report the per-class counts after oversampling.
oversampled_counts = Counter(y_resampled_total)
print('Resampled dataset shape %s' % oversampled_counts)

Resampled dataset shape Counter({0: 1977, 1: 1977})


In [11]:
# Combine the resampled features and labels into one frame, with the labels
# in a trailing "Class" column. `.assign` returns a new frame instead of
# inserting the column in place, so `X_resampled_total` is left untouched and
# re-running this cell always produces the same result.
total_patients = pd.DataFrame(X_resampled_total).assign(Class=y_resampled_total)

## 4. Dataset merge and Data preparation for disk writing

In [12]:
# Build the merged dataset: resampled features plus a trailing "Class" label
# column. The frame is rebuilt from `X_resampled_total` / `y_resampled_total`
# on every execution and `.assign` returns a new frame, so the cell is
# idempotent and does not modify its inputs even if `total_patients` already
# exists.
total_patients = pd.DataFrame(X_resampled_total).assign(Class=y_resampled_total)

In [13]:
# Names for the 14 feature columns, in the order they are applied to the
# merged frame. The spellings are the dataset's own identifiers and are kept
# exactly as they are.
feature_names = [
    'age',
    'weight',
    'height',
    'HIPX',
    'HRT',
    'smoking',
    'ReumatoidArthritis',
    'SecondaryOsteoporsis',
    'Alcohol',
    'VitaminD',
    'calcium',
    'dose_walk',
    'dose_moderate',
    'dose_vigorous',
]

# Full header for the merged frame: the features followed by the label.
columns_names = feature_names + ['Class']

In [14]:
# Replace the column labels positionally with the explicit header list
# (pandas raises if the lengths differ).
# NOTE(review): assumes the order of columns_names matches the column order
# of the resampled features - confirm against X_train.columns.
total_patients.columns = columns_names

In [15]:
# Shuffle all rows (frac=1) and renumber the index from 0.
# The shuffle is seeded with the same value as the samplers (42), so the row
# order, and therefore the CSV files written afterwards, is identical on
# every Restart & Run All.
total_patients = total_patients.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
# Features are every column except the last one, which holds the label.
X_train_resampled = total_patients.iloc[:, :-1]
X_train_resampled

Unnamed: 0,age,weight,height,HIPX,HRT,smoking,ReumatoidArthritis,SecondaryOsteoporsis,Alcohol,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous
0,-1.213064,0.044909,-0.350747,-0.037838,-0.423252,2.330176,-0.088950,-0.134216,-0.788571,-0.094810,-1.020327,-0.259297,-0.487428,-0.168681
1,0.897557,-0.490642,-0.241953,-0.037838,-0.423252,-0.808640,-0.088950,-0.134216,-0.788571,-0.878184,1.088254,2.078353,0.583416,5.015013
2,1.642483,-1.982536,-1.982655,-0.037838,-0.423252,-0.808640,-0.088950,-0.134216,-0.788571,-0.239879,1.288851,-0.459667,-0.611945,-0.538945
3,0.897557,-0.509769,0.628398,-0.037838,-0.423252,-0.808640,-0.088950,-0.134216,-0.788571,-0.399455,-0.083475,0.052389,1.181096,-0.353813
4,-0.219831,-1.249340,0.193223,-0.037838,2.362657,-0.808640,-0.088950,-0.134216,-0.373381,-0.363188,-0.103446,-0.225902,-0.238395,0.016451
5,0.525095,0.911993,-1.438686,-0.037838,2.362657,0.760768,-0.088950,-0.134216,-0.788571,-0.131078,0.355943,-0.660037,-0.611945,-0.538945
6,0.525095,-0.752042,-1.112304,-0.037838,2.362657,0.760768,-0.088950,-0.134216,-0.341796,0.692190,-0.515434,0.208233,1.479936,4.644749
7,1.021712,0.178797,-1.003510,-0.037838,2.362657,0.760768,-0.088950,-0.134216,0.749508,-0.889064,2.524210,-0.660037,-0.350460,-0.538945
8,0.649249,-0.713789,-1.438686,-0.037838,-0.423252,0.760768,-0.088950,-0.134216,0.489957,-0.617059,-0.463686,-0.682301,-0.014265,-0.538945
9,1.394174,-1.332223,-1.003510,-0.037838,2.362657,-0.808640,-0.088950,-0.134216,1.518547,-0.330548,-0.789191,-0.259297,0.060445,-0.353813


In [17]:
# Labels as a one-column DataFrame (not a Series), so the saved file has a
# 'Class' header.
y_train_resampled = total_patients[['Class']]
y_train_resampled

Unnamed: 0,Class
0,1
1,0
2,1
3,0
4,0
5,1
6,1
7,0
8,0
9,0


## 5. Save to file

In [18]:
# All outputs go under the same ratio-specific folder. Each frame is written
# with its index as the first CSV column (the to_csv default).
output_root = '../../Data/women_menopause/' + ratio_folder
outputs = [
    ('1/total_patients_trainset_balanced.csv', total_patients),
    ('1/X_train.csv', X_train_resampled),
    ('1/y_train.csv', y_train_resampled),
]
for relative_path, frame in outputs:
    frame.to_csv(output_root + relative_path)