# Data Balancing - Undersampling class 0 and oversampling class 1

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import AllKNN

## 1. Load the Data

In [2]:
# Read the training features and labels; in both files the first CSV column
# is used as the row index and the first row as the header.
X_train, y_train = (
    pd.read_csv(csv_path, index_col=0, header=0)
    for csv_path in ('../Data/X_train_total.csv', '../Data/y_train_total.csv')
)
# Display both shapes as the cell output.
(X_train.shape, y_train.shape)

((92330, 23), (92330, 1))

## 2. Undersampling

In [8]:
# Feature matrix as a plain NumPy array (column labels are dropped here).
X = X_train.to_numpy()

In [9]:
# Target labels: the 'Class' column of the label frame, kept as a Series.
y = y_train.loc[:, 'Class']

In [10]:
# Undersample with AllKNN, targeting only the majority class.
# n_neighbors is currently 669; the original author's note says it was 681 before.
allknn = AllKNN(
    sampling_strategy='majority',
    n_neighbors=669,
    n_jobs=-1,
)
X_resampled, y_resampled = allknn.fit_resample(X, y)

# Report the per-class sample counts after undersampling, ordered by label.
counts_after_under = Counter(y_resampled)
print(sorted(counts_after_under.items()))

[(0, 933), (1, 162)]


In [11]:
# Sanity check: shape of the feature matrix returned by the undersampler.
X_resampled.shape

(1095, 23)

In [12]:
# Sanity check: shape of the label vector returned by the undersampler.
y_resampled.shape

(1095,)

## 3. Oversampling

In [13]:
from imblearn.over_sampling import ADASYN

In [15]:
# Oversample the undersampled data with ADASYN; the seed is fixed at 42.
adasyn = ADASYN(random_state=42)
X_resampled_total, y_resampled_total = adasyn.fit_resample(
    X_resampled,
    y_resampled,
)

# Report the per-class sample counts after oversampling.
counts_after_over = Counter(y_resampled_total)
print('Resampled dataset shape %s' % (counts_after_over,))

Resampled dataset shape Counter({1: 936, 0: 933})


## 4. Dataset merge

In [16]:
# Merge the resampled features and labels into one table: the features become
# positionally-labelled columns and the labels are appended as a "Class" column.
total_patients = pd.DataFrame(X_resampled_total).assign(Class=y_resampled_total)

In [22]:
# Header for the merged table: the 23 feature labels followed by the target
# label 'Class'. Spellings are kept exactly as they are.
columns_names = [
    'sex',
    'age',
    'weight',
    'height',
    'waist',
    'HIPX',
    'menopause',
    'HRT',
    'smoking',
    'ReumatoidArthritis',
    'SecondaryOsteoporsis',
    'Alcohol',
    'Alcohol24',
    'VitaminD',
    'calcium',
    'dose_walk',
    'dose_moderate',
    'dose_vigorous',
    'dose_pleasure',
    'dose_sport',
    'dose_execise',
    'dose_lightDIY',
    'dose_heavyDIY',
    'Class',
]

In [23]:
# Relabel the merged frame's columns in place with the names in columns_names
# (pandas raises if the number of names does not match the number of columns).
total_patients.columns = columns_names

In [25]:
# Feature part of the balanced set: every column except the last one.
X_train_resampled = total_patients.iloc[:, :-1]
# Display the resulting frame as the cell output.
X_train_resampled

Unnamed: 0,sex,age,weight,height,waist,HIPX,menopause,HRT,smoking,ReumatoidArthritis,...,VitaminD,calcium,dose_walk,dose_moderate,dose_vigorous,dose_pleasure,dose_sport,dose_execise,dose_lightDIY,dose_heavyDIY
0,-1.029482,-1.709681,-1.402355,-1.656273,-1.090778,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,-0.435723,-0.394950,0.475393,0.732836,0.756979,-1.169938,-0.359069,0.755752,-0.789813,-0.654840
1,-1.029482,-1.461373,-1.606375,-0.785922,-1.846625,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,1.326867,0.177845,-0.103454,0.284575,0.571847,0.403235,2.035280,1.160955,-0.789813,-0.654840
2,-1.029482,-1.585527,-1.153706,-1.003510,-1.015193,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,...,-0.602552,-0.489876,-0.326087,-0.562138,-0.538945,0.927627,-0.359069,2.376563,-0.789813,-0.654840
3,-1.029482,-1.337219,-1.351350,-0.785922,-1.771040,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,0.365784,-0.168972,-0.392877,0.135155,0.386715,0.927627,2.035280,0.755752,0.211232,-0.654840
4,-1.029482,-1.088910,-1.242965,-0.568335,-1.241947,-0.037838,-0.695973,-0.423252,0.760768,-0.08895,...,0.804618,-0.221428,-0.259297,0.433996,-0.489577,-0.121156,-0.359069,1.836293,0.461493,-0.369569
5,-1.029482,-1.213064,-1.058072,-1.003510,-1.241947,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,-1.015999,-0.488865,-0.559852,-0.362912,2.052902,-0.383351,-0.359069,2.376563,0.336362,-0.654840
6,-1.029482,-1.213064,-1.204711,-1.547479,-0.788439,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,-0.740368,0.402028,1.276873,0.882256,-0.168681,1.976409,1.237163,1.296022,-0.789813,-0.654840
7,-1.029482,-0.964756,-1.211087,-1.438686,-1.619871,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,-0.297907,0.461461,1.143293,0.433996,-0.261247,1.189822,-0.359069,1.160955,-0.414421,0.058338
8,-1.029482,-0.840602,-1.593623,-1.221098,-1.241947,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,-0.689594,-1.176052,-0.259297,-0.462525,1.127242,0.403235,-0.359069,1.296022,1.087145,-0.654840
9,-1.029482,-1.088910,-1.606375,-1.112304,-1.771040,-0.037838,-0.695973,-0.423252,-0.808640,-0.08895,...,-0.504631,0.264100,0.208233,-0.088975,0.571847,1.189822,-0.359069,1.566158,-0.789813,-0.654840


In [28]:
# Label part of the balanced set, as a single-column frame named 'Class'.
y_train_resampled = total_patients['Class'].to_frame(name='Class')
# Display the resulting frame as the cell output.
y_train_resampled

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


## 5. Save to file

In [29]:
# Write the balanced training set to disk: the merged table first, then the
# feature and label frames as separate files (row index included in each CSV).
for frame, csv_path in (
    (total_patients, '../Data/total_patients_trainset_balanced.csv'),
    (X_train_resampled, '../Data/X_train.csv'),
    (y_train_resampled, '../Data/y_train.csv'),
):
    frame.to_csv(csv_path)