# Handling Data imbalance Classification Model 

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

In [3]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')

In [4]:
numerical.shape

(95412, 315)

In [5]:
categorical

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,89,1,37,12,92,8,94,2,95,12,89,11.0
1,CA,14,H,M,3,L,G,A,S,1,94,1,52,2,93,10,95,12,95,12,93,10.0
2,NC,43,U,M,3,L,E,C,R,2,90,1,0,2,91,11,92,7,95,12,90,1.0
3,CA,44,U,F,3,L,E,C,R,2,87,1,28,1,87,11,94,11,95,12,87,2.0
4,FL,16,H,F,3,L,F,A,S,2,86,1,20,1,93,10,96,1,96,1,79,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,other,27,H,M,3,L,G,C,C,2,96,1,0,2,96,2,96,2,96,2,96,2.0
95408,TX,24,H,M,3,L,F,A,C,1,96,1,50,1,96,3,96,3,96,3,96,3.0
95409,MI,30,H,M,3,L,E,B,C,3,95,1,38,1,96,3,95,1,96,10,94,10.0
95410,CA,24,H,F,2,L,F,A,C,1,86,1,40,5,90,11,96,8,97,1,86,12.0


In [6]:
targets['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [7]:
 # As we can see there is a huge imbalance in the data in the representation
 # of the two categories. Category 0 is represented 90569 times and category 1 is represented 4843 times.

In [8]:
90569/len(targets['TARGET_B'])

0.9492411855951033

In [9]:
data = pd.concat([numerical, targets], axis=1)

In [10]:
# Dropping target D as this would be the target later, after we predict who is more likely to donate
data = data.drop(['TARGET_D'], axis=1)
data.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B
0,0,60.0,5,9,0,0,39,34,18,10,2,1,5,992,264,332,0,35,65,47,53,92,1,0,0,11,0,0,0,0,0,0,0,11,0,0,0,39,48,51,40,50,54,25,31,42,27,11,14,18,17,13,11,15,12,11,34,25,18,26,10,23,18,33,49,28,12,4,61,7,12,19,198,276,97,95,2,2,0,0,7,7,0,479,635,3,2,86,14,96,4,7,38,80,70,32,84,16,6,2,5,9,15,3,17,50,25,0,0,0,2,7,13,27,47,0,1,61,58,61,15,4,2,0,0,14,1,0,0,2,5,17,73,0.0,177.0,682.0,307,318,349,378,12883,13,23,23,23,15,1,0,0,1,4,25,24,26,17,2,0,0,2,28,4,51,1,46,54,3,88,8,0,0,0,0,0,0,4,1,13,14,16,2,45,56,64,50,64,44,62,53,99,0,0,9,3,8,13,9,0,3,9,3,15,19,5,4,3,0,3,41,1,0,7,13,6,5,0,4,9,4,1,3,10,2,1,7,78,2,0,120,16,10,39,21,8,4,3,5,20,3,19,4,0,0,0,18,39,0,34,23,18,16,1,4,0,23,0,0,5,1,0,0,0,0,0,2,0,3,74,88,8,0,4,96,77,19,13,31,5,14,14,31,54,46,0,0,90,0,10,0,0,0,33,65,40,99,99,6,2,10,7,27,74,6,14,240.0,31,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39,0
1,1,46.0,6,9,16,0,15,55,11,6,2,1,9,3611,940,998,99,0,0,50,50,67,0,0,31,6,4,2,6,4,14,0,0,2,0,1,4,34,41,43,32,42,45,32,33,46,21,13,14,33,23,10,4,2,11,16,36,22,15,12,1,5,4,21,75,55,23,9,69,4,3,24,317,360,99,99,0,0,0,0,0,0,0,5468,5218,12,10,96,4,97,3,9,59,94,88,55,95,5,4,1,3,5,4,2,18,44,5,0,0,0,97,98,98,98,99,94,0,83,76,73,21,5,0,0,0,4,0,0,0,91,91,91,94,4480.0,13.0,803.0,1088,1096,1026,1037,36175,2,6,2,5,15,14,13,10,33,2,5,2,5,15,14,14,10,32,6,2,66,3,56,44,9,80,14,0,0,0,0,0,0,6,0,2,24,32,12,71,70,83,58,81,57,64,57,99,99,0,22,24,4,21,13,2,1,6,0,4,1,0,3,1,0,6,13,1,2,8,18,11,4,3,4,10,7,11,1,6,2,1,16,69,5,2,160,5,5,12,21,7,30,20,14,24,4,24,10,0,0,0,8,15,0,55,10,11,0,0,2,0,3,1,1,2,3,1,1,0,3,0,0,0,42,39,50,7,27,16,99,92,53,5,10,2,26,56,97,99,0,0,0,96,0,4,0,0,0,99,0,99,99,99,20,4,6,5,12,32,6,13,47.0,3,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1,0
2,1,61.611649,3,1,2,0,20,29,33,6,8,1,1,7001,2040,2669,0,2,98,49,51,96,2,0,0,2,0,0,0,0,0,0,0,2,0,0,0,35,43,46,37,45,49,23,35,40,25,13,20,19,16,13,10,8,15,14,30,22,19,25,10,23,21,35,44,22,6,2,63,9,9,19,183,254,69,69,1,6,5,3,3,3,0,497,546,2,1,78,22,93,7,18,36,76,65,30,86,14,7,2,5,11,17,3,17,60,18,0,1,0,0,1,6,18,50,0,4,36,49,51,14,5,4,2,24,11,2,3,6,0,2,9,44,0.0,281.0,518.0,251,292,292,340,11576,32,18,20,15,12,2,0,0,1,20,19,24,18,16,2,0,0,1,28,8,31,11,38,62,8,74,22,0,0,0,0,0,2,2,1,21,19,24,6,61,65,73,59,70,56,78,62,82,99,4,10,5,2,6,12,0,1,9,5,18,20,5,7,6,0,11,33,4,3,2,12,3,3,2,0,7,8,3,3,6,7,1,8,74,3,1,120,22,20,28,16,6,5,3,1,23,1,16,6,0,0,0,10,21,0,28,23,32,8,1,14,1,5,0,0,7,0,0,0,0,0,1,0,0,2,84,96,3,0,0,92,65,29,9,22,3,12,23,50,69,31,0,0,0,6,35,44,0,15,22,77,17,97,92,9,2,6,5,26,63,6,14,202.0,27,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60,0
3,0,70.0,1,4,2,0,23,14,31,3,0,3,0,640,160,219,0,8,92,54,46,61,0,0,11,32,6,2,0,0,0,0,0,31,0,0,1,32,40,44,34,43,47,25,45,35,20,15,25,17,17,12,7,7,20,17,30,14,19,25,11,23,23,27,50,30,15,8,63,9,6,23,199,283,85,83,3,4,1,0,2,0,2,1000,1263,2,1,48,52,93,7,6,36,73,61,30,84,16,6,3,3,21,12,4,13,36,13,0,0,0,10,25,50,69,92,10,15,42,55,50,15,5,4,0,9,42,4,0,5,1,8,17,34,9340.0,67.0,862.0,386,388,396,423,15130,27,12,4,26,22,5,0,0,4,35,5,6,12,30,6,0,0,5,22,14,26,20,46,54,3,58,36,0,0,0,0,0,6,0,0,17,13,15,0,43,69,81,53,68,45,33,31,0,99,23,17,3,0,6,6,0,0,13,42,12,0,0,0,42,0,6,3,0,0,0,23,3,3,6,0,3,3,3,3,3,0,3,6,87,0,0,120,28,12,14,27,10,3,5,0,19,1,17,0,0,0,0,13,23,0,14,40,31,16,0,1,0,13,0,0,4,0,0,0,3,0,0,0,0,29,67,56,41,3,0,94,43,27,4,38,0,10,19,39,45,55,0,0,45,22,17,0,0,16,23,77,22,93,89,16,2,6,6,27,66,6,14,109.0,16,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41,0
4,0,78.0,3,2,60,1,28,9,53,26,3,2,9,2520,627,761,99,0,0,46,54,2,98,0,0,1,0,0,0,0,0,0,0,0,0,0,0,33,45,50,36,46,50,27,34,43,23,14,21,13,15,20,12,5,13,15,34,19,19,31,7,27,16,26,57,36,24,14,42,17,9,33,235,323,99,98,0,0,0,0,0,0,0,576,594,4,3,90,10,97,3,0,42,82,49,22,92,8,20,3,17,9,23,1,1,1,0,21,58,19,0,1,2,16,67,0,2,45,52,53,16,6,0,0,0,9,0,0,0,25,58,74,83,5000.0,127.0,528.0,240,250,293,321,9836,24,29,23,13,4,4,0,0,2,21,30,22,16,4,5,0,0,3,35,8,11,14,20,80,4,73,22,1,1,0,0,0,3,1,2,1,24,27,3,76,61,73,51,65,49,80,31,81,99,10,17,8,2,6,15,3,7,22,2,9,0,7,2,2,0,6,1,5,2,2,12,2,7,6,4,15,29,4,3,26,3,2,7,49,12,1,120,16,20,30,13,3,12,5,2,26,1,20,7,1,1,1,15,28,4,9,16,53,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,65,99,0,0,0,90,45,18,25,34,0,1,3,6,33,67,0,0,9,14,72,3,0,0,99,1,21,99,96,6,2,7,11,43,113,10,25,254.0,37,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26,0


In [11]:
data.shape

(95412, 316)

## Downsampling

<b>In downsampling, we randomly sample without replacement from the majority class

In [12]:
category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]

In [13]:
print(category_0.shape)
print(category_1.shape)

(90569, 316)
(4843, 316)


In [14]:
category_0_down = category_0.sample(len(category_1))
print(category_0_down.shape)
print(category_1.shape)

(4843, 316)
(4843, 316)


In [15]:
data = pd.concat([category_0_down, category_1], axis=0)
#shuffling the data
data = data.sample(frac=1)
data['TARGET_B'].value_counts()

1    4843
0    4843
Name: TARGET_B, dtype: int64

## Another Method: Upsampling Method 1

In [16]:
data = pd.concat([numerical, targets], axis=1)
data = data.drop(['TARGET_D'], axis=1)
category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]

In [17]:
category_1_up = category_1.sample(len(category_0), replace=True)
print(category_1_up.shape)

(90569, 316)


In [18]:
category_1_up

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B
5320,2,85.000000,5,9,0,0,0,0,0,0,0,0,4,770,212,280,0,0,99,48,52,99,1,0,0,2,0,0,0,0,0,0,0,1,0,0,1,29,37,41,31,41,45,31,41,38,22,16,27,22,16,10,6,4,19,14,29,24,14,15,5,13,20,30,50,29,11,3,55,16,5,24,200,274,42,42,1,4,3,1,0,0,0,664,718,3,2,85,15,97,3,0,48,76,54,30,86,14,17,4,14,14,15,5,26,62,11,0,0,0,0,4,15,35,71,0,0,31,47,51,15,5,4,0,53,6,3,1,5,0,5,38,65,2160.0,57.0,505.0,315,308,336,332,11283,26,11,26,18,17,3,0,0,0,21,13,32,14,21,0,0,0,0,22,9,36,9,49,51,10,80,14,0,0,0,0,0,0,6,2,38,20,24,7,63,67,84,51,75,49,65,53,82,99,3,2,2,2,8,7,3,2,22,0,25,11,12,3,3,0,12,22,2,8,0,23,0,12,5,7,2,2,0,2,0,0,2,6,84,2,5,120,8,29,42,17,3,1,0,1,24,3,21,1,0,0,0,21,42,0,35,6,29,16,0,1,3,15,0,0,2,2,0,1,0,0,0,1,0,0,63,97,2,0,1,93,70,25,5,19,0,20,31,61,80,20,0,0,25,20,0,52,0,3,70,23,60,99,99,8,3,5,8,28,68,5,12,159.0,44,20,3.0,5.0,5.0,5,3.613636,66757,1,4,39,1
29851,0,62.000000,5,7,0,1,28,29,22,6,0,2,9,1407,395,450,99,0,0,50,50,99,0,0,1,3,0,0,0,0,0,0,0,2,0,0,1,32,40,43,33,43,46,27,37,41,23,12,25,21,17,16,7,3,15,15,35,19,16,15,3,12,11,30,59,41,16,4,66,6,4,24,251,312,90,89,0,9,9,9,0,0,0,1058,1090,6,5,98,2,99,0,0,42,88,79,39,95,5,3,1,3,6,12,2,22,66,11,0,0,0,1,4,60,94,99,0,0,77,60,63,17,5,0,9,0,2,0,0,0,50,75,75,88,1600.0,51.0,602.0,479,499,525,562,18116,8,9,12,28,30,7,0,1,6,5,7,10,28,35,7,0,1,7,30,1,62,4,36,64,2,83,5,9,1,8,0,0,1,3,1,13,20,32,14,58,76,85,67,80,65,66,68,99,0,0,13,15,4,17,22,0,2,6,1,10,2,6,2,2,0,6,16,5,1,14,19,6,2,0,2,11,5,11,1,6,0,2,5,79,9,0,120,4,5,41,22,10,13,4,5,18,2,16,6,1,1,0,13,28,0,29,32,22,6,5,0,0,15,0,0,8,6,0,15,0,0,1,2,0,2,86,98,0,0,2,99,87,18,10,24,0,10,19,35,81,19,13,0,99,0,0,0,0,0,94,6,94,95,99,8,3,10,8,10,23,6,13,35.0,4,2,5.0,10.0,10.0,10,8.750000,91507,0,3,5,1
10227,28,61.611649,5,9,0,9,48,37,25,2,8,24,9,5921,1621,2105,0,97,3,51,49,79,12,1,4,10,0,0,0,2,1,0,0,7,1,0,2,30,38,41,31,42,45,31,42,39,18,12,29,23,14,11,8,4,17,18,33,19,13,17,5,14,18,33,49,31,13,5,65,13,4,18,197,281,76,75,4,16,12,6,0,0,0,848,883,4,4,69,31,89,11,12,46,77,65,36,88,12,9,2,7,16,10,5,20,49,9,4,6,2,1,3,24,67,96,0,4,46,52,53,15,5,14,2,8,12,9,6,3,31,58,81,95,680.0,73.0,800.0,368,397,392,412,13741,17,12,17,24,24,4,1,0,0,13,11,17,29,25,4,1,0,0,18,9,24,8,77,23,20,70,25,0,0,0,0,1,2,2,1,11,25,28,8,72,69,81,57,76,51,59,48,67,84,7,12,12,5,7,17,0,3,10,1,21,4,4,5,1,4,7,12,7,1,1,13,5,7,2,2,3,5,5,25,4,3,29,4,58,2,0,120,6,12,35,27,9,8,4,2,30,1,21,9,5,7,2,32,58,6,46,13,15,28,1,9,3,15,0,0,12,2,1,2,0,0,1,1,0,7,39,87,7,3,3,96,67,21,2,8,25,48,53,78,98,2,0,2,40,37,20,0,0,4,99,1,51,99,94,10,3,7,5,5,13,4,10,15.0,1,1,15.0,15.0,15.0,9,15.000000,160610,0,1,16,1
6440,1,87.000000,5,6,2,0,26,24,37,10,0,0,9,786,216,330,99,0,0,47,53,84,0,1,4,25,0,1,1,0,1,0,0,20,0,0,5,33,41,45,37,45,48,23,49,33,18,12,26,18,11,14,13,7,23,21,29,14,13,29,13,26,32,30,39,22,7,1,55,15,8,22,162,238,45,43,0,55,55,40,0,0,0,2328,2364,5,6,40,60,97,3,11,31,65,52,23,82,18,8,2,6,13,27,3,15,49,20,0,0,0,74,92,99,99,99,11,8,35,49,48,13,5,31,24,0,6,15,39,0,78,93,96,96,5945.0,13.0,803.0,346,469,434,493,17417,17,19,15,11,22,11,4,0,1,6,27,7,11,27,15,5,0,2,20,8,46,4,52,48,19,78,8,5,5,0,0,2,5,3,0,42,30,33,12,68,69,81,60,77,59,70,37,99,99,10,13,22,2,6,12,0,2,13,0,14,8,5,1,0,0,16,19,5,3,6,16,6,7,2,3,3,6,4,5,10,0,0,7,79,4,0,120,7,11,32,21,8,18,3,4,22,3,18,6,0,0,0,12,26,1,24,36,37,4,0,0,0,3,0,0,5,0,1,2,0,0,2,0,0,20,47,66,23,11,0,92,45,16,6,32,0,0,0,0,13,87,0,0,71,0,25,0,0,3,99,0,99,99,96,11,2,3,13,13,30,6,12,100.0,4,3,10.0,50.0,20.0,1,25.000000,145905,1,1,19,1
41631,1,59.000000,7,8,13,0,45,24,60,4,4,3,9,510,160,225,0,0,99,50,50,99,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,45,51,52,45,52,53,16,34,48,18,4,13,21,17,17,19,10,13,15,37,26,9,34,7,31,20,55,26,14,3,1,69,11,5,14,156,224,87,87,0,0,0,0,0,0,0,2800,3078,3,2,80,20,71,29,79,19,71,64,15,79,21,4,1,3,18,14,9,14,53,32,0,0,0,62,80,89,95,98,47,10,38,51,49,13,3,0,0,11,15,0,0,4,21,35,56,68,0.0,105.0,819.0,368,438,500,528,25386,10,6,27,29,7,10,3,1,6,10,5,17,41,8,6,3,1,9,29,1,63,7,50,50,28,69,9,4,0,0,4,0,2,16,0,6,20,27,10,73,48,57,39,54,36,49,47,99,0,0,24,14,2,9,12,0,0,6,7,10,6,6,4,3,0,15,12,6,0,0,10,5,8,10,0,8,6,16,1,4,4,3,33,50,5,0,149,0,4,9,42,5,27,11,0,12,3,7,2,0,0,0,24,45,3,24,10,60,0,0,8,0,4,0,0,3,0,1,2,0,1,1,2,0,9,34,94,2,0,4,99,95,18,5,16,5,20,33,65,76,24,0,0,1,0,49,5,0,45,26,69,0,88,99,9,4,8,11,23,54,6,12,74.0,12,8,2.0,9.0,9.0,2,6.166667,180752,1,3,13,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56423,0,38.000000,4,3,1,0,0,0,0,0,0,0,3,1016,278,393,99,0,0,48,52,97,0,1,2,0,1,0,0,0,0,0,0,0,0,0,0,32,41,45,35,45,48,25,31,46,24,12,25,18,14,15,11,6,12,12,38,22,16,23,7,21,24,35,41,25,10,3,60,9,6,26,175,257,75,75,11,24,13,11,0,0,0,466,487,3,3,70,30,96,4,20,34,71,59,28,85,15,6,1,6,12,22,6,20,59,17,0,0,0,0,0,0,5,41,0,3,55,55,56,14,4,21,3,0,8,10,11,0,2,11,63,97,8940.0,115.0,705.0,270,309,318,354,11581,15,32,17,22,9,3,0,0,0,9,28,20,25,12,5,0,1,0,35,2,59,8,45,55,3,76,13,0,0,0,0,0,8,2,2,1,12,13,0,33,70,79,63,75,59,85,70,80,0,5,11,9,6,7,15,0,3,18,0,13,14,0,5,0,0,2,33,4,1,1,15,6,2,0,0,17,6,11,3,10,2,0,5,71,12,0,120,2,11,47,13,13,9,6,4,23,3,18,5,0,0,0,13,29,0,10,11,58,22,0,1,0,27,0,2,1,0,2,3,0,0,0,3,0,2,78,97,1,0,2,93,51,15,10,47,0,0,0,13,32,68,0,0,87,0,4,7,0,2,99,0,99,99,96,6,2,4,6,17,38,6,12,70.0,6,3,10.0,15.0,13.0,6,11.666667,80577,1,2,35,1
47829,0,61.611649,3,5,0,1,27,28,22,8,11,1,1,3018,881,1122,0,0,99,50,50,93,6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,30,36,39,32,40,43,26,41,37,23,12,32,23,14,10,6,3,20,13,31,20,16,15,6,12,17,35,48,27,8,2,68,8,5,19,194,268,66,66,0,0,0,0,0,0,0,832,877,3,2,84,16,93,7,2,41,79,70,36,89,11,5,2,4,12,13,5,20,64,9,1,4,1,2,6,23,63,87,0,1,48,53,56,15,4,0,0,33,8,0,0,8,5,12,27,61,6640.0,351.0,560.0,326,356,352,386,12220,13,19,23,26,15,3,1,0,0,8,17,23,29,18,3,1,0,0,16,8,34,5,49,51,5,75,22,0,0,0,0,1,2,0,1,78,26,28,4,82,78,85,72,82,69,75,66,58,99,4,8,6,6,13,19,0,2,8,4,17,10,3,3,4,0,12,16,2,7,4,20,2,6,2,0,7,5,6,8,8,11,1,7,72,1,0,120,10,17,33,14,13,10,2,1,29,1,20,9,1,1,2,13,27,0,28,21,22,25,0,12,0,4,0,0,9,0,0,0,0,0,2,0,0,0,81,97,2,0,1,96,78,35,7,14,5,34,43,76,89,11,0,0,1,30,44,14,0,12,10,90,1,97,95,8,2,5,4,27,70,10,25,132.0,7,5,5.0,40.0,40.0,4,18.857143,17452,1,2,43,1
62152,0,71.000000,3,9,13,0,32,24,28,15,3,1,9,1052,305,424,99,0,0,47,53,81,2,0,13,9,0,4,5,1,1,0,0,6,0,0,3,40,46,47,39,49,51,23,45,32,23,5,15,26,19,13,12,10,21,19,25,21,14,30,11,27,23,38,39,22,7,2,68,8,6,18,170,247,95,93,3,5,2,1,0,0,0,4778,4675,6,6,90,10,97,3,0,34,72,66,30,85,15,3,0,3,12,19,5,14,48,19,0,2,0,95,98,99,99,99,86,1,78,63,65,14,4,4,0,0,6,3,1,0,63,73,80,88,4480.0,13.0,803.0,792,981,849,987,35530,5,2,7,14,17,18,7,13,15,2,1,7,6,15,19,11,17,21,20,2,64,5,24,76,3,84,5,2,2,0,0,0,4,5,1,4,21,26,6,69,71,82,62,80,62,71,66,99,0,0,38,17,3,15,11,2,1,5,3,4,0,1,0,4,0,2,7,2,1,4,8,16,4,5,5,10,10,20,2,15,3,1,15,57,6,2,160,1,3,10,18,10,27,31,12,19,3,18,10,0,0,0,14,32,0,24,22,28,4,1,4,1,2,0,1,3,1,1,0,0,0,2,0,1,9,58,84,9,6,0,97,75,27,14,44,0,0,0,4,13,87,0,0,93,1,5,0,1,0,99,0,99,99,99,15,2,9,10,15,33,7,13,77.0,4,3,10.0,25.0,23.0,6,19.250000,147753,1,2,1,1
76058,1,49.000000,2,5,1,0,24,26,16,1,0,2,9,692,191,235,99,0,0,46,54,90,0,1,1,13,0,0,0,0,0,0,0,12,0,0,1,30,40,44,33,43,47,29,48,34,18,14,25,18,17,12,10,5,19,22,29,19,11,23,6,20,15,31,54,33,15,6,64,11,7,18,219,294,88,88,0,1,0,0,0,0,0,452,476,3,3,75,25,91,9,0,46,81,67,38,91,9,8,2,6,9,16,3,22,53,15,0,0,0,0,0,2,9,40,0,1,36,48,51,16,5,1,0,10,20,0,0,4,5,25,72,88,1920.0,109.0,623.0,301,318,309,343,9750,32,10,19,23,11,5,0,0,0,23,8,23,24,17,4,0,0,0,31,5,18,27,51,49,1,54,36,0,0,0,0,0,1,8,2,9,24,35,18,68,61,85,38,83,38,46,32,71,99,4,4,7,0,14,11,0,2,10,0,30,4,13,5,0,0,13,20,8,0,13,25,10,0,0,2,4,5,0,0,1,0,2,10,81,5,0,113,26,31,28,13,0,2,0,5,16,4,15,2,0,0,0,11,24,0,26,12,16,47,0,2,2,2,0,0,5,0,0,0,0,0,0,0,0,7,69,88,12,0,0,90,63,26,15,31,0,9,9,13,53,47,0,0,77,6,17,0,0,0,99,0,89,99,85,6,2,2,6,10,24,4,9,30.0,2,1,10.0,20.0,20.0,8,15.000000,117054,1,1,26,1


In [19]:
category_1_up.shape

(90569, 316)

In [20]:
data = pd.concat([category_0, category_1_up], axis=0)
#shuffling the data
data = data.sample(frac=1)
data['TARGET_B'].value_counts()

1    90569
0    90569
Name: TARGET_B, dtype: int64

## Upsampling using SMOTE

<br>install it using one of the following
- conda install -c conda-forge imbalanced-learn
- conda install -c glemaitre imbalanced-learn

The SMOTE algorithm can be broken down into following steps:

+ Randomly pick a point from the minority class.
+ Compute the k-nearest neighbors (for some pre-specified k) for this point.
+ Add k new points somewhere between the chosen point and each of its neighbors.

In [21]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [22]:
data = pd.concat([numerical, targets], axis=1)
data = data.drop(['TARGET_D'], axis=1)
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)

In [23]:
y.value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [24]:
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

## Downsampling using Tomeklinks

+ TomekLinks are pairs of very close instances, but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process.
+ It does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.

In [34]:
data = pd.concat([numerical, targets], axis=1)
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)
y.value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [35]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

0    88000
1     4843
Name: TARGET_B, dtype: int64