# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
from boruta import BorutaPy
from datetime import datetime
from sklearn import preprocessing

# Read training data 

In [2]:
train_values = pd.read_csv("Train_values.csv")
train_labels=pd.read_csv("Train_labels.csv")

# Merge labels with training data

In [3]:
train=train_values.merge(train_labels,on='id',how='inner')

# Read test data 

In [4]:
test=pd.read_csv('Test_values.csv')
test_copy=test.copy()

In [5]:
train.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


# Find non-null values

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
id                       59400 non-null int64
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
public_meeting           56066 non-null object
r

# convert status group label into numerical data

In [7]:
val_status_group={'functional':0, 'functional needs repair':2,
                  'non functional':1}
train['status_group_vals']=train.status_group.replace(val_status_group)

In [8]:
train.status_group_vals

0        0
1        0
2        0
3        1
4        0
5        0
6        1
7        1
8        1
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       1
17       1
18       2
19       0
20       0
21       0
22       2
23       0
24       0
25       2
26       0
27       0
28       1
29       0
        ..
59370    0
59371    1
59372    1
59373    0
59374    0
59375    0
59376    1
59377    1
59378    0
59379    0
59380    1
59381    1
59382    1
59383    0
59384    1
59385    0
59386    0
59387    0
59388    0
59389    0
59390    0
59391    1
59392    1
59393    0
59394    1
59395    0
59396    0
59397    0
59398    0
59399    0
Name: status_group_vals, Length: 59400, dtype: int64

# Lets focus on those feature one by one having null values

In [9]:
#funder
train['funder'].value_counts()

Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
Kkkt                      1287
World Vision              1246
Unicef                    1057
Tasaf                      877
District Council           843
Dhv                        829
Private Individual         826
Dwsp                       811
0                          777
Norad                      765
Germany Republi            610
Tcrs                       602
Ministry Of Water          590
Water                      583
Dwe                        484
Netherlands                470
Hifab                      450
Adb                        448
Lga                        442
Amref                      425
Fini Water                 393
Oxfam                      359
Wateraid                   333
Rc Church                  321
Isf                        316
                          ... 
Masai Land                   1
Methodis

In [10]:
#Lets divide funder column into 5 categories, top 5 will be remained as it is and after that they will be categorised into other
def funder_cleaning(train):
    if train['funder']=='Government Of Tanzania':
        return 'government'
    elif train['funder']=='Danida':
        return 'danida'
    elif train['funder']=='Hesawa':
        return 'hesawa'
    elif train['funder']=='Rwssp':
        return 'rwssp'
    elif train['funder']=='World Bank':
        return 'world_bank'    
    else:
        return 'other'
    
train['funder']= train.apply(lambda row: funder_cleaning(row), axis=1)
test['funder']= test.apply(lambda row: funder_cleaning(row), axis=1)

In [11]:
#Lets focus on each field one by one
#installer
train['installer'].value_counts()

DWE                           17402
Government                     1825
RWE                            1206
Commu                          1060
DANIDA                         1050
KKKT                            898
Hesawa                          840
0                               777
TCRS                            707
Central government              622
CES                             610
Community                       553
DANID                           552
District Council                551
HESAWA                          539
LGA                             408
World vision                    408
WEDECO                          397
TASAF                           396
District council                392
Gover                           383
AMREF                           329
TWESA                           316
WU                              301
Dmdd                            287
ACRA                            278
World Vision                    270
SEMA                        

In [12]:
#Lets divide installer column into 5 categories, top 5 will be remained as it is and after that they will be categorised into other
def installer_cleaning(train):
    if train['installer']=='DWE':
        return 'dwe'
    elif train['installer']=='Government':
        return 'government'
    elif train['installer']=='RWE':
        return 'rwe'
    elif train['installer']=='Commu':
        return 'commu'
    elif train['installer']=='DANIDA':
        return 'danida'    
    else:
        return 'other'
    
train['installer']= train.apply(lambda row: installer_cleaning(row), axis=1)
test['installer']= test.apply(lambda row: installer_cleaning(row), axis=1)

In [13]:
#EDA
# Checking Null Values on training data
train.apply(lambda x: sum(x.isnull()), axis=0)

id                           0
amount_tsh                   0
date_recorded                0
funder                       0
gps_height                   0
installer                    0
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [14]:
piv_table  = pd.pivot_table(train,index=['funder','status_group'],
                           values='status_group_vals', aggfunc='count')
piv_table

Unnamed: 0_level_0,Unnamed: 1_level_0,status_group_vals
funder,status_group,Unnamed: 2_level_1
danida,functional,1713
danida,functional needs repair,159
danida,non functional,1242
government,functional,3720
government,functional needs repair,701
government,non functional,4663
hesawa,functional,936
hesawa,functional needs repair,232
hesawa,non functional,1034
other,functional,24540


In [15]:
#Like funder and installer it is hard to categorize them into 5 or 6 subvillages because values are not dominating 
#and 19287 unique values are there and the top values are not dominating though

train['subvillage'].value_counts()

#better to drop this column

Madukani           508
Shuleni            506
Majengo            502
Kati               373
Mtakuja            262
Sokoni             232
M                  187
Muungano           172
Mbuyuni            164
Mlimani            152
Songambele         147
Msikitini          134
Miembeni           134
1                  132
Kibaoni            114
Kanisani           111
Mapinduzi          109
I                  109
Mjini              108
Mjimwema           108
Mkwajuni           104
Mwenge             102
Mabatini            98
Azimio              98
Mission             95
Mbugani             95
Bwawani             91
Bondeni             90
Chang'Ombe          88
Zahanati            86
                  ... 
Ilojaminzi           1
Mugharu              1
Nyasirori            1
Matumaini            1
Busi B               1
Uyogu                1
Cheketu              1
Mwanjagala           1
Kiruku Makoooni      1
Mwabulandi           1
Iyeze                1
Ndayanjoju           1
Mishenyi   

In [16]:
train=train.drop(['subvillage'],axis=1)
test=test.drop(['subvillage'],axis=1)

In [17]:
#public_meeting
train['public_meeting'].value_counts()

True     51011
False     5055
Name: public_meeting, dtype: int64

In [18]:
#Since most of the values are True, as of now lets insert True for the missing values. Scope to alter the values in future
train.public_meeting = train.public_meeting.fillna('Unknown')
test.public_meeting = test.public_meeting.fillna('Unknown')

In [19]:
#Scheme Management
train['scheme_management'].value_counts()

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [20]:
# Create a function to reduce the amount of dummy columns needed whilst maintaining the 
# information contained in the column.

def scheme_wrangler(row):
    '''Keep top 5 values and set the rest to 'other'. '''
    if row['scheme_management']=='VWC':
        return 'vwc'
    elif row['scheme_management']=='WUG':
        return 'wug'
    elif row['scheme_management']=='Water authority':
        return 'wtr_auth'
    elif row['scheme_management']=='WUA':
        return 'wua'
    elif row['scheme_management']=='Water Board':
        return 'wtr_brd'
    else:
        return 'other'

train['scheme_management'] = train.apply(lambda row: scheme_wrangler(row), axis=1)
test['scheme_management'] = test.apply(lambda row: scheme_wrangler(row), axis=1)

In [21]:
#Scheme name
train['scheme_name'].value_counts()

K                                                 682
None                                              644
Borehole                                          546
Chalinze wate                                     405
M                                                 400
DANIDA                                            379
Government                                        320
Ngana water supplied scheme                       270
wanging'ombe water supply s                       261
wanging'ombe supply scheme                        234
I                                                 229
Bagamoyo wate                                     229
Uroki-Bomang'ombe water sup                       209
N                                                 204
Kirua kahe gravity water supply trust             193
Machumba estate pipe line                         185
Makwale water supplied sche                       166
Kijiji                                            161
S                           

In [22]:
len(train.scheme_name.unique())

# Lots of factors and the top 5 or so only represent a fraction of the total values. Probably 
# safe to drop this column.

train = train.drop('scheme_name', axis=1)
test = test.drop('scheme_name', axis=1)

In [23]:
#permit
train['permit'].value_counts()

True     38852
False    17492
Name: permit, dtype: int64

In [24]:
# We only have two values here: true and false. This one can stay but we'll have to replace 
# the unknown data with a string value.

train.permit = train.permit.fillna('Unknown')
test.permit = test.permit.fillna('Unknown')

In [25]:
#EDA
# Checking Null Values on train data
train.apply(lambda x: sum(x.isnull()), axis=0)

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
w

In [26]:
#From the above data, we can say the data is clean for both the training and testing.

In [27]:
#checking the correlation values of training datatest.corr()
train.corr()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,status_group_vals
id,1.0,-0.005321,-0.004692,-0.001348,0.001718,-0.002629,-0.003028,-0.003044,-0.002813,-0.002082,0.003354
amount_tsh,-0.005321,1.0,0.07665,0.022134,-0.05267,0.002944,-0.026813,-0.023599,0.016288,0.067915,-0.043533
gps_height,-0.004692,0.07665,1.0,0.149155,-0.035751,0.007237,-0.183521,-0.171233,0.135003,0.658727,-0.096408
longitude,-0.001348,0.022134,0.149155,1.0,-0.425802,0.023873,0.034197,0.151398,0.08659,0.396732,-0.071635
latitude,0.001718,-0.05267,-0.035751,-0.425802,1.0,0.006837,-0.221018,-0.20102,-0.022152,-0.245278,0.02102
num_private,-0.002629,0.002944,0.007237,0.023873,0.006837,1.0,-0.020377,-0.004478,0.003818,0.026056,-0.006159
region_code,-0.003028,-0.026813,-0.183521,0.034197,-0.221018,-0.020377,1.0,0.678602,0.094088,0.031724,0.08359
district_code,-0.003044,-0.023599,-0.171233,0.151398,-0.20102,-0.004478,0.678602,1.0,0.061831,0.048315,0.033979
population,-0.002813,0.016288,0.135003,0.08659,-0.022152,0.003818,0.094088,0.061831,1.0,0.26091,-0.015198
construction_year,-0.002082,0.067915,0.658727,0.396732,-0.245278,0.026056,0.031724,0.048315,0.26091,1.0,-0.056893


In [28]:
#checking the correlation values of testing datatest.corr()
test.corr()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
id,1.0,-0.016012,0.002711,-0.003331,0.009632,-0.01604,0.002601,-0.000655,-0.016068,-0.004591
amount_tsh,-0.016012,1.0,0.09623,0.027709,-0.07221,0.022978,-0.028887,-0.027391,0.028657,0.086236
gps_height,0.002711,0.09623,1.0,0.148722,-0.045028,0.013331,-0.177832,-0.169586,0.131994,0.656781
longitude,-0.003331,0.027709,0.148722,1.0,-0.430001,0.030979,0.024354,0.148962,0.090862,0.397361
latitude,0.009632,-0.07221,-0.045028,-0.430001,1.0,0.002048,-0.20308,-0.196176,-0.024123,-0.25661
num_private,-0.01604,0.022978,0.013331,0.030979,0.002048,1.0,-0.028188,-0.010341,0.001517,0.036118
region_code,0.002601,-0.028887,-0.177832,0.024354,-0.20308,-0.028188,1.0,0.68791,0.097118,0.029993
district_code,-0.000655,-0.027391,-0.169586,0.148962,-0.196176,-0.010341,0.68791,1.0,0.066425,0.046214
population,-0.016068,0.028657,0.131994,0.090862,-0.024123,0.001517,0.097118,0.066425,1.0,0.272421
construction_year,-0.004591,0.086236,0.656781,0.397361,-0.25661,0.036118,0.029993,0.046214,0.272421,1.0


As we can see, the values are not correlated with each other. That is good for the model

In [29]:
train['recorded_by'].value_counts()
#we can drop this column because all the values in this column are same. There is no point to go forward with this column.

GeoData Consultants Ltd    59400
Name: recorded_by, dtype: int64

In [30]:
train=train.drop(['recorded_by'],axis=1)
test=test.drop(['recorded_by'],axis=1)

In [31]:
train.apply(lambda x: len(x.unique()))

id                       59400
amount_tsh                  98
date_recorded              356
funder                       6
gps_height                2428
installer                    6
longitude                57516
latitude                 57517
wpt_name                 37400
num_private                 65
basin                        9
region                      21
region_code                 27
district_code               20
lga                        125
ward                      2092
population                1049
public_meeting               3
scheme_management            6
permit                       3
construction_year           55
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source  

In [32]:
train['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [33]:
train['waterpoint_type_group'].value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

In [34]:
# waterpoint_type and waterpoint_type_group both are almost similar, communal standpipe and comunal standpipe multiple are merged 
# together in waterpoint_type_group
# we can drop one of them

train=train.drop(['waterpoint_type'],axis=1)
test=test.drop(['waterpoint_type'],axis=1)

In [35]:
# source
train['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [36]:
train['source_type'].value_counts()

spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

In [37]:
train['source_class'].value_counts()

groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

In [38]:
#from the above script, we can say the source and source_type are same because in source_type some values are merged together. 
# we can drop one of the column.

train=train.drop(['source'],axis=1)
test=test.drop(['source'],axis=1)

In [39]:
train['quantity'].value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity, dtype: int64

In [40]:
train['quantity_group'].value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity_group, dtype: int64

In [41]:
#from the above script, we can say the quantity and quantity_group both are same.
# we can drop one of the column.

train=train.drop(['quantity'],axis=1)
test=test.drop(['quantity'],axis=1)

In [42]:
train['water_quality'].value_counts()

soft                  50818
salty                  4856
unknown                1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64

In [43]:
train['quality_group'].value_counts()

good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

In [44]:
#water_quality and quality_group are correlated . lets drop one of the column
train=train.drop(['quality_group'],axis=1)
test=test.drop(['quality_group'],axis=1)

In [45]:
train['payment'].value_counts()

never pay                25348
pay per bucket            8985
pay monthly               8300
unknown                   8157
pay when scheme fails     3914
pay annually              3642
other                     1054
Name: payment, dtype: int64

In [46]:
train['payment_type'].value_counts()

never pay     25348
per bucket     8985
monthly        8300
unknown        8157
on failure     3914
annually       3642
other          1054
Name: payment_type, dtype: int64

In [47]:
#from the above script, we can say the payment and payment_type both are same.
# we can drop one of the column.

train=train.drop(['payment'],axis=1)
test=test.drop(['payment'],axis=1)

In [48]:
train['management'].value_counts()

vwc                 40507
wug                  6515
water board          2933
wua                  2535
private operator     1971
parastatal           1768
water authority       904
other                 844
company               685
unknown               561
other - school         99
trust                  78
Name: management, dtype: int64

In [49]:
train['management_group'].value_counts()

user-group    52490
commercial     3638
parastatal     1768
other           943
unknown         561
Name: management_group, dtype: int64

In [50]:
#from the above script, we can say the management and management_group both are same, some values are merged together in management-group.
# we can drop one of the column.

train=train.drop(['management'],axis=1)
test=test.drop(['management'],axis=1)

In [51]:
train['extraction_type'].value_counts()

gravity                      26780
nira/tanira                   8154
other                         6430
submersible                   4764
swn 80                        3670
mono                          2865
india mark ii                 2400
afridev                       1770
ksb                           1415
other - rope pump              451
other - swn 81                 229
windmill                       117
india mark iii                  98
cemo                            90
other - play pump               85
walimi                          48
climax                          32
other - mkulima/shinyanga        2
Name: extraction_type, dtype: int64

In [52]:
train['extraction_type_group'].value_counts()

gravity            26780
nira/tanira         8154
other               6430
submersible         6179
swn 80              3670
mono                2865
india mark ii       2400
afridev             1770
rope pump            451
other handpump       364
other motorpump      122
wind-powered         117
india mark iii        98
Name: extraction_type_group, dtype: int64

In [53]:
train['extraction_type_class'].value_counts()

gravity         26780
handpump        16456
other            6430
submersible      6179
motorpump        2987
rope pump         451
wind-powered      117
Name: extraction_type_class, dtype: int64

In [54]:
#from the above script, we can say the exrtaction_type, extraction_type_group and extraction_type_class are same, some values are merged together.

train=train.drop(['extraction_type'],axis=1)
test=test.drop(['extraction_type'],axis=1)

train=train.drop(['extraction_type_group'],axis=1)
test=test.drop(['extraction_type_group'],axis=1)

In [55]:
# gps_height, longitude, latitude, region_code and district_code are all geographic info which# gps_he 
# is unlikely to add any predictive power to the model given that there are other variables
# containing geographic data. 'num_private' hasn't been given a discription on Driven Data,
# it appears to be superflous. We expect id to not contain any useful information so that gets
# dropped too. wpt_name is also not required, it gives only waterpoint name

train = train.drop(['gps_height', 'longitude', 'latitude', 'region_code', 'district_code',
             'num_private', 'id','wpt_name','lga','region','ward','status_group'], axis=1)

test = test.drop(['gps_height', 'longitude', 'latitude', 'region_code', 'district_code',
             'num_private', 'id','wpt_name','lga','region','ward'], axis=1)

In [56]:
# Turn construction_year into a categorical column containing the following values: '60s', '70s',
# '80s', '90s, '00s', '10s', 'unknown'.

def construction_wrangler(row):
    if row['construction_year'] >= 1960 and row['construction_year'] < 1970:
        return '60s'
    elif row['construction_year'] >= 1970 and row['construction_year'] < 1980:
        return '70s'
    elif row['construction_year'] >= 1980 and row['construction_year'] < 1990:
        return '80s'
    elif row['construction_year'] >= 1990 and row['construction_year'] < 2000:
        return '90s'
    elif row['construction_year'] >= 2000 and row['construction_year'] < 2010:
        return '00s'
    elif row['construction_year'] >= 2010:
        return '10s'
    else:
        return 'unknown'
    
train['construction_year'] = train.apply(lambda row: construction_wrangler(row), axis=1)
test['construction_year'] = test.apply(lambda row: construction_wrangler(row), axis=1)

In [57]:
train['population'].value_counts()

0       21381
1        7025
200      1940
150      1892
250      1681
300      1476
100      1146
50       1139
500      1009
350       986
120       916
400       775
60        706
30        626
40        552
80        533
450       499
20        462
600       438
230       388
75        289
1000      278
800       269
90        265
130       264
25        255
320       249
35        245
360       222
140       215
        ...  
8848        1
628         1
4520        1
468         1
693         1
725         1
789         1
821         1
5300        1
3127        1
2345        1
3031        1
886         1
392         1
424         1
2807        1
726         1
694         1
2569        1
4788        1
662         1
4660        1
406         1
1032        1
1160        1
3241        1
1960        1
1685        1
2248        1
1439        1
Name: population, Length: 1049, dtype: int64

In [58]:
test.date_recorded = pd.datetime(2013, 12, 3) - pd.to_datetime(test.date_recorded)
test.columns = ['days_since_recorded' if x=='date_recorded' else x for x in test.columns]
test.days_since_recorded = test.days_since_recorded.astype('timedelta64[D]').astype(int)

train.date_recorded = pd.datetime(2013, 12, 3) - pd.to_datetime(train.date_recorded)
train.columns = ['days_since_recorded' if x=='date_recorded' else x for x in train.columns]
train.days_since_recorded = train.days_since_recorded.astype('timedelta64[D]').astype(int)

In [59]:
train.apply(lambda x: len(x.unique()))

amount_tsh                 98
days_since_recorded       356
funder                      6
installer                   6
basin                       9
population               1049
public_meeting              3
scheme_management           6
permit                      3
construction_year           7
extraction_type_class       7
management_group            5
payment_type                7
water_quality               8
quantity_group              5
source_type                 7
source_class                3
waterpoint_type_group       6
status_group_vals           3
dtype: int64

In [60]:
test.apply(lambda x: len(x.unique()))

amount_tsh                68
days_since_recorded      331
funder                     6
installer                  6
basin                      9
population               637
public_meeting             3
scheme_management          6
permit                     3
construction_year          7
extraction_type_class      7
management_group           5
payment_type               7
water_quality              8
quantity_group             5
source_type                7
source_class               3
waterpoint_type_group      6
dtype: int64

In [61]:
train.shape

(59400, 19)

In [62]:
test.shape

(14850, 18)

In [63]:
train.keys()

Index(['amount_tsh', 'days_since_recorded', 'funder', 'installer', 'basin',
       'population', 'public_meeting', 'scheme_management', 'permit',
       'construction_year', 'extraction_type_class', 'management_group',
       'payment_type', 'water_quality', 'quantity_group', 'source_type',
       'source_class', 'waterpoint_type_group', 'status_group_vals'],
      dtype='object')

In [64]:
test.keys()

Index(['amount_tsh', 'days_since_recorded', 'funder', 'installer', 'basin',
       'population', 'public_meeting', 'scheme_management', 'permit',
       'construction_year', 'extraction_type_class', 'management_group',
       'payment_type', 'water_quality', 'quantity_group', 'source_type',
       'source_class', 'waterpoint_type_group'],
      dtype='object')

In [65]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 19 columns):
amount_tsh               59400 non-null float64
days_since_recorded      59400 non-null int32
funder                   59400 non-null object
installer                59400 non-null object
basin                    59400 non-null object
population               59400 non-null int64
public_meeting           59400 non-null object
scheme_management        59400 non-null object
permit                   59400 non-null object
construction_year        59400 non-null object
extraction_type_class    59400 non-null object
management_group         59400 non-null object
payment_type             59400 non-null object
water_quality            59400 non-null object
quantity_group           59400 non-null object
source_type              59400 non-null object
source_class             59400 non-null object
waterpoint_type_group    59400 non-null object
status_group_vals        59400 non-null int64


In [66]:
# Get dummy columns for the categorical columns and shuffle the data.

dummy_cols = ['funder', 'installer', 'basin', 'public_meeting', 'scheme_management', 'permit',
              'construction_year', 'extraction_type_class','management_group', 'payment_type', 'water_quality',
              'quantity_group', 'source_type', 'source_class','waterpoint_type_group']

train = pd.get_dummies(train, columns = dummy_cols)

train = train.sample(frac=1).reset_index(drop=True)

In [67]:
test = pd.get_dummies(test, columns = dummy_cols)

In [68]:
print('Shape of training data',train.shape)
print('Shape of testing data',test.shape)

Shape of training data (59400, 92)
Shape of testing data (14850, 91)


In [69]:
train.head()

Unnamed: 0,amount_tsh,days_since_recorded,population,status_group_vals,funder_danida,funder_government,funder_hesawa,funder_other,funder_rwssp,funder_world_bank,...,source_type_spring,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,200.0,1009,150,2,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
1,0.0,415,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
2,0.0,1001,56,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
3,200.0,1005,230,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
4,6000.0,280,500,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [70]:
test.head()

Unnamed: 0,amount_tsh,days_since_recorded,population,funder_danida,funder_government,funder_hesawa,funder_other,funder_rwssp,funder_world_bank,installer_commu,...,source_type_spring,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,0.0,302,321,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0.0,302,300,0,1,0,0,0,0,0,...,1,1,0,0,0,1,0,0,0,0
2,0.0,305,500,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0.0,315,250,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,500.0,251,60,0,0,0,1,0,0,0,...,1,1,0,0,0,1,0,0,0,0


In [71]:
#profiling report of training data
#pandas_profiling.ProfileReport(train)

#from profiling report, we can drop duplicate rows from the training dataset and also we can see 
#waterpoint_type_group_hand pump is highly correlated with extraction_type_class_handpump. we can drop any one of the column

#train=train.drop_duplicates()

train=train.drop(['waterpoint_type_group_hand pump'],axis=1)
test=test.drop(['waterpoint_type_group_hand pump'],axis=1)

train=train.drop(['source_type_other'],axis=1)
test=test.drop(['source_type_other'],axis=1)

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 90 columns):
amount_tsh                                  59400 non-null float64
days_since_recorded                         59400 non-null int32
population                                  59400 non-null int64
status_group_vals                           59400 non-null int64
funder_danida                               59400 non-null uint8
funder_government                           59400 non-null uint8
funder_hesawa                               59400 non-null uint8
funder_other                                59400 non-null uint8
funder_rwssp                                59400 non-null uint8
funder_world_bank                           59400 non-null uint8
installer_commu                             59400 non-null uint8
installer_danida                            59400 non-null uint8
installer_dwe                               59400 non-null uint8
installer_government                        

In [72]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [73]:
# Let's split the train set into train and validation sets. Also remove the target.

target = train.status_group_vals
features = train.drop('status_group_vals', axis=1)

features1=features.copy()

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import train_test_split



In [75]:
rf = RandomForestClassifier(criterion='gini',
                                n_estimators=300,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1,
                                max_depth=6)

X_boruta=features.values
y_boruta=target.values

boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2)
start_time = timer(None)
boruta_selector.fit(X_boruta,y_boruta)
timer(start_time)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	89
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	89
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	89
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	89
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	89
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	89
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	89
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	13


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	9 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	13


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	10 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	13


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	11 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	13


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	12 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	13


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	13 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	13


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	14 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	13


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	15 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	14


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	16 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	14


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	17 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	14


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	18 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	14


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	19 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	14


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	20 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	14


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	21 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	14


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	22 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	23 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	24 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	25 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	26 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	27 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	28 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	29 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	30 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	31 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	32 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	33 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	34 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	35 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	36 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	37 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	38 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	39 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	40 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	41 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	42 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	43 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	44 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	45 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	46 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	47 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	48 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	49 / 100
Confirmed: 	70
Tentative: 	4
Rejected: 	15


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	50 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	51 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	52 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	53 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	54 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	55 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	56 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	57 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	58 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	59 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	60 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	61 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	62 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	63 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	64 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	65 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	66 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	67 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	68 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	69 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	70 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	71 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	72 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	73 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	74 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	75 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	76 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	77 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	78 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	79 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	80 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	81 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	82 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	83 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	84 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	85 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	86 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	87 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	88 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	89 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	90 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	91 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	92 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	93 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	94 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	95 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	96 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	97 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	98 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16
Iteration: 	99 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16


BorutaPy finished running.

Iteration: 	100 / 100
Confirmed: 	70
Tentative: 	3
Rejected: 	16

 Time taken: 0 hours 23 minutes and 0.87 seconds.


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


In [76]:
# number of selected features
print ('\n Number of selected features:')
print (boruta_selector.n_features_)


 Number of selected features:
70


In [77]:
features1=pd.DataFrame(features.columns.tolist())
features1['rank']=boruta_selector.ranking_
features1 = features1.sort_values('rank', ascending=True).reset_index(drop=True)
print ('\n Top %d features:' % boruta_selector.n_features_)
print (features1.head(boruta_selector.n_features_))

#From Boruta, we retained the 70 columns. The remaining columns whose rank is greater than 1 is not required. 


 Top 70 features:
                                           0  rank
0                                 amount_tsh     1
1                     payment_type_never pay     1
2                       payment_type_monthly     1
3                      payment_type_annually     1
4                management_group_user-group     1
5                management_group_commercial     1
6          extraction_type_class_submersible     1
7                    payment_type_per bucket     1
8                extraction_type_class_other     1
9      waterpoint_type_group_improved spring     1
10             extraction_type_class_gravity     1
11                 construction_year_unknown     1
12                     construction_year_90s     1
13                     construction_year_80s     1
14                     construction_year_70s     1
15           extraction_type_class_motorpump     1
16                     construction_year_10s     1
17                      payment_type_unknown     1
18          

In [78]:
features1=pd.DataFrame(features.columns.tolist())
features1['rank']=boruta_selector.ranking_
features1 = features1.sort_values('rank', ascending=True).reset_index(drop=True)
features1

Unnamed: 0,0,rank
0,amount_tsh,1
1,payment_type_never pay,1
2,payment_type_monthly,1
3,payment_type_annually,1
4,management_group_user-group,1
5,management_group_commercial,1
6,extraction_type_class_submersible,1
7,payment_type_per bucket,1
8,extraction_type_class_other,1
9,waterpoint_type_group_improved spring,1


In [79]:
#columns are dropped. For these columns Boruta didnt predcit the rank 1
features=features.drop(['water_quality_milky'
    ,'payment_type_on failure'
    ,'funder_hesawa'
    ,'construction_year_60s'
    ,'management_group_parastatal'
    ,'water_quality_salty abandoned'
    ,'management_group_unknown'
    ,'management_group_other'
    ,'extraction_type_class_rope pump'
    ,'payment_type_other'
    ,'installer_danida'
    ,'water_quality_fluoride'
    ,'source_class_unknown'
    ,'water_quality_coloured'
    ,'extraction_type_class_wind-powered'
    ,'waterpoint_type_group_cattle trough'
    ,'waterpoint_type_group_dam'
    ,'water_quality_fluoride abandoned'],axis=1)


test=test.drop(['water_quality_milky'
    ,'payment_type_on failure'
    ,'funder_hesawa'
    ,'construction_year_60s'
    ,'management_group_parastatal'
    ,'water_quality_salty abandoned'
    ,'management_group_unknown'
    ,'management_group_other'
    ,'extraction_type_class_rope pump'
    ,'payment_type_other'
    ,'installer_danida'
    ,'water_quality_fluoride'
    ,'source_class_unknown'
    ,'water_quality_coloured'
    ,'extraction_type_class_wind-powered'
    ,'waterpoint_type_group_cattle trough'
    ,'waterpoint_type_group_dam'
    ,'water_quality_fluoride abandoned'],axis=1)

In [80]:
test_id = pd.DataFrame
test_id=test_copy['id']

In [82]:
def model_for_submission(features, target, test):
    if __name__ == '__main__':

        rf = RandomForestClassifier(criterion='gini',
                                n_estimators=700,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

        
        param_grid = {"min_samples_split" : [6],
             "n_estimators" : [700]}

        estimator = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

        estimator.fit(features, target)        

        predictions = estimator.predict(test)
        
       
        data = {'ID': test_id, 'status_group': predictions}

        submit = pd.DataFrame(data=data)

        vals_to_replace = {0:'functional', 2:'functional needs repair',
                           1:'non functional'}

        submit.status_group = submit.status_group.replace(vals_to_replace)        

        submit.to_csv('Submission_file6.csv', index=False)

In [83]:
# Run model for submission.
model_for_submission(features, target, test)

In [84]:
result_normalize4 = pd.read_csv('Submission_file6.csv') 

In [85]:
result_normalize4['status_group'].value_counts() #submission 6 accuracy 80.44

functional                 8930
non functional             5316
functional needs repair     604
Name: status_group, dtype: int64

In [86]:
X_train, X_val, y_train, y_val = train_test_split(features, target, train_size=0.8)

In [87]:
def GradientBoostingClassifier_random_forest_model(X_train, X_val, y_train, y_val, test):
    if __name__ == '__main__':
        
        rf = RandomForestClassifier(criterion='gini',
                                n_estimators=500,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

        
        param_grid = {"min_samples_split" : [6,7,8],
             "n_estimators" : [500,700,1000]}

        estimator = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

        
        estimator.fit(X_train, y_train)

        best_params = estimator.best_params_
                                 
        validation_accuracy = estimator.score(X_val, y_val)
        print('Validation accuracy: ', validation_accuracy)
        print(best_params)

In [89]:
GradientBoostingClassifier_random_forest_model(X_train, X_val, y_train, y_val, test)

Validation accuracy:  0.7987373737373737
{'min_samples_split': 8, 'n_estimators': 700}
