<a href="https://colab.research.google.com/github/TanushGoel/Pump-It-Up/blob/master/Pump_It_Up_Water_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
pd.options.mode.use_inf_as_na = False
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [0]:
#MICE was pretty bad at guessing construction year
train = pd.read_csv('PumpItUpTokenizedCleanedMICE_train.csv')
train.status_group = train.status_group.astype(int)
target = train.status_group
train_id = train.id
train = train.drop('id', axis=1)
train.public_meeting = np.round((train.public_meeting-0.175)).astype(int)
train.permit = np.round((train.permit-0.175)).astype(int)
train.head(50)

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,6000.0,40616,0,1390,0,34.938093,-9.856322,0,0,0,0,0,11,5,0,0,109,1,0,0,1999.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0.0,41339,1,1399,1,34.698766,-2.147466,1,0,1,1,1,20,2,1,1,280,1,1,1,2010.0,0,0,0,1,0,1,1,0,0,1,1,1,1,1,0,0,1
2,25.0,41330,2,686,2,37.460664,-3.821329,2,0,2,2,2,21,4,2,2,250,1,0,1,2009.0,0,0,0,0,0,2,2,0,0,0,0,2,2,1,1,0,1
3,0.0,41302,3,263,3,38.486161,-11.1553,3,0,3,3,3,90,63,3,3,58,1,0,1,1986.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0,-1
4,0.0,40737,4,0,4,31.130847,-1.825359,4,0,1,4,4,18,1,4,4,0,1,-1,1,1684.993408,0,0,0,2,1,1,1,0,0,3,3,1,1,1,0,0,1
5,20.0,40615,5,0,5,39.172796,-4.765587,5,0,2,5,5,4,8,5,5,1,1,0,1,2009.0,1,1,1,0,0,2,2,1,1,0,0,4,4,2,1,0,1
6,0.0,41183,6,0,6,33.36241,-3.766365,6,0,4,6,6,17,3,6,6,0,1,0,1,1841.7854,2,2,2,0,0,1,1,0,0,0,0,3,3,0,2,1,-1
7,0.0,41191,7,0,5,32.620617,-4.226198,7,0,5,7,6,17,3,7,7,0,1,-1,1,1876.895264,3,3,2,1,0,3,3,2,2,0,0,5,5,0,2,1,-1
8,0.0,41216,8,0,7,32.7111,-5.146712,8,0,5,8,7,14,6,8,8,0,1,0,1,1863.421387,4,4,2,0,0,1,1,1,1,3,3,3,3,0,2,1,-1
9,0.0,40758,9,0,4,30.626991,-1.257051,9,0,1,9,4,18,1,4,9,0,1,-1,1,1705.750977,3,3,2,0,0,1,1,0,0,0,0,5,5,0,2,1,1


In [0]:
test = pd.read_csv('PumpItUpTokenizedCleanedMICE_test.csv')
test_id = test.id
test = test.drop('id', axis=1)
test.public_meeting = np.round((test.public_meeting-0.25)).astype(int)
test.permit = np.round((test.permit-0.175)).astype(int)
test.head(50)

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,0.0,41309,162,1996,340,35.290799,-4.059696,37400,0,4,10942,2,21,3,37,573,321,1,8,1,2012.0,5,5,3,8,3,1,1,0,0,3,3,1,1,1,3,2
1,0.0,41309,20,1569,5,36.656709,-3.309214,37401,0,2,19287,16,2,2,26,367,300,1,0,1,2000.0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0
2,0.0,41306,-1,1567,-1,34.767863,-5.004344,21518,0,4,7343,18,13,2,32,647,500,1,0,1,2010.0,5,5,3,0,0,1,1,0,0,1,1,1,1,1,3,2
3,0.0,41296,143,267,129,38.058046,-9.418672,37402,0,3,5578,14,80,43,105,1795,250,1,0,1,1987.0,5,5,3,0,0,3,3,0,0,2,2,5,5,0,3,2
4,500.0,41360,1036,1260,1131,35.006123,-10.95041,2984,0,3,2889,9,10,3,97,653,60,1,4,1,2000.0,0,0,0,4,0,6,6,0,0,0,0,0,0,0,0,0
5,0.0,41337,20,1685,5,36.685279,-3.30242,9429,0,2,7554,16,2,2,26,808,200,1,0,1,1990.0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
6,0.0,40604,20,550,156,36.398041,-7.541382,0,0,7,628,0,11,7,29,445,600,1,0,1,2007.0,4,4,2,0,0,1,1,1,1,0,0,3,3,0,2,1
7,0.0,41299,111,234,102,39.60742,-10.89379,37403,0,3,767,3,9,4,101,1178,1,1,4,1,1982.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0
8,30.0,41297,3,584,25,39.262951,-10.82359,37404,0,3,432,3,90,33,93,880,40,1,0,0,1997.0,0,0,0,0,0,2,2,0,0,1,1,0,0,0,0,0
9,0.0,41351,13,1083,10,37.096108,-3.251754,37405,0,2,16804,10,3,7,12,28,1,1,4,1,2003.0,0,0,0,4,0,6,6,0,0,0,0,0,0,0,0,0


In [0]:
train = reduce_mem_usage(train)

Memory usage of dataframe is 17.22 MB
Memory usage after optimization is: 3.23 MB
Decreased by 81.2%


Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,6000.0,40616,0,1390,0,34.937500,-9.859375,0,0,0,0,0,11,5,0,0,109,1,0,0,1999.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0.0,41339,1,1399,1,34.687500,-2.148438,1,0,1,1,1,20,2,1,1,280,1,1,1,2010.0,0,0,0,1,0,1,1,0,0,1,1,1,1,1,0,0,1
2,25.0,41330,2,686,2,37.468750,-3.822266,2,0,2,2,2,21,4,2,2,250,1,0,1,2009.0,0,0,0,0,0,2,2,0,0,0,0,2,2,1,1,0,1
3,0.0,41302,3,263,3,38.500000,-11.156250,3,0,3,3,3,90,63,3,3,58,1,0,1,1986.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0,-1
4,0.0,40737,4,0,4,31.125000,-1.825195,4,0,1,4,4,18,1,4,4,0,1,-1,1,1685.0,0,0,0,2,1,1,1,0,0,3,3,1,1,1,0,0,1
5,20.0,40615,5,0,5,39.187500,-4.765625,5,0,2,5,5,4,8,5,5,1,1,0,1,2009.0,1,1,1,0,0,2,2,1,1,0,0,4,4,2,1,0,1
6,0.0,41183,6,0,6,33.375000,-3.765625,6,0,4,6,6,17,3,6,6,0,1,0,1,1842.0,2,2,2,0,0,1,1,0,0,0,0,3,3,0,2,1,-1
7,0.0,41191,7,0,5,32.625000,-4.226562,7,0,5,7,6,17,3,7,7,0,1,-1,1,1877.0,3,3,2,1,0,3,3,2,2,0,0,5,5,0,2,1,-1
8,0.0,41216,8,0,7,32.718750,-5.148438,8,0,5,8,7,14,6,8,8,0,1,0,1,1863.0,4,4,2,0,0,1,1,1,1,3,3,3,3,0,2,1,-1
9,0.0,40758,9,0,4,30.625000,-1.256836,9,0,1,9,4,18,1,4,9,0,1,-1,1,1706.0,3,3,2,0,0,1,1,0,0,0,0,5,5,0,2,1,1


In [0]:
test = reduce_mem_usage(test)

Memory usage of dataframe is 4.19 MB
Memory usage after optimization is: 0.79 MB
Decreased by 81.1%


Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,0.0,41309,162,1996,340,35.281250,-4.058594,37400,0,4,10942,2,21,3,37,573,321,1,8,1,2012.0,5,5,3,8,3,1,1,0,0,3,3,1,1,1,3,2
1,0.0,41309,20,1569,5,36.656250,-3.308594,37401,0,2,19287,16,2,2,26,367,300,1,0,1,2000.0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0
2,0.0,41306,-1,1567,-1,34.781250,-5.003906,21518,0,4,7343,18,13,2,32,647,500,1,0,1,2010.0,5,5,3,0,0,1,1,0,0,1,1,1,1,1,3,2
3,0.0,41296,143,267,129,38.062500,-9.421875,37402,0,3,5578,14,80,43,105,1795,250,1,0,1,1987.0,5,5,3,0,0,3,3,0,0,2,2,5,5,0,3,2
4,500.0,41360,1036,1260,1131,35.000000,-10.953125,2984,0,3,2889,9,10,3,97,653,60,1,4,1,2000.0,0,0,0,4,0,6,6,0,0,0,0,0,0,0,0,0
5,0.0,41337,20,1685,5,36.687500,-3.302734,9429,0,2,7554,16,2,2,26,808,200,1,0,1,1990.0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
6,0.0,40604,20,550,156,36.406250,-7.542969,0,0,7,628,0,11,7,29,445,600,1,0,1,2007.0,4,4,2,0,0,1,1,1,1,0,0,3,3,0,2,1
7,0.0,41299,111,234,102,39.593750,-10.890625,37403,0,3,767,3,9,4,101,1178,1,1,4,1,1982.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0
8,30.0,41297,3,584,25,39.250000,-10.820312,37404,0,3,432,3,90,33,93,880,40,1,0,0,1997.0,0,0,0,0,0,2,2,0,0,1,1,0,0,0,0,0
9,0.0,41351,13,1083,10,37.093750,-3.251953,37405,0,2,16804,10,3,7,12,28,1,1,4,1,2003.0,0,0,0,4,0,6,6,0,0,0,0,0,0,0,0,0


In [0]:
train['dist'] = train.longitude ** 2 + train.latitude ** 2
test['dist'] = test.longitude ** 2 + test.latitude ** 2

In [0]:
train['dist2'] = train.gps_height ** 2 + train.dist
test['dist2'] = test.gps_height ** 2 + test.dist

In [0]:
train['wpt_dist_min'] = train.groupby('wpt_name')['dist'].transform('min')
train['wpt_dist_min2'] = train.groupby('wpt_name')['dist2'].transform('min')

test['wpt_dist_min'] = test.groupby('wpt_name')['dist'].transform('min')
test['wpt_dist_min2'] = test.groupby('wpt_name')['dist2'].transform('min')

train['wpt_dist_max'] = train.groupby('wpt_name')['dist'].transform('max')
train['wpt_dist_max2'] = train.groupby('wpt_name')['dist2'].transform('max')

test['wpt_dist_max'] = test.groupby('wpt_name')['dist'].transform('max')
test['wpt_dist_max2'] = test.groupby('wpt_name')['dist2'].transform('max')

In [0]:
train['construct_record_min'] = train.groupby('construction_year')['date_recorded'].transform('min')
test['construct_record_min'] = test.groupby('construction_year')['date_recorded'].transform('min')

train['construct_record_max'] = train.groupby('construction_year')['date_recorded'].transform('max')
test['construct_record_max'] = test.groupby('construction_year')['date_recorded'].transform('max')

train['construct_record'] = (train.date_recorded - train.construction_year)
train['construct_record_2'] = (train.date_recorded - train.construction_year) / train.construction_year

test['construct_record'] = (test.date_recorded - test.construction_year)
test['construct_record_2'] = (test.date_recorded - test.construction_year) / test.construction_year

In [0]:
from sklearn.decomposition import PCA

def principal_component_analyze(df):

    extraction_type = df.extraction_type
    extraction_type_group = df.extraction_type_group
    extraction_type_class = df.extraction_type_class
    
    extraction = pd.DataFrame(data=list(zip(extraction_type, extraction_type_group, extraction_type_class)),
                  columns = ['extraction_type', 'extraction_type_group', 'extraction_type_class'])

    pca_extraction = PCA(n_components=1)
    extractpca = pca_extraction.fit_transform(extraction)
    df['extraction'] = extractpca
    print(pca_extraction.explained_variance_ratio_)
    
    
    management = df.management
    management_group = df.management_group
    
    manage = pd.DataFrame(data=list(zip(management, management_group)),
                  columns = ['management', 'management_group'])

    pca_manage = PCA(n_components=1)
    managementpca = pca_manage.fit_transform(manage)
    df['manage'] = managementpca
    print(pca_manage.explained_variance_ratio_)
    
    
    payment = df.payment
    payment_type = df.payment_type
    
    pay = pd.DataFrame(data=list(zip(payment, payment_type)),
                  columns = ['payment', 'payment_type'])

    pca_pay = PCA(n_components=1)
    paypca = pca_pay.fit_transform(pay)
    df['pay'] = paypca
    print(pca_pay.explained_variance_ratio_)
    
    
    water_quality = df.water_quality
    quality_group = df.quality_group
    
    quality = pd.DataFrame(data=list(zip(water_quality, quality_group)),
                  columns = ['water_quality', 'quality_group'])

    pca_qual = PCA(n_components=1)
    qualpca = pca_qual.fit_transform(quality)
    df['qual'] = qualpca
    print(pca_qual.explained_variance_ratio_)
    
    
    quantity = df.quantity
    quantity_group = df.quantity_group
    
    quant = pd.DataFrame(data=list(zip(quantity, quantity_group)),
                  columns = ['quantity', 'quantity_group'])

    pca_quant = PCA(n_components=1)
    quantpca = pca_quant.fit_transform(quant)
    df['manage'] = quantpca
    print(pca_quant.explained_variance_ratio_)
    
    
    source = df.source
    source_type = df.source_type
    
    ss = pd.DataFrame(data=list(zip(source, source_type)),
                  columns = ['source', 'source_type'])

    pca_ss = PCA(n_components=1)
    sspca = pca_ss.fit_transform(ss)
    df['ss'] = sspca
    print(pca_ss.explained_variance_ratio_)
    
    
    waterpoint_type = df.waterpoint_type
    waterpoint_type_group = df.waterpoint_type_group
    
    waterpoint = pd.DataFrame(data=list(zip(waterpoint_type, waterpoint_type_group)),
                  columns = ['waterpoint_type', 'waterpoint_type_group'])

    pca_waterpoint = PCA(n_components=1)
    waterpointpca = pca_waterpoint.fit_transform(waterpoint)
    df['waterpoint'] = waterpointpca
    print(pca_waterpoint.explained_variance_ratio_)
    
    
    #district_code = df.district_code
    #region_code = df.region_code
    
    #code = pd.DataFrame(data=list(zip(district_code, region_code)),
                  #columns = ['district_code', 'region_code'])

    #pca_code = PCA(n_components=1)
    #codepca = pca_code.fit_transform(code)
    #df['code'] = codepca
    #print(pca_code.explained_variance_ratio_)
    
    
    df = df.drop(['extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', #'district_code', 'region_code', 
                 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'waterpoint_type', 'waterpoint_type_group'], axis=1)
    
    return df
  
train = principal_component_analyze(train)
test = principal_component_analyze(test)

[0.95678236]
[0.96039072]
[1.]
[0.95462718]
[1.]
[0.98684648]
[0.98064153]
[0.89271392]
[0.95597614]
[0.9606198]
[1.]
[0.95550984]
[1.]
[0.98588773]
[0.98080023]
[0.89214725]


In [0]:
!pip3 install pygeohash
import pygeohash as gh



In [0]:
train['geohash'] = train.apply(lambda x: gh.encode(x.latitude, x.longitude, precision=2), axis=1)
test['geohash'] = test.apply(lambda x: gh.encode(x.latitude, x.longitude, precision=2), axis=1)

train.geohash = train.geohash.factorize()[0]
test.geohash = test.geohash.factorize()[0]

In [0]:
train['geohash2'] = train.apply(lambda y: gh.encode(y.latitude, y.longitude, precision=3), axis=1)
test['geohash2'] = test.apply(lambda y: gh.encode(y.latitude, y.longitude, precision=3), axis=1)

train.geohash2 = train.geohash2.factorize()[0]
test.geohash2 = test.geohash2.factorize()[0]

In [0]:
train['geohash3'] = train.apply(lambda z: gh.encode(z.latitude, z.longitude, precision=4), axis=1)
test['geohash3'] = test.apply(lambda z: gh.encode(z.latitude, z.longitude, precision=4), axis=1)

train.geohash3 = train.geohash3.factorize()[0]
test.geohash3 = test.geohash3.factorize()[0]

In [0]:
train = reduce_mem_usage(train)

Memory usage of dataframe is 6.06 MB
Memory usage after optimization is: 4.48 MB
Decreased by 26.2%


Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,dist,dist2,wpt_dist_min,wpt_dist_max,construct_record_min,construct_record_max,construct_record,geohash,geohash2,geohash3
0,6000.0,40616,0,1390,0,34.937500,-9.859375,0,0,0,0,0,11,5,0,0,109,1,0,0,1999.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1318.0,32864.0,0.0,1725.0,40591,41611,38624.0,0,0,0
1,0.0,41339,1,1399,1,34.687500,-2.148438,1,0,1,1,1,20,2,1,1,280,1,1,1,2010.0,0,0,0,1,0,1,1,0,0,1,1,1,1,1,0,0,1,1208.0,-7672.0,0.0,1719.0,40575,41611,39328.0,1,1,1
2,25.0,41330,2,686,2,37.468750,-3.822266,2,0,2,2,2,21,4,2,2,250,1,0,1,2009.0,0,0,0,0,0,2,2,0,0,0,0,2,2,1,1,0,1,1419.0,13264.0,1393.0,1419.0,37993,41611,39328.0,1,2,2
3,0.0,41302,3,263,3,38.500000,-11.156250,3,0,3,3,3,90,63,3,3,58,1,0,1,1986.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0,-1,1606.0,5240.0,1606.0,1606.0,40577,41520,39328.0,0,3,3
4,0.0,40737,4,0,4,31.125000,-1.825195,4,0,1,4,4,18,1,4,4,0,1,-1,1,1685.0,0,0,0,2,1,1,1,0,0,3,3,1,1,1,0,0,1,972.5,972.5,0.0,1734.0,40640,40884,39040.0,2,4,4
5,20.0,40615,5,0,5,39.187500,-4.765625,5,0,2,5,5,4,8,5,5,1,1,0,1,2009.0,1,1,1,0,0,2,2,1,1,0,0,4,4,2,1,0,1,1559.0,1559.0,1559.0,1559.0,37993,41611,38592.0,1,5,5
6,0.0,41183,6,0,6,33.375000,-3.765625,6,0,4,6,6,17,3,6,6,0,1,0,1,1842.0,2,2,2,0,0,1,1,0,0,0,0,3,3,0,2,1,-1,1128.0,1128.0,1128.0,1128.0,40611,41328,39328.0,2,6,6
7,0.0,41191,7,0,5,32.625000,-4.226562,7,0,5,7,6,17,3,7,7,0,1,-1,1,1877.0,3,3,2,1,0,3,3,2,2,0,0,5,5,0,2,1,-1,1082.0,1082.0,907.0,1664.0,40613,41333,39328.0,2,7,7
8,0.0,41216,8,0,7,32.718750,-5.148438,8,0,5,8,7,14,6,8,8,0,1,0,1,1863.0,4,4,2,0,0,1,1,1,1,3,3,3,3,0,2,1,-1,1098.0,1098.0,1098.0,1098.0,40613,41336,39360.0,2,7,8
9,0.0,40758,9,0,4,30.625000,-1.256836,9,0,1,9,4,18,1,4,9,0,1,-1,1,1706.0,3,3,2,0,0,1,1,0,0,0,0,5,5,0,2,1,1,939.5,939.5,939.5,939.5,40632,40884,39040.0,2,8,9


In [0]:
test = reduce_mem_usage(test)

Memory usage of dataframe is 1.50 MB
Memory usage after optimization is: 1.10 MB
Decreased by 26.4%


Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,dist,dist2,wpt_dist_min,wpt_dist_max,construct_record_min,construct_record_max,construct_record,geohash,geohash2,geohash3
0,0.0,41309,162,1996,340,35.281250,-4.058594,37400,0,4,10942,2,21,3,37,573,321,1,8,1,2012.0,5,5,3,8,3,1,1,0,0,3,3,1,1,1,3,2,1261.0,-12416.0,1261.0,1261.0,40626,41611,39296.0,0,0,0
1,0.0,41309,20,1569,5,36.656250,-3.308594,37401,0,2,19287,16,2,2,26,367,300,1,0,1,2000.0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,1355.0,-27248.0,1355.0,1355.0,40577,41611,39296.0,0,1,1
2,0.0,41306,-1,1567,-1,34.781250,-5.003906,21518,0,4,7343,18,13,2,32,647,500,1,0,1,2010.0,5,5,3,0,0,1,1,0,0,1,1,1,1,1,3,2,1235.0,31888.0,1235.0,1235.0,40575,41611,39296.0,0,2,2
3,0.0,41296,143,267,129,38.062500,-9.421875,37402,0,3,5578,14,80,43,105,1795,250,1,0,1,1987.0,5,5,3,0,0,3,3,0,0,2,2,5,5,0,3,2,1538.0,7292.0,1538.0,1538.0,40595,41367,39296.0,1,3,3
4,500.0,41360,1036,1260,1131,35.000000,-10.953125,2984,0,3,2889,9,10,3,97,653,60,1,4,1,2000.0,0,0,0,4,0,6,6,0,0,0,0,0,0,0,0,0,1345.0,16080.0,1345.0,1345.0,40577,41611,39360.0,1,4,4
5,0.0,41337,20,1685,5,36.687500,-3.302734,9429,0,2,7554,16,2,2,26,808,200,1,0,1,1990.0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1357.0,22528.0,1357.0,1357.0,40597,41611,39360.0,0,1,1
6,0.0,40604,20,550,156,36.406250,-7.542969,0,0,7,628,0,11,7,29,445,600,1,0,1,2007.0,4,4,2,0,0,1,1,1,1,0,0,3,3,0,2,1,1382.0,-23792.0,0.0,1608.0,40592,41581,38592.0,1,5,5
7,0.0,41299,111,234,102,39.593750,-10.890625,37403,0,3,767,3,9,4,101,1178,1,1,4,1,1982.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0,1687.0,-9096.0,1687.0,1687.0,40577,41398,39328.0,1,6,6
8,30.0,41297,3,584,25,39.250000,-10.820312,37404,0,3,432,3,90,33,93,880,40,1,0,0,1997.0,0,0,0,0,0,2,2,0,0,1,1,0,0,0,0,0,1658.0,15032.0,1658.0,1658.0,40593,41397,39296.0,1,7,7
9,0.0,41351,13,1083,10,37.093750,-3.251953,37405,0,2,16804,10,3,7,12,28,1,1,4,1,2003.0,0,0,0,4,0,6,6,0,0,0,0,0,0,0,0,0,1387.0,-5372.0,1387.0,1387.0,40577,41458,39360.0,0,1,8


In [0]:
train = train.drop('status_group', axis=1)

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=0.0025, random_state=1, stratify=target, shuffle=True)

In [0]:
print('Train:', X_train.shape[0])
print('Valid:', X_valid.shape[0])

Train: 59251
Valid: 149


In [0]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=1)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [0]:
from sklearn.metrics import accuracy_score
y_pred_rf = rnd_clf.predict(X_valid)
print(f'Random Forest with Accuracy: {accuracy_score(y_valid, y_pred_rf):1.3f}')

Random Forest with Accuracy: 0.839


In [0]:
print('Feature Importances:')
for feature_name, feature_importance in zip(train.columns, rnd_clf.feature_importances_):
    print(feature_name, feature_importance)

Feature Importances:
amount_tsh 0.018246531136352297
date_recorded 0.03239384893881638
funder 0.02375972783910759
gps_height 0.035729931824608145
installer 0.019792022701542275
longitude 0.0329098215471627
latitude 0.049945347902682145
wpt_name 0.0403345291658929
num_private 0.0008319340767723997
basin 0.007783298976120276
subvillage 0.03837925262229764
region 0.010621573064504708
region_code 0.009295343509307783
district_code 0.011209624437328404
lga 0.016432009653296137
ward 0.025804646933243542
population 0.02597279100460797
public_meeting 0.003608169170041665
scheme_management 0.009473127122679456
permit 0.0040040827251608645
construction_year 0.043999362278342544
extraction_type 0.022920338412994436
extraction_type_group 0.020813010909879755
extraction_type_class 0.021394231247384404
management 0.009324633612921487
management_group 0.004160337170737156
payment 0.011248257612193908
payment_type 0.011495192403815848
water_quality 0.0054842988843221046
quality_group 0.005620799913370

In [0]:
from sklearn.ensemble import RandomForestClassifier
final_clf = RandomForestClassifier(n_estimators=6500, n_jobs=-1, random_state=42)
final_clf.fit(train, target)

In [0]:
print('Feature Importances:')
for feature_name, feature_importance in zip(train.columns, final_clf.feature_importances_):
    print(feature_name, feature_importance)

Feature Importances:
amount_tsh 0.018105206635242788
date_recorded 0.033418235886764215
funder 0.024024324845687693
gps_height 0.03451081021399807
installer 0.02028304747076881
longitude 0.03526950348893442
latitude 0.050687203403582355
wpt_name 0.03850778481148345
num_private 0.000851670676381109
basin 0.008346081451379796
subvillage 0.03781397310182541
region 0.011179789754216417
region_code 0.009790609609476317
district_code 0.012028081118256019
lga 0.017713154821475976
ward 0.027084910886358487
population 0.025531910095633503
public_meeting 0.003578493945605165
scheme_management 0.009506501334969929
permit 0.0041359762560087285
construction_year 0.04069285018629313
extraction_type 0.022042935867873283
extraction_type_group 0.020145876795949647
extraction_type_class 0.02414919612522541
management 0.009574907739968112
management_group 0.0042659832114210015
payment 0.011491017614623712
payment_type 0.011561580805222323
water_quality 0.0054026952936955626
quality_group 0.00535951681176

In [0]:
Y_pred = final_clf.predict(test)
submit = pd.DataFrame({
        "id": test_id.values,
        "status_group": Y_pred.ravel()
        })
submit.head(25)

Unnamed: 0,id,status_group
0,50785,-1
1,51630,1
2,17168,1
3,45559,-1
4,49871,1
5,52449,1
6,24806,1
7,28965,-1
8,36301,-1
9,54122,1


In [0]:
submit.status_group = submit.status_group.map({-1.0:"non functional", 0.0:"functional needs repair", 1.0:"functional"})
submit.head(25)

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
5,52449,functional
6,24806,functional
7,28965,non functional
8,36301,non functional
9,54122,functional


In [0]:
submit.to_csv('PUMPITUP_SUBMISSION_7.csv', index=False)
from google.colab import files
files.download('PUMPITUP_SUBMISSION_7.csv')