In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import train_test_split

  from numpy.core.umath_tests import inner1d


In [3]:
df_train=pd.read_csv('new_data/trainset_clean_dummy.csv')

In [4]:
df_train.shape

(59400, 110)

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Columns: 110 entries, id to waterpoint_type_group_other
dtypes: float64(2), int64(105), object(3)
memory usage: 49.9+ MB


In [6]:
print(df_train.columns)

Index(['id', 'existing year', 'gps_height', 'longitude', 'latitude',
       'num_private', 'population', 'status_group', 'installer_ces',
       'installer_community',
       ...
       'quantity_unknown', 'source_class_groundwater', 'source_class_surface',
       'source_class_unknown', 'waterpoint_type_group_cattle trough',
       'waterpoint_type_group_communal standpipe', 'waterpoint_type_group_dam',
       'waterpoint_type_group_hand pump',
       'waterpoint_type_group_improved spring', 'waterpoint_type_group_other'],
      dtype='object', length=110)


In [7]:
print(df_train.dtypes)

id                                            int64
existing year                                object
gps_height                                    int64
longitude                                   float64
latitude                                    float64
num_private                                   int64
population                                   object
status_group                                 object
installer_ces                                 int64
installer_community                           int64
installer_danida                              int64
installer_dwe                                 int64
installer_government                          int64
installer_hesawa                              int64
installer_individual                          int64
installer_kkkt                                int64
installer_other                               int64
installer_rwe                                 int64
installer_tcrs                                int64
installer_un

In [8]:
df_train['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [9]:
df_train['existing year'].value_counts()[:5]

unknown    20709
3.0         2740
1.0         2303
2.0         2129
5.0         1980
Name: existing year, dtype: int64

In [10]:
df_train['existing year'] = df_train['existing year'].replace({'unknown':0})

In [11]:
df_train['existing year'].value_counts()[:5]

0      20709
3.0     2740
1.0     2303
2.0     2129
5.0     1980
Name: existing year, dtype: int64

In [12]:
# df_train[['existing year','population']] = df_train[['existing year','population']].apply(pd.to_numeric)

In [13]:
df_train['existing year'] = df_train['existing year'].apply(pd.to_numeric)

In [14]:
print(df_train['existing year'].dtypes)

float64


In [15]:
df_train['population'].value_counts()[:5]

<built-in function zeros>    21381
1.0                           7025
200.0                         1940
150.0                         1892
250.0                         1681
Name: population, dtype: int64

In [16]:
df_train['population']=df_train['population'].replace({'<built-in function zeros>':0.0})

In [17]:
df_train['population'].value_counts()[:5]

0.0      21381
1.0       7025
200.0     1940
150.0     1892
250.0     1681
Name: population, dtype: int64

In [18]:
# df_train['population'] = df_train['population'].astype(float)
df_train['population'] = df_train['population'].apply(pd.to_numeric,errors='ignore')

In [19]:
# df_train['population'].value_counts()[:5]

In [20]:
print(df_train['population'].dtypes)

float64


In [21]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Columns: 110 entries, id to waterpoint_type_group_other
dtypes: float64(4), int64(105), object(1)
memory usage: 49.9+ MB


In [22]:
# print(df_train.apply(lambda x: sum(x.isnull())))

### Split training datasets to training and validation sets 

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df_train.loc[:, df_train.columns != 'status_group'], df_train['status_group'], test_size=0.1, random_state=10)
print(len(X_train), len(X_test))

53460 5940


In [24]:
X_train[:5]

Unnamed: 0,id,existing year,gps_height,longitude,latitude,num_private,population,installer_ces,installer_community,installer_danida,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
6191,32154,25.0,147,37.798787,-7.378011,0,2000.0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
7705,6131,0.0,0,36.361102,-6.125086,0,0.0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
24710,34216,2.0,1806,35.581231,-3.757245,0,255.0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1464,2744,2.0,1269,37.886287,-4.390882,0,80.0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
44355,36293,0.0,0,33.412691,-2.986055,0,0.0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [25]:
y_train[:5]

6191     functional
7705     functional
24710    functional
1464     functional
44355    functional
Name: status_group, dtype: object

### Random Forest

- Train and Score models

In [209]:
X_test.min()[:5]

id               20.000000
existing year    -2.000000
gps_height      -55.000000
longitude         0.000000
latitude        -11.564324
dtype: float64

In [210]:
X_test.max()[:5]

id               7.422700e+04
existing year    5.300000e+01
gps_height       2.585000e+03
longitude        4.032340e+01
latitude        -2.000000e-08
dtype: float64

In [211]:
(X_train - X_train.min())[:5]

Unnamed: 0,id,existing year,gps_height,longitude,latitude,num_private,population,installer_ces,installer_community,installer_danida,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
6191,32154.0,32.0,237.0,37.798787,4.271429,0.0,2000.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7705,6131.0,7.0,90.0,36.361102,5.524355,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
24710,34216.0,9.0,1896.0,35.581231,7.892195,0.0,255.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1464,2744.0,9.0,1359.0,37.886287,7.258558,0.0,80.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
44355,36293.0,7.0,90.0,33.412691,8.663386,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [180]:
# Nomilisation
# X_test=(X_test - X_test.min())/(X_test.max() - X_test.min())
# X_train=(X_train - X_train.min())/(X_train.max() - X_train.min())

In [31]:
modelRFC = RandomForestClassifier(n_estimators=1000,min_samples_split=10,max_features=10,criterion='gini')

In [33]:
modelRFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
scoreRFC= modelRFC.score(X_test,y_test)

In [37]:
scoreRFC

0.8151515151515152

In [26]:
modelDTC = DecisionTreeClassifier()
modelDTC.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [27]:
scoreDTC= modelDTC.score(X_test,y_test)

In [32]:
modelETC = ExtraTreesClassifier(n_estimators=1000,min_samples_split=10,max_features=10)
modelETC.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=10, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [29]:
scoreETC= modelETC.score(X_test,y_test)

In [36]:
print('Random Forest' ,scoreRFC)
print('Decision Tree', scoreDTC)
print('Extra Trees Classifiers', scoreETC)

Random Forest 0.8151515151515152
Decision Tree 0.7395622895622895
Extra Trees Classifiers 0.8057239057239057


- Fit Model

In [38]:
df_test=pd.read_csv('new_data/testset_clean_dummy.csv')

In [39]:
df_test.shape

(14850, 108)

In [44]:
df_test.head(5)

Unnamed: 0,id,existing year,gps_height,longitude,latitude,num_private,population,installer_ces,installer_community,installer_danida,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,50785,1.0,1996,35.290799,-4.059696,0,321,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,51630,13.0,1569,36.656709,-3.309214,0,300,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,17168,3.0,1567,34.767863,-5.004344,0,500,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,45559,26.0,267,38.058046,-9.418672,0,250,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,49871,13.0,1260,35.006123,-10.950412,0,60,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [40]:
df_test.dtypes

id                                            int64
existing year                                object
gps_height                                    int64
longitude                                   float64
latitude                                    float64
num_private                                   int64
population                                    int64
installer_ces                                 int64
installer_community                           int64
installer_danida                              int64
installer_dwe                                 int64
installer_government                          int64
installer_hesawa                              int64
installer_individual                          int64
installer_kkkt                                int64
installer_other                               int64
installer_rwe                                 int64
installer_tcrs                                int64
installer_unknown                             int64
basin_Intern

In [50]:
df_test['status_group']=1.0

In [51]:
df_test.dtypes

id                                            int64
existing year                               float64
gps_height                                    int64
longitude                                   float64
latitude                                    float64
num_private                                   int64
population                                    int64
installer_ces                                 int64
installer_community                           int64
installer_danida                              int64
installer_dwe                                 int64
installer_government                          int64
installer_hesawa                              int64
installer_individual                          int64
installer_kkkt                                int64
installer_other                               int64
installer_rwe                                 int64
installer_tcrs                                int64
installer_unknown                             int64
basin_Intern

In [41]:
df_test['existing year'] = df_test['existing year'].replace({'unknown':0})
df_test['existing year'] = df_test['existing year'].apply(pd.to_numeric)

In [42]:
df_test['existing year'].dtypes

dtype('float64')

In [52]:
predict=modelRFC.predict(df_test)

In [53]:
predict[:5]

array(['non functional', 'non functional', 'functional', 'non functional',
       'non functional'], dtype=object)

In [54]:
df_test['status_group']=predict

In [55]:
df_test[['id','status_group']].head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,non functional
2,17168,functional
3,45559,non functional
4,49871,non functional


In [56]:
df_test[['id','status_group']].to_csv('outputRFC.csv',index=False)