In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
df_train=pd.read_csv('data/trainset_clean_dummy.csv')

In [4]:
df_train.shape

(59400, 108)

In [16]:
df_train.head()

Unnamed: 0,id,amount_tsh,days_since_recorded,gps_height,population,status_group,funder_danida,funder_government,funder_hesawa,funder_other,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,69572,6000.0,995,1390,109,functional,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
1,8776,0.0,272,1399,280,functional,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
2,34310,25.0,281,686,250,functional,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
3,67743,0.0,309,263,58,non functional,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
4,19728,0.0,874,0,0,functional,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0


In [20]:
df_train.dtypes

id                                            int64
amount_tsh                                  float64
days_since_recorded                           int64
gps_height                                    int64
population                                    int64
status_group                                 object
funder_danida                                 int64
funder_government                             int64
funder_hesawa                                 int64
funder_other                                  int64
funder_rwssp                                  int64
funder_world_bank                             int64
installer_commu                               int64
installer_danida                              int64
installer_dwe                                 int64
installer_government                          int64
installer_other                               int64
installer_rwe                                 int64
basin_Internal                                int64
basin_Lake N

In [7]:
df_train['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [22]:
df_train['population'].value_counts()[:5]

0      21381
1       7025
200     1940
150     1892
250     1681
Name: population, dtype: int64

In [24]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Columns: 108 entries, id to waterpoint_type_group_other
dtypes: float64(1), int64(106), object(1)
memory usage: 48.9+ MB


### Split training datasets to training and validation sets 

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df_train.loc[:, df_train.columns != 'status_group'], df_train['status_group'], test_size=0.1, random_state=10)
print(len(X_train), len(X_test))

53460 5940


In [26]:
X_train[:5]

Unnamed: 0,id,amount_tsh,days_since_recorded,gps_height,population,funder_danida,funder_government,funder_hesawa,funder_other,funder_rwssp,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
6191,32154,500.0,997,147,2000,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
7705,6131,0.0,999,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
24710,34216,10.0,316,1806,255,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
1464,2744,200.0,252,1269,80,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
44355,36293,0.0,867,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0


In [27]:
y_train[:5]

6191     functional
7705     functional
24710    functional
1464     functional
44355    functional
Name: status_group, dtype: object

- Train and Score models(Validation)

In [28]:
X_test.min()[:5]

id                     20.0
amount_tsh              0.0
days_since_recorded     0.0
gps_height            -55.0
population              0.0
dtype: float64

In [29]:
X_test.max()[:5]

id                      74227.0
amount_tsh             250000.0
days_since_recorded      3533.0
gps_height               2585.0
population              30500.0
dtype: float64

In [130]:
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(50,), alpha=1e-5,
                        max_iter=150, random_state=0, activation='logistic',
                        learning_rate_init=0.02,warm_start=True)
clf.fit(X_train, y_train)                         

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50,), learning_rate='constant',
       learning_rate_init=0.02, max_iter=150, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=True)

In [131]:
score_nn= clf.score(X_test,y_test)

In [132]:
score_nn

0.5651515151515152

In [133]:
print('Neural network', score_nn)

Neural network 0.5651515151515152


In [146]:
parameters = {'solver': ['lbfgs'], 'max_iter': [1000], 'alpha': 10.0 ** -np.arange(5, 10), 'hidden_layer_sizes':np.arange(10, 15), 'random_state':[0,1,2,3]}
clf_grid = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1)


In [147]:
clf_grid.fit(X_train, y_train)                         



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'max_iter': [1000], 'alpha': array([1.e-05, 1.e-06, 1.e-07, 1.e-08, 1.e-09]), 'hidden_layer_sizes': array([10, 11, 12, 13, 14]), 'random_state': [0, 1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [148]:
score_nn_grid= clf_grid.score(X_test,y_test)

In [149]:
score_nn_grid

0.5501683501683502

- Fit Model

In [81]:
df_test=pd.read_csv('data/testset_clean_dummy.csv')

In [82]:
df_test.shape

(14850, 107)

In [83]:
df_test.head(5)

Unnamed: 0,id,amount_tsh,days_since_recorded,gps_height,population,funder_danida,funder_government,funder_hesawa,funder_other,funder_rwssp,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,50785,0.0,302,1996,321,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
1,51630,0.0,302,1569,300,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,17168,0.0,305,1567,500,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
3,45559,0.0,315,267,250,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,49871,500.0,251,1260,60,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0


In [84]:
df_test.dtypes

id                                            int64
amount_tsh                                  float64
days_since_recorded                           int64
gps_height                                    int64
population                                    int64
funder_danida                                 int64
funder_government                             int64
funder_hesawa                                 int64
funder_other                                  int64
funder_rwssp                                  int64
funder_world_bank                             int64
installer_commu                               int64
installer_danida                              int64
installer_dwe                                 int64
installer_government                          int64
installer_other                               int64
installer_rwe                                 int64
basin_Internal                                int64
basin_Lake Nyasa                              int64
basin_Lake R

In [134]:
predict_nn=clf.predict(df_test)

In [135]:
predict_nn[:10]

array(['functional', 'functional', 'functional', 'non functional',
       'functional', 'functional', 'non functional', 'non functional',
       'functional', 'functional'], dtype='<U23')

In [140]:
df_test['status_group']=predict_nn
df_test[['id','status_group']].to_csv('neural_net_output.csv',index=False)