In [4]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
train = pd.read_csv('data/trainset_clean_dummy.csv')

In [6]:
train.head()

Unnamed: 0,id,existing year,gps_height,longitude,latitude,num_private,population,status_group,installer_ces,installer_community,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,69572,12.0,1390,34.938093,-9.856322,0,109.0,functional,0,0,...,0,1,0,0,0,1,0,0,0,0
1,8776,3.0,1399,34.698766,-2.147466,0,280.0,functional,0,0,...,0,0,1,0,0,1,0,0,0,0
2,34310,4.0,686,37.460664,-3.821329,0,250.0,functional,0,1,...,0,0,1,0,0,1,0,0,0,0
3,67743,27.0,263,38.486161,-11.155298,0,58.0,non functional,0,0,...,0,1,0,0,0,1,0,0,0,0
4,19728,unknown,0,31.130847,-1.825359,0,<built-in function zeros>,functional,0,0,...,0,0,1,0,0,1,0,0,0,0


In [20]:
train.dtypes

id                                            int64
existing year                                object
gps_height                                    int64
longitude                                   float64
latitude                                    float64
num_private                                   int64
population                                   object
status_group                                 object
installer_ces                                 int64
installer_community                           int64
installer_danida                              int64
installer_dwe                                 int64
installer_government                          int64
installer_hesawa                              int64
installer_individual                          int64
installer_kkkt                                int64
installer_other                               int64
installer_rwe                                 int64
installer_tcrs                                int64
installer_un

In [23]:
train['population'].apply(pd.to_numeric)

0         109.0
1         280.0
2         250.0
3          58.0
4           0.0
5           1.0
6           0.0
7           0.0
8           0.0
9           0.0
10        345.0
11        250.0
12          0.0
13          1.0
14          0.0
15        200.0
16         35.0
17         50.0
18       1000.0
19          1.0
20          4.0
21          0.0
22        350.0
23        210.0
24        156.0
25        140.0
26        260.0
27          0.0
28          1.0
29          1.0
          ...  
59370       1.0
59371       0.0
59372       0.0
59373      96.0
59374       0.0
59375     609.0
59376       1.0
59377       0.0
59378      36.0
59379       0.0
59380      50.0
59381     360.0
59382       1.0
59383     800.0
59384       0.0
59385     200.0
59386    1000.0
59387     100.0
59388     500.0
59389    1500.0
59390     150.0
59391     210.0
59392       0.0
59393       0.0
59394      89.0
59395     125.0
59396      56.0
59397       0.0
59398       0.0
59399     150.0
Name: population, Length

In [24]:
train['existing year'].replace({'unknown':0}, inplace=True)

In [25]:
train['existing year'].apply(pd.to_numeric)

0        12.0
1         3.0
2         4.0
3        27.0
4         0.0
5         2.0
6         0.0
7         0.0
8         0.0
9         0.0
10        0.0
11       26.0
12        0.0
13        4.0
14        0.0
15       20.0
16       33.0
17       33.0
18       12.0
19       19.0
20        3.0
21        0.0
22       33.0
23        2.0
24        4.0
25       39.0
26        2.0
27        0.0
28       13.0
29       11.0
         ... 
59370     0.0
59371     0.0
59372     0.0
59373    25.0
59374     0.0
59375     6.0
59376    23.0
59377     0.0
59378     3.0
59379     0.0
59380    12.0
59381    19.0
59382     5.0
59383     5.0
59384     0.0
59385     1.0
59386     2.0
59387    13.0
59388    18.0
59389     2.0
59390    20.0
59391    46.0
59392     0.0
59393     0.0
59394     4.0
59395    14.0
59396    15.0
59397     0.0
59398     0.0
59399     9.0
Name: existing year, Length: 59400, dtype: float64

In [26]:
train['population'].value_counts()

0.0        21381
1.0         7025
200.0       1940
150.0       1892
250.0       1681
300.0       1476
100.0       1146
50.0        1139
500.0       1009
350.0        986
120.0        916
400.0        775
60.0         706
30.0         626
40.0         552
80.0         533
450.0        499
20.0         462
600.0        438
230.0        388
75.0         289
1000.0       278
800.0        269
90.0         265
130.0        264
25.0         255
320.0        249
35.0         245
360.0        222
140.0        215
           ...  
3250.0         1
895.0          1
693.0          1
11463.0        1
1221.0         1
1183.0         1
1025.0         1
538.0          1
1203.0         1
819.0          1
5300.0         1
1740.0         1
571.0          1
788.0          1
376.0          1
468.0          1
1306.0         1
3982.0         1
593.0          1
2813.0         1
2145.0         1
4208.0         1
1034.0         1
3241.0         1
3568.0         1
197.0          1
1274.0         1
966.0         

In [15]:
test = pd.read_csv('data/testset_clean_dummy.csv')

In [16]:
test.head()

Unnamed: 0,id,existing year,gps_height,longitude,latitude,num_private,population,installer_ces,installer_community,installer_danida,...,quantity_unknown,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,50785,1.0,1996,35.290799,-4.059696,0,321,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,51630,13.0,1569,36.656709,-3.309214,0,300,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,17168,3.0,1567,34.767863,-5.004344,0,500,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,45559,26.0,267,38.058046,-9.418672,0,250,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,49871,13.0,1260,35.006123,-10.950412,0,60,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [27]:
test.dtypes

id                                            int64
existing year                                object
gps_height                                    int64
longitude                                   float64
latitude                                    float64
num_private                                   int64
population                                    int64
installer_ces                                 int64
installer_community                           int64
installer_danida                              int64
installer_dwe                                 int64
installer_government                          int64
installer_hesawa                              int64
installer_individual                          int64
installer_kkkt                                int64
installer_other                               int64
installer_rwe                                 int64
installer_tcrs                                int64
installer_unknown                             int64
basin_Intern

In [29]:
test['existing year'].replace({'unknown':0}, inplace=True)

In [30]:
test['existing year'].apply(pd.to_numeric)

0         1.0
1        13.0
2         3.0
3        26.0
4        13.0
5        23.0
6         4.0
7        31.0
8        16.0
9        10.0
10        7.0
11       11.0
12        0.0
13        0.0
14       27.0
15       24.0
16        0.0
17       33.0
18        0.0
19       31.0
20        3.0
21        3.0
22        8.0
23       43.0
24        0.0
25        3.0
26        4.0
27        0.0
28        0.0
29       11.0
         ... 
14820    12.0
14821     6.0
14822    16.0
14823     0.0
14824     3.0
14825    35.0
14826     0.0
14827     0.0
14828     0.0
14829     8.0
14830    17.0
14831     0.0
14832    12.0
14833     8.0
14834     0.0
14835     0.0
14836     2.0
14837     3.0
14838    25.0
14839     6.0
14840     4.0
14841     0.0
14842     4.0
14843    18.0
14844     0.0
14845    23.0
14846    17.0
14847     3.0
14848     4.0
14849     5.0
Name: existing year, Length: 14850, dtype: float64

In [31]:
test['population'].value_counts()

0       5453
1       1757
150      436
200      430
250      406
300      366
50       298
100      273
350      266
500      265
120      219
400      207
30       153
60       150
80       149
40       132
450      117
600      113
230      102
20        96
90        81
800       78
130       67
35        66
25        65
320       62
75        62
70        61
15        58
1000      58
        ... 
956        1
1020       1
1140       1
1228       1
1420       1
444        1
2445       1
491        1
7600       1
2554       1
507        1
571        1
691        1
715        1
1655       1
859        1
7200       1
1115       1
1483       1
396        1
1523       1
3770       1
148        1
172        1
196        1
244        1
252        1
284        1
2365       1
7000       1
Name: population, Length: 637, dtype: int64

Split Training Set and Test Set

In [32]:
target = train['status_group']
features = train.drop('status_group', axis=1)
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2)

In [33]:
def SVCmodel(X_train, X_val, y_train, y_val, test):
    if __name__ == '__main__':

        pipe_svc = Pipeline([('scl', StandardScaler()),
                             ('clf', LinearSVC())])
    
        param_grid = {'clf__C':[0.001, 0.01, 0.1, 1.0],
                      'clf__class_weight':[None, 'balanced']}

        estimator = GridSearchCV(estimator=pipe_svc,
                                 param_grid=param_grid,
                                 n_jobs=-1)

        estimator.fit(X_train, y_train)

        best_params = estimator.best_params_
                                 
        validation_accuracy = estimator.score(X_val, y_val)
        print('Validation accuracy: ', validation_accuracy)
        print(best_params)

In [34]:
SVCmodel(X_train, X_val, y_train, y_val, test)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Validation accuracy:  0.7326599326599327
{'clf__C': 0.001, 'clf__class_weight': None}


  Xt = transform.transform(Xt)


In [37]:
def model_for_submission(features, target, test):
    if __name__ == '__main__':

        pipe_svc = Pipeline([('scl', StandardScaler()),
                             ('clf', LinearSVC())])
    
        param_grid = {'clf__C':[0.001],
                      'clf__class_weight':[None]}

        estimator = GridSearchCV(estimator=pipe_svc,
                                 param_grid=param_grid,
                                 n_jobs=-1)

        estimator.fit(features, target)        

        predictions = estimator.predict(test)

        data = {'id': test.id, 'status_group': predictions}

        submit = pd.DataFrame(data=data)

        vals_to_replace = {2:'functional', 1:'functional needs repair',
                           0:'non functional'}

        submit.status_group = submit.status_group.replace(vals_to_replace)        

        submit.to_csv('data/svm_predictions.csv', index=False)

In [38]:
model_for_submission(features, target, test)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


ValueError: operands could not be broadcast together with shapes (14850,108) (109,) (14850,108) 