In [67]:
import numpy as np # support for multi-dimensional arrays and matrices
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

In [68]:
from sklearn.metrics import mean_absolute_error

In [69]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, VarianceThreshold

In [70]:
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')
test_features = pd.read_csv('dengue_features_test.csv')

# filling in missing data

train_features.fillna(method='bfill', inplace=True)

test_features.fillna(method='bfill', inplace=True)

train_labels = train_labels.total_cases

# dropping the city feature and the other features previously used to describe time, now that I have a timestamp

train_features = train_features.drop(train_features.columns[[3]], axis=1)

test_features = test_features.drop(test_features.columns[[3]], axis=1)

In [71]:
cities = []
for city in train_features.city:
    if city == 'sj':
        cities.append(0)
    else:
        cities.append(1)

cities_t = []
for city in test_features.city:
    if city == 'sj':
        cities_t.append(0)
    else:
        cities_t.append(1)

In [72]:
train_features = train_features.drop(train_features.columns[[0]], axis=1)
test_features = test_features.drop(test_features.columns[[0]], axis=1)

train_features.city = cities
test_features.city = cities_t

In [73]:
X = train_features
y = train_labels
X_test = test_features

In [74]:
# remove constant columns (std = 0)
remove = []
for col in X.columns:
    if X[col].std() == 0:
        remove.append(col)

X.drop(remove, axis=1, inplace=True)
X_test.drop(remove, axis=1, inplace=True)


print(X.shape, X_test.shape)

(1456, 22) (416, 22)


In [78]:
Cols = X.columns.values.tolist()
clf = GradientBoostingRegressor(random_state = 8001)

selector = clf.fit(X, y)
importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
train = fs.transform(X)
test = fs.transform(X_test)
print(X.shape, X_test.shape)

(1456, 22) (416, 22)


In [79]:
selectedCols = X.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]
X = pd.DataFrame(X)
X_test = pd.DataFrame(X_test)
X.columns = sortedCols
X_test.columns = sortedCols

print(sortedCols[0:10])

['reanalysis_relative_humidity_percent', 'ndvi_sw', 'station_max_temp_c', 'reanalysis_sat_precip_amt_mm', 'station_min_temp_c', 'station_diur_temp_rng_c', 'precipitation_amt_mm', 'reanalysis_tdtr_k', 'station_precip_mm', 'reanalysis_dew_point_temp_k']


In [80]:
X = X.replace(np.inf, 999999)
X = X.replace(-np.inf, -999999)
X = X.replace(np.nan, -1)
X_test = X_test.replace(np.inf, 999999)
X_test = X_test.replace(-np.inf, -999999)
X_test = X_test.replace(np.nan, -1)

In [82]:
# Second round of gradient boosting
Cols = X.columns.values.tolist()
clf = GradientBoostingRegressor(random_state=1729)
selector = clf.fit(X, y)

importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
X = fs.transform(X)
X_test = fs.transform(X_test)
print(X.shape, X_test.shape)

selectedCols = X.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]

(1456, 4) (416, 4)


In [83]:
import xgboost as xgb

In [84]:
from sklearn.cross_validation import KFold

In [96]:
# Create an empty array for prediction
predictedResult = np.zeros(X.shape[0])

# Split dataset into k = 10 consecutive folds
# Each fold is used once as a validation while the k - 1 remaining folds form the training set
kf = KFold(X.shape[0], n_folds=5)

testPred = []

for trainIndex, testIndex in kf:
    trainFold, testFold = X[trainIndex], X[testIndex]
    trainFoldTarget, testFoldTarget = y[trainIndex], y[testIndex]
    
    xgbc = xgb.XGBRegressor(n_estimators = 100, # number of boosted trees
                             learning_rate = 0.1, # step size shrinkage used in update to prevent overfitting
                             max_depth = 5, # maximum depth of a tree
                             subsample = 0.6815, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.701) # subsample features
    
    xgbc.fit(trainFold, trainFoldTarget)
    xgbpred =xgbc.predict(testFold)

    testPred.append(xgbc.predict(X_test))
    predictedResult[testIndex] = xgbpred
    
    # Print the MA
    print(mean_absolute_error(testFoldTarget, xgbpred))

37.5286182651
22.0121278435
17.5375971847
14.2960431313
12.1549193009


In [97]:
print(mean_absolute_error(y, predictedResult))
testPred = np.average(np.array(testPred), axis =0)
#pd.DataFrame({"ID": test_id, "TARGET": testPred}).to_csv('submission.csv',index=False)

20.7174152365


In [98]:
for pred in testPred:
    print(int(round(pred)))

4
5
4
5
6
5
13
11
21
15
14
10
21
30
33
28
16
68
25
36
56
64
38
42
33
37
54
36
28
26
25
14
20
13
19
17
20
21
19
18
14
17
13
13
9
7
6
1
10
3
2
3
5
2
1
2
2
2
4
6
9
15
52
27
38
38
51
52
41
53
59
33
60
41
32
29
57
49
61
44
29
32
15
9
15
1
8
-1
11
19
15
19
14
15
12
9
13
4
3
4
4
2
2
1
1
4
3
3
12
19
23
34
6
10
32
31
15
16
47
66
46
61
40
30
56
46
52
47
43
37
53
25
37
18
17
15
7
13
9
16
16
19
19
17
18
14
11
11
10
4
4
3
3
6
3
2
3
4
5
2
3
3
5
5
10
19
8
14
17
26
19
43
53
31
42
48
42
30
24
52
53
34
37
53
22
18
23
15
37
16
10
4
20
19
24
16
12
15
12
12
8
9
4
12
11
4
2
2
1
2
3
1
3
4
9
21
22
28
15
12
15
17
38
23
42
50
15
46
47
27
59
102
68
30
32
45
25
16
18
14
13
13
8
15
15
16
19
12
15
19
12
8
7
6
4
2
2
3
3
1
6
3
4
12
6
4
5
9
4
12
22
8
10
17
31
21
123
39
13
13
14
11
14
14
11
9
1
12
17
14
14
14
12
16
14
12
12
7
5
6
3
3
4
5
4
2
6
5
4
3
5
3
3
4
4
7
5
5
9
6
6
8
9
17
12
12
14
15
33
11
13
9
15
9
10
12
15
1
1
16
13
16
15
23
16
12
10
10
9
6
7
5
2
3
6
4
4
3
4
2
4
4
3
4
6
3
4
7
5
5
7
9
7
10
7
8
10
12
18
19
21
15
