In [7]:
import numpy as np # support for multi-dimensional arrays and matrices
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

In [8]:
from sklearn.metrics import mean_absolute_error

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, VarianceThreshold

In [10]:
# Read the feature tables and the label table from CSV.
# Column 0 of every file becomes the row index; y.csv is read without a header row.
X, X_test = (pd.read_csv(csv_name, index_col=0) for csv_name in ('X.csv', 'X_test.csv'))
y = pd.read_csv('y.csv', index_col=0, header=None)

In [11]:
# New binary feature: 0 if cold (< 300 Kelvin), 1 if warm.

def is_warm(features, threshold_k=300, column='reanalysis_avg_temp_k'):
    """Flag every observation as cold (0) or warm (1) by its temperature.

    Parameters
    ----------
    features : pandas.DataFrame
        Table that contains the temperature column named by ``column``.
    threshold_k : float, default 300
        Observations strictly below this value are cold (0); all others
        are warm (1).
    column : str, default 'reanalysis_avg_temp_k'
        Name of the temperature column to read.

    Returns
    -------
    list of int
        One 0/1 flag per row of ``features``, in row order.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``features``.
    """
    is_cold = features[column] < threshold_k
    # Negating "cold" (rather than testing ">= threshold") means a missing
    # temperature, which compares False against the threshold, is flagged 1.
    return (~is_cold).astype(int).tolist()

# Compute the warm/cold flags and attach them as a 'warmth' column to the
# training and test features; the flag lists stay available under their own names.
warmth, warmth_test = is_warm(X), is_warm(X_test)

X['warmth'], X_test['warmth'] = warmth, warmth_test

In [12]:
# Remove constant columns.
# A column counts as constant when it holds exactly one distinct non-missing
# value. Counting distinct values avoids the exact float comparison
# `std() == 0`, which floating-point rounding can defeat (a column of repeated
# 0.1 can produce a tiny non-zero standard deviation).
remove = [col for col in X.columns if X[col].nunique() == 1]

# The decision is made on X only; the same columns are then dropped from both
# frames so that they keep identical columns.
X = X.drop(remove, axis=1)
X_test = X_test.drop(remove, axis=1)

print(X.shape, X_test.shape)

(1456, 23) (416, 23)


In [13]:
import xgboost as xgb

In [14]:
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; both classes live in sklearn.model_selection since 0.18.
# NOTE: model_selection.KFold is constructed with n_splits rather than the
# legacy (n, n_folds) arguments.
try:
    from sklearn.model_selection import GridSearchCV, KFold
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import KFold
    from sklearn.grid_search import GridSearchCV

#### Tuned Model

In [15]:
# Wrap the features (X) and labels (y) in xgboost's DMatrix container.
xgdmat = xgb.DMatrix(data=X, label=y)

In [85]:
# Grid Search CV optimized settings.
# The number of boosting rounds is set through `num_boost_round`; the
# scikit-learn-wrapper key `n_estimators` is not read by the native
# xgb.cv / xgb.train API, so it is not listed here.
params = {'eta': 0.01, 'seed': 1, 'subsample': 0.8, 'colsample_bytree': 0.8,
          'max_depth': 6, 'min_child_weight': 2}

# 5-fold cross-validation for at most 3000 rounds, with early stopping after
# 100 rounds without improvement.
cv_xgb = xgb.cv(params=params, dtrain=xgdmat, num_boost_round=3000, nfold=5,
                metrics=['rmse'],  # pass metrics inside a list to avoid issues
                early_stopping_rounds=100)  # look for the round that minimizes error

In [86]:
# Show the last five rows of the cross-validation history (tail defaults to 5 rows).
cv_xgb.tail()

Unnamed: 0,test-rmse-mean,test-rmse-std,train-rmse-mean,train-rmse-std
2159,17.070301,2.590781,0.844471,0.023097
2160,17.070374,2.590758,0.843436,0.022771
2161,17.070419,2.590831,0.842322,0.02262
2162,17.070367,2.590802,0.841245,0.022689
2163,17.070178,2.590633,0.840467,0.022633


In [88]:
# Train the final model with the round count selected by cross-validation.
# The CV history has one row per boosting round on a zero-based index and, with
# early stopping, ends at the best round, so the tuned number of rounds is the
# row count (last index + 1), not the last index itself.
final_gb = xgb.train(params, xgdmat, num_boost_round=len(cv_xgb))

### Tuned Model Predictions

In [89]:
# Wrap the test features in a DMatrix (no labels) for prediction.
testdmat = xgb.DMatrix(data=X_test)

In [90]:
# Predict with the trained booster on the test DMatrix.
y_pred = final_gb.predict(data=testdmat)

In [91]:
# Print every prediction rounded to the nearest integer, one value per line.
for rounded in (int(round(raw_pred)) for raw_pred in y_pred):
    print(rounded)

6
5
7
12
9
15
12
16
22
18
18
23
36
31
57
55
38
70
73
81
75
40
36
56
36
29
37
32
36
31
33
19
15
19
10
21
15
21
16
20
12
14
11
14
7
5
4
1
3
3
3
6
8
4
5
3
10
7
11
13
9
35
39
44
54
52
62
65
57
72
78
40
63
70
71
78
77
66
55
32
27
37
40
30
22
20
24
27
22
20
22
19
14
22
17
12
14
10
3
8
11
8
4
7
12
13
7
15
15
28
35
42
27
33
54
45
40
32
52
90
78
81
85
63
84
77
76
70
91
69
51
36
38
26
23
14
14
15
10
16
14
20
17
20
16
14
12
11
10
4
5
3
4
7
7
4
5
7
4
5
8
18
15
24
39
43
26
37
26
47
31
62
61
60
55
56
61
31
46
71
63
35
25
46
26
23
30
25
18
36
13
16
21
19
19
17
11
14
9
10
9
6
5
5
6
6
4
5
4
9
6
12
7
9
16
23
38
36
35
21
29
26
42
41
62
59
19
52
75
73
79
78
60
60
67
73
53
34
43
24
24
16
10
18
19
12
14
18
13
10
11
8
8
7
4
6
2
8
4
6
6
2
5
5
3
4
7
5
3
4
4
2
6
5
42
5
13
9
7
11
11
10
9
6
12
9
4
24
11
14
17
17
16
15
16
14
10
7
7
9
4
7
5
7
8
3
4
5
4
4
4
3
5
2
2
4
2
4
5
3
2
2
3
5
6
12
7
8
11
11
7
21
15
7
14
14
29
22
7
17
18
21
16
17
17
20
16
9
8
4
5
9
3
4
5
10
5
2
7
4
3
3
3
3
5
3
3
5
7
2
4
3
9
5
3
2
5
8
7
14
9
11