In [1]:
import numpy as np # support for multi-dimensional arrays and matrices
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.metrics import mean_absolute_error

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, VarianceThreshold

In [17]:
X = pd.read_csv('X.csv', index_col=0)
y = pd.read_csv('y.csv', header=None, index_col=0)
X_test = pd.read_csv('X_test.csv', index_col=0)

In [21]:
# new feature, 0 if cold (<300 Kelvin), 1 if warm

def is_warm(features):
    warm = []
    for observation in features['reanalysis_avg_temp_k']:
        if observation < 300:
            warm.append(0)
        else:
            warm.append(1)
    return warm

warmth = is_warm(X)
warmth_test = is_warm(X_test)

X['warmth'] = warmth
X_test['warmth'] = warmth_test

In [22]:
# remove constant columns (std = 0)
remove = []
for col in X.columns:
    if X[col].std() == 0:
        remove.append(col)

X.drop(remove, axis=1, inplace=True)
X_test.drop(remove, axis=1, inplace=True)


print(X.shape, X_test.shape)

(1456, 23) (416, 23)


#### Inspecting what a Boosting model selects as features

In [None]:
Cols = X.columns.values.tolist()
clf = GradientBoostingRegressor(random_state = 8001)

selector = clf.fit(X, y)
importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
X = fs.transform(X)
X_test = fs.transform(X_test)
print(train.shape, test.shape)

In [None]:
selectedCols = X.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]
X = pd.DataFrame(X)
X_test = pd.DataFrame(X_test)
X.columns = sortedCols
X_test.columns = sortedCols

print(sortedCols[0:10])

In [None]:
X = X.replace(np.inf, 999999)
X = X.replace(-np.inf, -999999)
X = X.replace(np.nan, -1)
X_test = X_test.replace(np.inf, 999999)
X_test = X_test.replace(-np.inf, -999999)
X_test = X_test.replace(np.nan, -1)

In [None]:
# Second round of gradient boosting
Cols = X.columns.values.tolist()
clf = GradientBoostingRegressor(random_state=1729)
selector = clf.fit(X, y)

importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
X = fs.transform(X)
X_test = fs.transform(X_test)
print(X.shape, X_test.shape)

selectedCols = X.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]

### Imports

In [8]:
import xgboost as xgb



In [9]:
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV



### Grid Search 1

In [None]:
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 100, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = -1) 

In [None]:
optimized_GBM.fit(X, y)

In [None]:
optimized_GBM.grid_scores_

### Grid Search 2

In [None]:
cv_params = {'max_depth': [6,7,8,9], 'min_child_weight': [3,5,7]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 100, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = -1) 

In [None]:
optimized_GBM.fit(X, y)

In [None]:
optimized_GBM.grid_scores_

We pick max_depth: 7 and min_child_weight: 5
Next we vary n_estimators, subsample, and colsample_bytree

### Grid Search 3

In [None]:
cv_params = {'n_estimators': [75,100,200,300], 'subsample': [0.7,0.8,0.9], 'colsample_bytree': [0.7,0.8,0.9]}
ind_params = {'learning_rate': 0.1, 'min_child_weight': 5, 'seed':0, 'max_depth': 7}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = -1) 

In [None]:
optimized_GBM.fit(X, y)

In [None]:
optimized_GBM.grid_scores_

#### Tuned Model

In [23]:
xgdmat = xgb.DMatrix(X, y)

In [24]:
params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
            'max_depth':7, 'min_child_weight':5, 'n_estimators': 300} 
# Grid Search CV optimized settings

cv_xgb = xgb.cv(params = params, dtrain = xgdmat, num_boost_round = 3000, nfold = 5,
                metrics = ['rmse'], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = 100) # Look for early stopping that minimizes error

In [25]:
cv_xgb.tail(5)

Unnamed: 0,test-rmse-mean,test-rmse-std,train-rmse-mean,train-rmse-std
257,18.613302,2.715311,0.502749,0.039431
258,18.61196,2.714799,0.496539,0.039945
259,18.61068,2.71418,0.490429,0.038934
260,18.610113,2.715391,0.484692,0.037856
261,18.610074,2.715261,0.478614,0.037918


In [26]:
our_params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'max_depth': 7, 'min_child_weight': 5} 

final_gb = xgb.train(our_params, xgdmat, num_boost_round = 261)

### Tuned Model Predictions

In [27]:
testdmat = xgb.DMatrix(X_test)

In [28]:
y_pred = final_gb.predict(testdmat)

In [29]:
for pred in y_pred:
    print(int(round(pred)))

4
5
5
13
9
15
9
21
28
18
18
22
45
28
39
44
43
68
55
65
67
39
35
60
34
25
33
31
26
32
29
16
21
21
11
20
16
17
12
13
8
12
11
12
5
2
4
2
4
3
4
6
7
4
4
4
9
6
13
10
6
40
46
45
54
61
57
69
50
75
83
44
58
51
70
70
74
63
48
26
23
34
34
20
15
10
22
11
24
15
29
16
12
16
11
8
16
10
1
8
7
8
5
4
16
13
8
17
14
28
39
49
26
30
48
50
39
32
49
74
79
96
78
56
69
71
64
49
71
75
59
26
38
19
17
16
7
18
10
15
19
14
17
14
13
10
10
9
7
4
3
3
3
6
4
5
6
10
5
6
14
16
23
15
31
32
30
34
19
33
27
64
60
51
56
52
51
27
24
66
57
20
11
33
19
20
30
23
8
27
10
10
20
13
15
18
7
12
6
9
8
4
4
4
9
2
2
9
4
8
9
13
3
7
15
19
40
34
31
16
18
24
41
30
53
47
15
53
62
62
75
70
43
53
51
85
50
29
23
20
19
12
11
12
17
10
12
13
11
10
9
7
6
7
3
6
2
6
6
4
5
3
6
5
2
4
5
4
2
6
5
0
6
1
49
5
15
8
6
13
15
16
5
3
14
3
7
25
14
14
21
14
18
16
17
15
7
4
8
11
4
6
6
6
13
5
3
2
4
2
2
2
3
2
1
4
2
4
6
4
3
3
3
8
11
14
6
4
5
12
11
19
14
6
16
18
34
29
4
19
17
17
19
14
18
20
15
10
3
0
3
7
4
2
4
11
4
1
3
3
1
4
3
4
2
2
3
5
5
4
2
3
10
5
4
1
4
9
10
11
12
17
11


### Initial Model (cross validation, no tuning grid)

In [None]:
# Create an empty array for prediction
predictedResult = np.zeros(X.shape[0])

# Split dataset into k = 10 consecutive folds
# Each fold is used once as a validation while the k - 1 remaining folds form the training set
kf = KFold(X.shape[0], n_folds=5)

testPred = []

for trainIndex, testIndex in kf:
    trainFold, testFold = X[trainIndex], X[testIndex]
    trainFoldTarget, testFoldTarget = y[trainIndex], y[testIndex]
    
    xgbc = xgb.XGBRegressor(n_estimators = 300, # number of boosted trees
                             learning_rate = 0.1, # step size shrinkage used in update to prevent overfitting
                             max_depth = 7, # maximum depth of a tree
                             subsample = 0.8, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.8,
                           min_child_weight = 5) # subsample features
    
    xgbc.fit(trainFold, trainFoldTarget)
    xgbpred =xgbc.predict(testFold)

    testPred.append(xgbc.predict(X_test))
    predictedResult[testIndex] = xgbpred
    
    # Print the MA
    print(mean_absolute_error(testFoldTarget, xgbpred))

In [None]:
print(mean_absolute_error(y, predictedResult))
testPred = np.average(np.array(testPred), axis =0)
#pd.DataFrame({"ID": test_id, "TARGET": testPred}).to_csv('submission.csv',index=False)

In [None]:
for pred in testPred:
    print(int(round(pred)))