In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [2]:
# Directory holding the competition CSV files. Set the FOREST_COVER_DATA_DIR
# environment variable to point at your own copy; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
DATA_DIR = Path(os.environ.get(
    'FOREST_COVER_DATA_DIR',
    r'C:\Users\mi\MLcourse\mlcourse.ai\data\forest-cover-type-prediction'))

# Both files use their 'Id' column as the row index.
train = pd.read_csv(DATA_DIR / 'train.csv', index_col='Id')
test = pd.read_csv(DATA_DIR / 'test.csv', index_col='Id')

In [3]:
# Transpose the first training row so its columns are listed vertically.
train.iloc[:1].T

Id,1
Elevation,2596
Aspect,51
Slope,3
Horizontal_Distance_To_Hydrology,258
Vertical_Distance_To_Hydrology,0
Horizontal_Distance_To_Roadways,510
Hillshade_9am,221
Hillshade_Noon,232
Hillshade_3pm,148
Horizontal_Distance_To_Fire_Points,6279


In [4]:
# Number of training rows per target class.
train.loc[:, 'Cover_Type'].value_counts()

7    2160
6    2160
5    2160
4    2160
3    2160
2    2160
1    2160
Name: Cover_Type, dtype: int64

In [5]:
def write_to_submission_file(predicted_labels, out_file,
                             target='Cover_Type', index_label="Id", init_index=15121):
    """Save predictions as a two-column submission CSV.

    Parameters
    ----------
    predicted_labels : array-like
        One predicted label per row. Lists, tuples, NumPy arrays and pandas
        Series are accepted; values are always used positionally.
    out_file : str, path object or writable text buffer
        Destination passed straight to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the index column.
    init_index : int
        Index value written for the first prediction; each following row
        counts up by one.
    """
    # Coerce to a plain array first: a list has no ``.shape``, and a Series
    # would be re-aligned on its own index by the DataFrame constructor
    # instead of being taken in order.
    labels = np.asarray(predicted_labels)
    row_ids = np.arange(init_index, labels.shape[0] + init_index)
    predicted_df = pd.DataFrame(labels, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

#### Perform train-test split

In [6]:
# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns='Cover_Type'),
    train['Cover_Type'],
    test_size=0.3,
    random_state=17,
)

#### Test logistic regression

In [7]:
# Multinomial logistic regression on standardised features.
# `multi_class='multinomial'` is no longer passed explicitly: with the lbfgs
# solver scikit-learn's default ('auto') already fits a multinomial model for
# a multi-class target, and the argument is deprecated in recent releases.
logit = LogisticRegression(C=1, solver='lbfgs', max_iter=500,
                           random_state=17, n_jobs=4)
# Scaling lives inside the pipeline, so it is fitted on whatever data the
# pipeline itself is fitted on.
logit_pipe = Pipeline([('scaler', StandardScaler()),
                       ('logit', logit)])

In [8]:
%%time
# Fit the scaler and the logistic regression on the training split.
logit_pipe.fit(X=X_train, y=y_train)

Wall time: 5.47 s


Pipeline(steps=[('scaler', StandardScaler()),
                ('logit',
                 LogisticRegression(C=1, max_iter=500,
                                    multi_class='multinomial', n_jobs=4,
                                    random_state=17))])

In [9]:
# Predicted classes for the held-out validation rows.
logit_val_pred = logit_pipe.predict(X=X_valid)

In [10]:
# Validation accuracy of the logistic-regression pipeline.
accuracy_score(y_true=y_valid, y_pred=logit_val_pred)

0.7067901234567902

In [11]:
# Baseline random forest: 100 trees, fixed seed, 4 parallel jobs.
first_forest = RandomForestClassifier(
    n_estimators=100,
    n_jobs=4,
    random_state=17,
)

In [12]:
%%time
# Train the forest on the raw (unscaled) training split.
first_forest.fit(X=X_train, y=y_train)

Wall time: 414 ms


RandomForestClassifier(n_jobs=4, random_state=17)

In [13]:
# Predicted classes for the held-out validation rows.
forest_val_pred = first_forest.predict(X=X_valid)

In [14]:
# Validation accuracy of the baseline random forest.
accuracy_score(y_true=y_valid, y_pred=forest_val_pred)

0.8602292768959435

In [15]:
# Ten features with the highest importance in the fitted random forest.
(
    pd.DataFrame(
        {'Importance': first_forest.feature_importances_},
        index=X_train.columns,
    )
    .sort_values(by='Importance', ascending=False)
    .head(10)
)

Unnamed: 0,Importance
Elevation,0.221297
Horizontal_Distance_To_Roadways,0.093678
Horizontal_Distance_To_Fire_Points,0.073004
Horizontal_Distance_To_Hydrology,0.062592
Hillshade_9am,0.052744
Vertical_Distance_To_Hydrology,0.052035
Aspect,0.050237
Hillshade_3pm,0.047294
Hillshade_Noon,0.045997
Wilderness_Area4,0.038577


In [16]:
# LightGBM baseline: only the seed is set, every other hyperparameter is left
# at the library default.
lgb_clf = LGBMClassifier(random_state=17)

In [17]:
%%time
# Train the default LightGBM model on the training split.
lgb_clf.fit(X=X_train, y=y_train)

Wall time: 722 ms


LGBMClassifier(random_state=17)

In [18]:
# Validation accuracy of the default LightGBM model.
accuracy_score(y_true=y_valid, y_pred=lgb_clf.predict(X=X_valid))

0.8591269841269841

#### Stage 1 of hyperparameter tuning: model complexity

In [19]:
# Stage-1 search space: only the parameters that bound tree size are varied.
param_grid = dict(
    num_leaves=[7, 15, 31, 63],
    max_depth=[3, 4, 5, 6, -1],
)

In [20]:
# Exhaustive 5-fold search over `param_grid`, run as 4 parallel jobs.
grid_searcher = GridSearchCV(
    estimator=lgb_clf,
    param_grid=param_grid,
    cv=5,
    n_jobs=4,
    verbose=1,
)

In [21]:
# Run the stage-1 search on the training split only; the validation split
# stays untouched for the check below.
grid_searcher.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   33.3s finished


GridSearchCV(cv=5, estimator=LGBMClassifier(random_state=17), n_jobs=4,
             param_grid={'max_depth': [3, 4, 5, 6, -1],
                         'num_leaves': [7, 15, 31, 63]},
             verbose=1)

In [22]:
# Best stage-1 parameter combination and its mean cross-validated score.
grid_searcher.best_params_, grid_searcher.best_score_

({'max_depth': -1, 'num_leaves': 63}, 0.8546862512757916)

In [23]:
# Validation accuracy of the best stage-1 model found by the grid search.
accuracy_score(y_true=y_valid, y_pred=grid_searcher.predict(X=X_valid))

0.8604497354497355

#### Stage 2 of hyperparameter tuning: convergence (learning rate)

In [24]:
# Stage 2: keep the tree shape selected in stage 1 and search the learning
# rate for a fixed number of boosting rounds.
num_iterations = 200
lgb_clf2 = LGBMClassifier(
    n_estimators=num_iterations,
    num_leaves=63,
    max_depth=-1,
    random_state=17,
    n_jobs=1,  # one thread per model; the grid search below runs 4 jobs
)

# Ten learning rates spaced logarithmically from 1e-3 to 1.
param_grid2 = dict(learning_rate=np.logspace(-3, 0, 10))
grid_searcher2 = GridSearchCV(
    estimator=lgb_clf2,
    param_grid=param_grid2,
    cv=5,
    n_jobs=4,
    verbose=1,
)
grid_searcher2.fit(X_train, y_train)

# Cross-validated optimum first, then accuracy on the held-out validation split.
print(grid_searcher2.best_params_, grid_searcher2.best_score_)
print(accuracy_score(y_valid, grid_searcher2.predict(X_valid)))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.2min finished


{'learning_rate': 0.21544346900318823} 0.8604495250885575
0.8692680776014109


In [25]:
# Final model: the stage-1 tree shape, 200 boosting rounds and a learning rate
# of 0.2 (presumably the stage-2 optimum, rounded).
# `random_state=17` matches the seed used by every other model in this
# notebook, so the submission can be reproduced from a fresh kernel.
final_lgb = LGBMClassifier(n_estimators=200, num_leaves=63,
                           learning_rate=0.2, max_depth=-1,
                           random_state=17, n_jobs=4)

In [26]:
%%time
# Refit on every labelled row (training and validation splits together).
final_lgb.fit(X=train.drop(columns='Cover_Type'), y=train['Cover_Type'])

Wall time: 3.18 s


LGBMClassifier(learning_rate=0.2, n_estimators=200, n_jobs=4, num_leaves=63)

In [27]:
%%time
# Predicted cover type for every row of the test file.
lgb_final_pred = final_lgb.predict(X=test)

Wall time: 36.3 s


In [28]:
# Write the test predictions to the submission CSV in the working directory.
write_to_submission_file(
    predicted_labels=lgb_final_pred,
    out_file='lgb_forest_cover_type.csv',
)

#### Kaggle public leaderboard accuracy: 0.76851 (noticeably below the ≈0.87 hold-out accuracy measured during tuning).

#### Feature importance:

In [29]:
# Ten features with the highest importance in the final LightGBM model.
# `X_train.columns` is used for the labels: it has the same columns as the
# full training frame the model was fitted on (both drop only 'Cover_Type').
(
    pd.DataFrame(
        {'Importance': final_lgb.feature_importances_},
        index=X_train.columns,
    )
    .sort_values(by='Importance', ascending=False)
    .head(10)
)

Unnamed: 0,Importance
Horizontal_Distance_To_Roadways,11804
Horizontal_Distance_To_Fire_Points,11792
Elevation,11222
Vertical_Distance_To_Hydrology,7161
Horizontal_Distance_To_Hydrology,6829
Aspect,5757
Hillshade_Noon,5457
Hillshade_3pm,5184
Hillshade_9am,5135
Slope,3802
