In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA
import category_encoders as ce
from heamy.dataset import Dataset
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.preprocessing import normalize
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
CACHE=False
ID = 'Id'
TARGET = 'Cover_Type'
NFOLDS = 5
SEED = 4
np.random.seed(SEED)

In [3]:
rf_params = {
    'n_estimators': 400,
    'criterion': 'entropy',
    'random_state': 4
}

rf1_params = {
    'n_estimators': 400,
    'criterion': 'gini',
    'random_state': 4
}

et1_params = {
    'n_estimators': 400,
    'criterion': 'gini',
    'random_state': 4
}

et_params = {
    'n_estimators': 500,
    'criterion': 'entropy',
    'random_state': 4
}

et2_params = {
    'n_estimators': 600,
    'criterion': 'gini',
    'random_state': 4
}

lgb_params = {
    'n_estimators': 700, 
    'learning_rate':0.1
}

rf2_params = {
    'n_estimators': 200,
    'criterion': 'entropy',
    'random_state': 0
}

rf3_params = {
    'n_estimators': 200,
    'criterion': 'gini',
    'random_state': 0
}

et3_params = {
    'n_estimators': 200,
    'criterion': 'gini',
    'random_state': 0
}

et4_params = {
    'n_estimators': 200,
    'criterion': 'entropy',
    'random_state': 0
}

lgb1_params = {
    'n_estimators': 200, 
    'learning_rate':0.1
}

logr_params = {
        'solver' : 'liblinear',
        'multi_class' : 'ovr',
        'C': 1,
        'random_state': 0
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'subsample': 0.6,
    'learning_rate': 0.05,
    'objective': 'multi:softprob',
    'num_class': 7,        
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mlogloss',
}

xg_params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'learning_rate': 0.1,
        'objective': 'multi:softprob',   
        'num_class': 7,
        'max_depth': 4,
        'min_child_weight': 1,
        'eval_metric': 'mlogloss',
        'nrounds': 200
}    

In [4]:
def make_submission(predictions):
    submission = pd.read_csv('sampleSubmission.csv')
    submission['Cover_Type'] = predictions
    submission.to_csv('submission.csv', index=None)

In [5]:
def xgb_first(X_train, y_train, X_test, y_test=None):
    X_train = xgb.DMatrix(X_train, label=y_train)
    model = xgb.train(xg_params, X_train, xg_params['nrounds'])
    return model.predict(xgb.DMatrix(X_test))

In [6]:
ELU_CODE = {
    1:2702,2:2703,3:2704,4:2705,5:2706,6:2717,7:3501,8:3502,9:4201,
    10:4703,11:4704,12:4744,13:4758,14:5101,15:5151,16:6101,17:6102,
    18:6731,19:7101,20:7102,21:7103,22:7201,23:7202,24:7700,25:7701,
    26:7702,27:7709,28:7710,29:7745,30:7746,31:7755,32:7756,33:7757,
    34:7790,35:8703,36:8707,37:8708,38:8771,39:8772,40:8776
}

In [7]:
def climatic_zone(input_df):
    df = input_df.copy()
    df['Climatic_Zone'] = input_df['Soil_Type'].apply(
        lambda x: int(str(ELU_CODE[x])[0])
    )
    return df

In [8]:
def add_pca_features(train, test):
    train_temp = train[['Horizontal_Distance_To_Roadways' , 'Elevation', 'Horizontal_Distance_To_Fire_Points']]
    test_temp = test[['Horizontal_Distance_To_Roadways' , 'Elevation', 'Horizontal_Distance_To_Fire_Points']]
    train_temp_scaled = (train_temp - train_temp.mean(axis=0)) / train_temp.std(axis=0)
    test_temp_scaled = (test_temp - train_temp.mean(axis=0)) / train_temp.std(axis=0)
    pca = PCA(n_components=2)
    train_temp_pca = pca.fit_transform(train_temp_scaled)
    test_temp_pca = pca.transform(test_temp_scaled)
    train_temp_pca = pd.DataFrame(train_temp_pca, columns=['Roads_Fire_Elevation_pca_1', 'Roads_Fire_Elevation_pca_2'], index=train.index)
    test_temp_pca = pd.DataFrame(test_temp_pca, columns=['Roads_Fire_Elevation_pca_1', 'Roads_Fire_Elevation_pca_2'], index=test.index)
    train = pd.concat([train, train_temp_pca], axis=1)
    test = pd.concat([test, test_temp_pca], axis=1)
    
    train_temp = train[['Elevation', 'Horizontal_Distance_To_Fire_Points']]
    test_temp = test[['Elevation', 'Horizontal_Distance_To_Fire_Points']]
    train_temp_scaled = (train_temp - train_temp.mean(axis=0)) / train_temp.std(axis=0)
    test_temp_scaled = (test_temp - train_temp.mean(axis=0)) / train_temp.std(axis=0)
    pca = PCA(n_components=2)
    train_temp_pca = pca.fit_transform(train_temp_scaled)
    test_temp_pca = pca.transform(test_temp_scaled)
    train_temp_pca = pd.DataFrame(train_temp_pca, columns=['Fire_Elevation_pca_1', 'Fire_Elevation_pca_2'], index=train.index)
    test_temp_pca = pd.DataFrame(test_temp_pca, columns=['Fire_Elevation_pca_1', 'Fire_Elevation_pca_2'], index=test.index)
    train = pd.concat([train, train_temp_pca], axis=1)
    test = pd.concat([test, test_temp_pca], axis=1)
    
    train_temp = train[['Horizontal_Distance_To_Roadways' , 'Horizontal_Distance_To_Fire_Points']]
    test_temp = test[['Horizontal_Distance_To_Roadways' , 'Horizontal_Distance_To_Fire_Points']]
    train_temp_scaled = (train_temp - train_temp.mean(axis=0)) / train_temp.std(axis=0)
    test_temp_scaled = (test_temp - train_temp.mean(axis=0)) / train_temp.std(axis=0)
    pca = PCA(n_components=2)
    train_temp_pca = pca.fit_transform(train_temp_scaled)
    test_temp_pca = pca.transform(test_temp_scaled)
    train_temp_pca = pd.DataFrame(train_temp_pca, columns=['Roads_Fire_pca_1', 'Roads_Fire_pca_2'], index=train.index)
    test_temp_pca = pd.DataFrame(test_temp_pca, columns=['Roads_Fire_pca_1', 'Roads_Fire_pca_2'], index=test.index)
    train = pd.concat([train, train_temp_pca], axis=1)
    test = pd.concat([test, test_temp_pca], axis=1)
    return train, test

In [9]:
def preprocess1(df):
    df['Ele_minus_VDtHyd'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']
    df['Hydro_plus_Fire'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_minus_Fire'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_plus_Road'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways']
    df['Hydro_minus_Road'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways']
    df['Fire_plus_Road'] = df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways']
    df['Fire_minus_Road'] = df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways']
    df["Elevation_minus_Road_02"] = df["Elevation"] - .02*df["Horizontal_Distance_To_Roadways"]
    df["Elevation_hd"] = df["Elevation"] - .2*df["Horizontal_Distance_To_Hydrology"]
    df['Hillshade'] = df['Hillshade_9am'] + df['Hillshade_3pm'] + df['Hillshade_Noon']
    df['Soil_Type'] = 0
    for i in range(1,41):
        df['Soil_Type'] += i * df[f'Soil_Type{i}']
    df = climatic_zone(df)
    df['Soil_Type'] = df['Soil_Type'].astype('str') 
    df['Vertical_Distance_To_Hydrology_sqrt'] = np.sqrt(np.abs(df['Vertical_Distance_To_Hydrology']))
    df['Horizontal_Distance_To_Hydrology_sqrt'] = np.sqrt(np.abs(df['Horizontal_Distance_To_Hydrology']))
    df['Horizontal_Distance_To_Fire_Points_log'] = np.log(1 + np.abs(df['Horizontal_Distance_To_Fire_Points']))
    df['Horizontal_Distance_To_Roadways_log'] = np.log(1 + np.abs(df['Horizontal_Distance_To_Roadways']))
    df['Wilderness_Area'] = 0
    for i in range(1, 5):
        df['Wilderness_Area'] += i * df[f'Wilderness_Area{i}']
    
    df['Wilderness_Area'] = df['Wilderness_Area'].astype('str') 
    
    df.drop(['Hillshade_3pm'], axis=1, inplace=True)
    df.drop(['Vertical_Distance_To_Hydrology'], axis=1, inplace=True)
    df.drop(columns = [f'Soil_Type{i}' for i in range(1, 41)], inplace = True)
    df.drop(columns = [f'Wilderness_Area{i}' for i in range(1, 5)], inplace = True)
    return df

In [10]:
def add_target_enc_features(train, test):
    enc = ce.OneHotEncoder().fit(train['Cover_Type'].astype(str))
    y_onehot = enc.transform(train['Cover_Type'].astype(str))
    class_names = y_onehot.columns
    train_obj = train.select_dtypes('object')
    test_obj = test.select_dtypes('object')
    train = train.select_dtypes(exclude='object')
    test = test.select_dtypes(exclude='object')
    for class_ in class_names:
        target_encoder = ce.TargetEncoder(smoothing=0)
        target_encoder.fit(train_obj, y_onehot[class_])
        temp_train = target_encoder.transform(train_obj)
        temp_test = target_encoder.transform(test_obj)
        temp_train.columns=[str(x)+'_'+str(class_) for x in temp_train.columns]
        temp_test.columns = [str(x) + '_' + str(class_) for x in temp_test.columns]
        train = pd.concat([train, temp_train], axis=1)
        test = pd.concat([test, temp_test], axis=1)   
    return train, test

In [11]:
def preprocess_dataset1():
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

    y_train = train[TARGET].ravel() - 1

    train.drop([ID], axis=1, inplace=True)
    test.drop([ID], axis=1, inplace=True)
    
    train = preprocess1(train)
    test = preprocess1(test)
    train, test = add_pca_features(train, test)
    train, test = add_target_enc_features(train, test)    
    train.drop([TARGET], axis=1, inplace=True)  

    x_train = train.values
    x_test = test.values

    return {'X_train': x_train, 'X_test': x_test, 'y_train': y_train}

In [12]:
def preprocess2(df):
    df['Hydro_plus_Fire'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_minus_Fire'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_plus_Road'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways']
    df['Hydro_minus_Road'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways']
    df['Fire_plus_Road'] = df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways']
    df['Fire_minus_Road'] = df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways']
    df['Ele_plus_VDtHyd'] = df['Elevation'] + df['Vertical_Distance_To_Hydrology']
    df['Ele_minus_VDtHyd'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']
    df['Hydro_plus_Fire_avg'] = df['Hydro_plus_Fire'] / 2
    df['Hydro_minus_Fire_avg'] = df['Hydro_minus_Fire'] / 2
    df['Hydro_plus_Road_avg'] = df['Hydro_plus_Road'] / 2
    df['Hydro_minus_Road_avg'] = df['Hydro_minus_Road'] / 2
    df['Fire_plus_Road_avg'] = df['Fire_plus_Road'] / 2
    df['Fire_minus_Road_avg'] = df['Fire_minus_Road'] / 2
    df['Ele_plus_VDtHyd_avg'] = df['Ele_plus_VDtHyd'] / 2
    df['Ele_minus_VDtHyd_avg'] = df['Ele_minus_VDtHyd'] / 2
    df['slope_hyd_sqrt'] = (df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5
    df['slope_hyd_sqrt'] = df['slope_hyd_sqrt'].map(lambda x: 0 if np.isinf(x) else x)
    df['Amenities_avg'] = (df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways']) / 3 
    df['Shadiness_morn_noon'] = df['Hillshade_9am'] / (df['Hillshade_Noon'] + 1)
    df['Shadiness_noon_3pm'] = df['Hillshade_Noon'] / (df['Hillshade_3pm'] + 1)
    df['Shadiness_morn_3'] = df['Hillshade_9am'] / (df['Hillshade_3pm'] + 1)
    df['Shadiness_morn_avg'] = (df['Hillshade_9am'] + df['Hillshade_Noon']) / 2
    df['Shadiness_afternoon'] = (df['Hillshade_Noon'] + df['Hillshade_3pm']) / 2
    df['Shadiness_mean_hillshade'] =  (df['Hillshade_9am'] + df['Hillshade_Noon'] + df['Hillshade_3pm'] ) / 3    
    df["Hillshade-9_Noon_diff"] = df["Hillshade_9am"] - df["Hillshade_Noon"]
    df["Hillshade-noon_3pm_diff"] = df["Hillshade_Noon"] - df["Hillshade_3pm"]
    df["Hillshade-9am_3pm_diff"] = df["Hillshade_9am"] - df["Hillshade_3pm"]
    df["SlopeElevation"] = df["Slope"] * df["Elevation"]
    df["Vertical_Distance_To_Hydrology"] = abs(df['Vertical_Distance_To_Hydrology'])
    df['Neg_Elev_Hyd'] = df.Elevation-df.Horizontal_Distance_To_Hydrology*0.2

    return df

In [13]:
def preprocess_dataset2():
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

    y_train = train[TARGET].ravel() - 1
        
    train.drop([ID, TARGET], axis=1, inplace=True)
    test.drop([ID], axis=1, inplace=True)
    
    train = preprocess2(train)    
    test = preprocess2(test)    
    
    cols_to_normalize = [ 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 
                       'Horizontal_Distance_To_Fire_Points', 
                       'Shadiness_morn_noon', 'Shadiness_noon_3pm', 'Shadiness_morn_3',
                       'Shadiness_morn_avg', 
                       'Shadiness_afternoon', 
                       'Shadiness_mean_hillshade',
                       'Hydro_plus_Fire', 'Hydro_minus_Fire', 
                       'Hydro_plus_Road', 'Hydro_minus_Road', 
                       'Fire_plus_Road', 'Fire_minus_Road'
                       ]

    train[cols_to_normalize] = normalize(train[cols_to_normalize])
    test[cols_to_normalize] = normalize(test[cols_to_normalize])   
    
    x_train = train.values
    x_test = test.values

    return {'X_train': x_train, 'X_test': x_test, 'y_train': y_train}

In [14]:
def create_pipelines(dataset1, dataset2):
    rf = Classifier(dataset=dataset1, estimator = RandomForestClassifier, use_cache=CACHE, parameters=rf_params,name='rf')
    et = Classifier(dataset=dataset1, estimator = ExtraTreesClassifier, use_cache=CACHE, parameters=et_params,name='et')   
    rf1 = Classifier(dataset=dataset1, estimator=RandomForestClassifier, use_cache=CACHE, parameters=rf1_params,name='rf1')
    et1 = Classifier(dataset=dataset1, use_cache=CACHE, estimator=ExtraTreesClassifier, parameters=et1_params,name='et1')
    lgbc = Classifier(dataset=dataset1, estimator=LGBMClassifier, use_cache=CACHE, parameters=lgb_params,name='lgbc')
    et2 = Classifier(dataset=dataset1,estimator=ExtraTreesClassifier, parameters=et2_params, use_cache=CACHE, name='et2')
    xgb1 = Classifier(estimator=xgb_first, dataset=dataset1, use_cache=CACHE, name='xgb1')
    
    rf2 = Classifier(dataset=dataset2, estimator = RandomForestClassifier, use_cache=CACHE, parameters=rf2_params,name='rf2')
    et3 = Classifier(dataset=dataset2, estimator = ExtraTreesClassifier, use_cache=CACHE, parameters=et3_params,name='et3')   
    rf4 = Classifier(dataset=dataset2, estimator=RandomForestClassifier, use_cache=CACHE, parameters=rf3_params,name='rf4')
    et4 = Classifier(dataset=dataset2, use_cache=CACHE, estimator=ExtraTreesClassifier, parameters=et4_params,name='et4')
    lgbc2 = Classifier(dataset=dataset2, estimator=LGBMClassifier, use_cache=CACHE, parameters=lgb1_params,name='lgbc2')
    gnb = Classifier(dataset=dataset2, estimator=GaussianNB, use_cache=CACHE, name='gnb')
    logr = Classifier(dataset=dataset2, estimator=LogisticRegression, use_cache=CACHE, parameters=logr_params,name='logr')
    xgb2 = Classifier(estimator=xgb_first, dataset=dataset2, use_cache=CACHE, name='xgb2')
    
    return ModelsPipeline(rf, et, rf1, et1, lgbc, et2, xgb1), ModelsPipeline(rf2, et3, rf4, et4, lgbc2, gnb, logr, xgb2)

In [15]:
dataset1 = Dataset(preprocessor=preprocess_dataset1, use_cache=True)
dataset2 = Dataset(preprocessor=preprocess_dataset2, use_cache=True)

In [16]:
pipeline1, pipeline2 = create_pipelines(dataset1, dataset2)

In [17]:
stack_ds1 = pipeline1.stack(k=NFOLDS,seed=SEED)
stack_ds2 = pipeline2.stack(k=NFOLDS,seed=SEED)

Parameters: { "nrounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "nrounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "nrounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "nrounds" } might not be used.

  This could

In [18]:
def create_model_lvl2(stack_ds):
    dtrain = xgb.DMatrix(stack_ds.X_train, label=stack_ds.y_train)
    dtest = xgb.DMatrix(stack_ds.X_test)

    res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, 
             nfold=NFOLDS, seed=SEED, stratified=True,
             early_stopping_rounds=20, show_stdv=False)

    best_nrounds = res.shape[0] - 1
    model = xgb.train(xgb_params, dtrain, best_nrounds)
    return model.predict(dtest)

In [19]:
xpreds_proba1 = create_model_lvl2(stack_ds1)
xpreds_proba2 = create_model_lvl2(stack_ds2)

In [20]:
xpreds_proba_final = np.mean([xpreds_proba1, xpreds_proba2], axis=0)

In [21]:
X_train, X_test, y = dataset1.X_train, dataset1.X_test, dataset1.y_train + 1

X_train_1_2 = X_train[y <= 2]
X_train_3_6 = X_train[(y==3) | (y==6)]

y_1_2 = y[y <= 2]
y_3_6 = y[(y==3) | (y==6)]

In [22]:
clf_1_2 = ExtraTreesClassifier(n_estimators=200,n_jobs=-1,random_state=0)
clf_1_2.fit(X_train_1_2, y_1_2)

clf_3_6 = ExtraTreesClassifier(n_estimators=200,n_jobs=-1,random_state=0)
clf_3_6.fit(X_train_3_6, y_3_6)

preds_1_2 = clf_1_2.predict_proba(X_test)
preds_3_6 = clf_3_6.predict_proba(X_test)
preds = xpreds_proba_final

preds[:, 0] += preds_1_2[:, 0] / 1.3
preds[:, 1] += preds_1_2[:, 1] / 1.1
preds[:, 2] += preds_3_6[:, 0] / 3.4
preds[:, 5] += preds_3_6[:, 1] / 3.6

In [23]:
predictions = np.round(np.argmax(preds, axis=1)).astype(int) + 1
make_submission(predictions)