In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test-full.csv")

## Reversing one-hot encoding soil

In [4]:
def split_numbers_chars(row): # copied this code 
    '''This function fetches the numerical characters at the end of a string
    and returns alphabetical character and numerical chaarcters respectively'''
    head = row.rstrip('0123456789')
    tail = row[len(head):]
    return head, tail

def reverse_one_hot_encode(dataframe, start_loc, end_loc, numeric_column_name):
    ''' this function takes the start and end location of the one-hot-encoded column set and numeric column name to be created as arguments
    1) transforms one-hot-encoded columns into one column consisting of column names with string data type
    2) splits string column into the alphabetical and numerical characters
    3) fetches numerical character and creates numeric column in the given dataframe
    '''
    dataframe['String_Column'] = (dataframe.iloc[:, start_loc:end_loc] == 1).idxmax(1)
    dataframe['Tuple_Column'] = dataframe['String_Column'].apply(split_numbers_chars)
    dataframe[numeric_column_name] = dataframe['Tuple_Column'].apply(lambda x: x[1]).astype('int64')
    dataframe.drop(columns=['String_Column','Tuple_Column'], inplace=True)

In [5]:
reverse_one_hot_encode(df, 15, 55, "Soil_Type")
reverse_one_hot_encode(df_test, 15, 55, "Soil_Type")

In [6]:
cols_to_drop = list(df.iloc[:, 15:55].columns)
df.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

## Feature Engineering

In [9]:
df['Euclidian_Distance_To_Hydrology'] = (df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5
df['Mean_Elevation_Vertical_Distance_Hydrology'] = (df['Elevation'] + df['Vertical_Distance_To_Hydrology'])/2
df['Mean_Distance_Hydrology_Firepoints'] = (df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points'])/2
df['Mean_Distance_Hydrology_Roadways'] = (df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])/2
df['Mean_Distance_Firepoints_Roadways'] = (df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])/2
#df['Slope_hydrology_percent'] = df['Vertical_Distance_To_Hydrology']/df['Horizontal_Distance_To_Hydrology']
#df.Slope_hydrology_percent=df.Slope_hydrology_percent.map(lambda x: 0 if np.isinf(x) else x) 

In [10]:
df_test['Euclidian_Distance_To_Hydrology'] = (df_test['Horizontal_Distance_To_Hydrology']**2 + df_test['Vertical_Distance_To_Hydrology']**2)**0.5
df_test['Mean_Elevation_Vertical_Distance_Hydrology'] = (df_test['Elevation'] + df_test['Vertical_Distance_To_Hydrology'])/2
df_test['Mean_Distance_Hydrology_Firepoints'] = (df_test['Horizontal_Distance_To_Hydrology'] + df_test['Horizontal_Distance_To_Fire_Points'])/2
df_test['Mean_Distance_Hydrology_Roadways'] = (df_test['Horizontal_Distance_To_Hydrology'] + df_test['Horizontal_Distance_To_Roadways'])/2
df_test['Mean_Distance_Firepoints_Roadways'] = (df_test['Horizontal_Distance_To_Fire_Points'] + df_test['Horizontal_Distance_To_Roadways'])/2
#df_test['Slope_hydrology_percent'] = df_test['Vertical_Distance_To_Hydrology']/df_test['Horizontal_Distance_To_Hydrology']
#df_test.Slope_hydrology_percent=df_test.Slope_hydrology_percent.map(lambda x: 0 if np.isinf(x) else x)

In [11]:
#Mean distance to Amenities 
df['Mean_Amenities']=(df.Horizontal_Distance_To_Fire_Points + df.Horizontal_Distance_To_Hydrology + df.Horizontal_Distance_To_Roadways) / 3
df_test['Mean_Amenities']=(df_test.Horizontal_Distance_To_Fire_Points + df_test.Horizontal_Distance_To_Hydrology + df_test.Horizontal_Distance_To_Roadways) / 3

#Mean Distance to Fire and Water
df['Mean_Fire_Hyd']=(df.Horizontal_Distance_To_Fire_Points + df.Horizontal_Distance_To_Hydrology) / 2 
df_test['Mean_Fire_Hyd']=(df_test.Horizontal_Distance_To_Fire_Points + df_test.Horizontal_Distance_To_Hydrology) / 2

## Model Training

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop('Cover_Type', axis = 1)
y = df['Cover_Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
X_train = df.drop('Cover_Type', axis = 1)
y_train = df['Cover_Type']
X_test = df_test#.iloc[:, :-1]
#y_test = df_test.iloc[:, -1]

## Setting param grid

In [None]:
list(range(1,7,1))

In [14]:
params = {'n_estimators': [50, 100, 300, 500, 1000], 'min_samples_split': list(range(2,8,1)),
          'min_samples_leaf': list(range(2,8,1)),
          'max_features' : ['auto', 'sqrt', 'log2', None]  }

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

In [16]:
clf = ExtraTreesClassifier()
random_cv = RandomizedSearchCV(estimator=clf,
                               param_distributions=params,
                               cv=4, n_iter=20, 
                               scoring = 'accuracy',
                               n_jobs = -1, verbose = 1, 
                               return_train_score = True, 
                               random_state=42)

In [17]:
random_cv.fit(X_train, y_train)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


RandomizedSearchCV(cv=4, estimator=ExtraTreesClassifier(), n_iter=20, n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7],
                                        'n_estimators': [50, 100, 300, 500,
                                                         1000]},
                   random_state=42, return_train_score=True, scoring='accuracy',
                   verbose=1)

In [18]:
random_cv.best_estimator_

ExtraTreesClassifier(max_features='sqrt', min_samples_leaf=2,
                     min_samples_split=5, n_estimators=500)

In [19]:
clf = ExtraTreesClassifier(max_features='sqrt', min_samples_leaf=2,
                     min_samples_split=5, n_estimators=500)

clf.fit(X_train, y_train)

ExtraTreesClassifier(max_features='sqrt', min_samples_leaf=2,
                     min_samples_split=5, n_estimators=500)

In [20]:
preds = clf.predict(X_test)

In [None]:
preds

In [None]:
X_test

In [None]:
accuracy

In [None]:
model = XGBClassifier()

In [None]:
model.fit(X_train, y_train)

In [None]:
pred = model.predict(X_test)

In [None]:
df_test

In [None]:
pred

In [23]:
print('Forest Train accuracy %s' % clf.score(X_train, y_train)) 

Forest Train accuracy 0.9944444444444445


In [21]:
X_test['Cover_Type'] = preds

In [22]:
test_sub = X_test[['Id', 'Cover_Type']]

In [25]:
test_sub.to_csv('test_extra_1.csv', index=False)

In [24]:
test_sub

Unnamed: 0,Id,Cover_Type
0,1,5
1,2,5
2,3,2
3,4,2
4,5,5
...,...,...
581007,581008,3
581008,581009,3
581009,581010,3
581010,581011,3


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, preds)