In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft

In [None]:
# Read the forest-cover-type training CSV into `train` and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='../input/forest-cover-type-prediction/train.csv')
train.head(n=5)

In [None]:
# Drop the ID column by position: keep every column of `train` from index 1 onward.
# NOTE(review): this assumes the ID is the first column of train.csv, and it leaves
# every other column (presumably including the Cover_Type target) in X_train -- confirm.
X_train = train.iloc[:,1:]

## Feature Engineering

#### Automated Feature Engineering

In [None]:
# Create an empty featuretools EntitySet identified as 'Forest_Cover'.
es = ft.EntitySet(id='Forest_Cover')

In [None]:
# Register the first ten columns of X_train in the entity set under the name
# 'X_train'. make_index=True asks featuretools to create the 'index' column
# rather than expecting it to exist in the data already.
es.add_dataframe(
    dataframe_name='X_train',
    dataframe=X_train.iloc[:, :10],
    index='index',
    make_index=True,
)

In [None]:
# Run deep feature synthesis (max depth 1) with the add / multiply / subtract
# numeric transform primitives.
# NOTE: this rebinds X_train to the generated feature matrix.
X_train, features = ft.dfs(
    entityset=es,
    target_dataframe_name='X_train',
    trans_primitives=['add_numeric', 'multiply_numeric', 'subtract_numeric'],
    max_depth=1,
)
X_train.head()

The automatically generated features above did not yield better results, so we exclude them from our new feature set.

#### Manual Feature Engineering

In [None]:
# Rebuild X_train from the raw data (every column of `train` except the first),
# replacing the featuretools output that was bound to X_train above.
# .copy() makes X_train an independent frame rather than a slice of `train`, so
# assigning new feature columns to it later does not raise pandas'
# SettingWithCopyWarning.
X_train = train.iloc[:, 1:].copy()

In [None]:
from IPython.display import Image

def plota(a1, a2, labels=None):
    """Scatter-plot two features against each other, colored by cover type.

    Parameters
    ----------
    a1, a2 : pandas.Series
        Values for the x axis and y axis respectively. Each Series' ``name``
        is used as the axis label. A Series built from an arithmetic
        expression (e.g. ``s1 - 0.2 * s2``) can have ``name`` set to None; in
        that case a generic label is used so the axis is never left blank or
        labelled "None".
    labels : array-like, optional
        One value per point, used to color the markers. Defaults to the
        notebook-level ``train.Cover_Type``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    if labels is None:
        labels = train.Cover_Type

    fig, ax = plt.subplots(figsize=(16, 8))
    points = ax.scatter(a1, a2, c=np.asarray(labels), s=100)

    # getattr keeps the function usable with plain arrays, which have no .name.
    x_name = getattr(a1, 'name', None)
    y_name = getattr(a2, 'name', None)
    ax.set_xlabel(x_name if x_name is not None else 'derived feature (x)')
    ax.set_ylabel(y_name if y_name is not None else 'derived feature (y)')

    # Show which color corresponds to which label value.
    fig.colorbar(points, ax=ax, label='Cover_Type')

In [None]:
# Baseline view: raw elevation against horizontal distance to hydrology.
plota(train['Elevation'], train['Horizontal_Distance_To_Hydrology'])

In [None]:
# Candidate feature: elevation minus 0.2 x horizontal distance to hydrology.
plota(
    train['Elevation'] - 0.2 * train['Horizontal_Distance_To_Hydrology'],
    train['Horizontal_Distance_To_Hydrology'],
)

In [None]:
# Baseline view: raw elevation against horizontal distance to roadways.
plota(train['Elevation'], train['Horizontal_Distance_To_Roadways'])

In [None]:
# Candidate feature: elevation minus 0.05 x horizontal distance to roadways.
plota(
    train['Elevation'] - 0.05 * train['Horizontal_Distance_To_Roadways'],
    train['Horizontal_Distance_To_Roadways'],
)

In [None]:
# Baseline view: raw elevation against vertical distance to hydrology.
plota(train['Elevation'], train['Vertical_Distance_To_Hydrology'])

In [None]:
# Candidate feature: elevation minus vertical distance to hydrology.
plota(
    train['Elevation'] - train['Vertical_Distance_To_Hydrology'],
    train['Vertical_Distance_To_Hydrology'],
)

- In the plots above, the different colors of the data points represent specific cover types.
- We can see how the new features have shifted the plots.
- Because we are using tree-based models, we want to create new features that help the model group values based on certain criteria for each cover type.
- The new features change the distribution of cover types in the plots in a way that can hopefully improve the accuracy of our tree-based models.

In [None]:
# Elevation adjusted by each distance measure. The 0.2 and 0.05 weights are the
# same ones used in the exploratory scatter plots above.
X_train['Elev_to_Horizontal_Hyd'] = (
    X_train['Elevation'] - 0.2 * X_train['Horizontal_Distance_To_Hydrology']
)
X_train['Elev_to_Horizontal_Road'] = (
    X_train['Elevation'] - 0.05 * X_train['Horizontal_Distance_To_Roadways']
)
X_train['Elev_to_Verticle_Hyd'] = (
    X_train['Elevation'] - X_train['Vertical_Distance_To_Hydrology']
)

# Arithmetic means of the horizontal-distance columns:
# all three distances, then fire points and hydrology only.
X_train['Mean_Horizontal_Dist'] = (
    X_train['Horizontal_Distance_To_Fire_Points']
    + X_train['Horizontal_Distance_To_Hydrology']
    + X_train['Horizontal_Distance_To_Roadways']
) / 3
X_train['Mean_Fire_Hydro'] = (
    X_train['Horizontal_Distance_To_Fire_Points']
    + X_train['Horizontal_Distance_To_Hydrology']
) / 2

In [None]:
# Preview the first five rows of X_train, including the new feature columns.
X_train.head(n=5)