#### In this notebook I excluded the EDA section, directly showed the feature engineering process, and used pretrained models from another notebook (Forest Cover Type Prediction)

In [31]:
import time
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import  accuracy_score, f1_score, precision_score,confusion_matrix, recall_score, roc_auc_score
from scipy.stats import uniform

from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
import joblib

In [32]:
# Single place to point the notebook at the folder containing train.csv / test.csv.
DATA_DIR = "~/Desktop"

# Raw competition frames; later cells derive train1 / test1 from these.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [33]:
# Report (rows, columns) for the raw train frame, then the raw test frame.
for frame in (train, test):
    print(frame.shape)

(15120, 56)
(565892, 55)


### Checking for Anomalies & Outliers

I am using the extreme-outlier rule so that as many rows as possible are kept. This is a standard and widely used technique for outlier detection. A data point is dropped if it satisfies either of the following conditions:

- x < Q1 - 3 * IQR
- x > Q3 + 3 * IQR

In [34]:
def outlier_function(df, col_name, iqr_multiplier=3):
    """Compute IQR-based outlier fences for one column and count the outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    col_name : str
        Name of the numeric column.
    iqr_multiplier : float, default 3
        How many IQRs beyond the quartiles a value must lie to count as an
        outlier (3 gives the "extreme outlier" rule, 1.5 the usual one).

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit, outlier_count)`` where the limits are
        ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR`` and
        ``outlier_count`` is the number of values strictly outside them.
    """
    values = np.asarray(df[col_name])

    # Both quartiles come from a single percentile call over the same array.
    first_quartile, third_quartile = np.percentile(values, [25, 75])
    IQR = third_quartile - first_quartile

    upper_limit = third_quartile + (iqr_multiplier * IQR)
    lower_limit = first_quartile - (iqr_multiplier * IQR)

    # Vectorised count of values strictly below/above the fences.
    outside_fences = (values < lower_limit) | (values > upper_limit)
    outlier_count = int(np.count_nonzero(outside_fences))

    return lower_limit, upper_limit, outlier_count

In [35]:
# (rows, columns) of the raw training frame before any outlier rows are removed.
train.shape

(15120, 56)

In [36]:
# Compute the extreme-outlier fences once instead of re-scanning the column per bound.
fire_lower, fire_upper = outlier_function(train, 'Horizontal_Distance_To_Fire_Points')[:2]
fire_distance = train['Horizontal_Distance_To_Fire_Points']

# Keep rows strictly inside the fences. The .copy() makes train1 an independent
# frame, so the feature columns added later are not written to a slice of `train`.
train1 = train[(fire_distance > fire_lower) & (fire_distance < fire_upper)].copy()
train1.shape

(14988, 56)

### Feature Engineering

In [37]:
# Work on a copy so the in-place feature engineering below does not rewrite the raw `test` frame.
test1 = test.copy()

In [38]:
# Straight-line distance to hydrology from its vertical and horizontal components,
# added to the train frame first and then the test frame.
for frame in (train1, test1):
    vertical_sq = frame["Vertical_Distance_To_Hydrology"]**2
    horizontal_sq = frame["Horizontal_Distance_To_Hydrology"]**2
    frame["net_hyd_distance"] = np.sqrt(vertical_sq + horizontal_sq)

The resulting net_hyd_distance is a measure of the total distance to hydrology, considering both the horizontal and vertical distance.

In [39]:
# Average of the three horizontal distances (hydrology, roadways, fire points),
# added to the train frame first and then the test frame.
for frame in (train1, test1):
    horizontal_total = (
        frame["Horizontal_Distance_To_Hydrology"]
        + frame["Horizontal_Distance_To_Roadways"]
        + frame["Horizontal_Distance_To_Fire_Points"]
    )
    frame["mean_distance_horizontal"] = horizontal_total/3

By calculating the mean distance to these amenities, the new feature "mean_distance_horizontal" gives an overall measure of how close each data point is, on average, to these important environmental features.

In [40]:
# Square-root transform of the horizontal hydrology distance, intended to bring
# its distribution closer to normal; applied to train and then test.
for frame in (train1, test1):
    hydrology_distance = frame["Horizontal_Distance_To_Hydrology"]
    frame["sqrtHorizontal_Distance_To_Hydrology"] = np.sqrt(hydrology_distance)


In [41]:
# Interaction feature: elevation multiplied by horizontal distance to roadways,
# added to the train frame first and then the test frame.
for frame in (train1, test1):
    elevation = frame["Elevation"]
    road_distance = frame["Horizontal_Distance_To_Roadways"]
    frame["Elevation_m_HR"] = elevation * road_distance


In [42]:
# Helper functions for converting one-hot encoded columns into a single numeric column

def split_numbers_chars(row):
    '''Split a string into its leading part and its trailing run of digits.

    Returns a ``(head, tail)`` tuple: ``tail`` is the longest suffix made only
    of the characters 0-9 (possibly empty) and ``head`` is everything before it.
    '''
    cut = len(row)
    # Walk backwards from the end while the preceding character is a digit.
    while cut > 0 and row[cut - 1] in '0123456789':
        cut -= 1
    return row[:cut], row[cut:]

def reverse_one_hot_encode(dataframe, start_loc, end_loc, numeric_column_name):
    '''Collapse a positional range of one-hot columns into one integer column.

    For every row, the first column in ``dataframe.iloc[:, start_loc:end_loc]``
    whose value equals 1 is located, the digits at the end of that column's
    name are parsed, and the result is stored in ``numeric_column_name`` as
    int64 (e.g. a 1 in ``Soil_Type12`` becomes 12).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the one-hot columns; modified in place.
    start_loc, end_loc : int
        Positional (iloc-style, end-exclusive) bounds of the one-hot columns.
    numeric_column_name : str
        Name of the column to create or overwrite.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a selected column name does not end in digits.

    Notes
    -----
    A row with no 1 anywhere in the range resolves to the first column of the
    range, because that is what ``idxmax`` returns for an all-False row.
    No scratch columns are written to ``dataframe``, so existing columns
    (whatever they are called) are left untouched.
    '''
    one_hot_block = dataframe.iloc[:, start_loc:end_loc]
    active_label = (one_hot_block == 1).idxmax(axis=1)

    # Parse each distinct column label once rather than once per row.
    number_by_label = {}
    for label in active_label.unique():
        digits = label[len(label.rstrip('0123456789')):]
        number_by_label[label] = int(digits)

    dataframe[numeric_column_name] = active_label.map(number_by_label).astype('int64')

In [43]:
# Check which columns sit at positions 15..54 before using those positions below.
train1.iloc[:,15:55].columns

Index(['Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40'],
      dtype='object')

In [44]:
# Collapse the one-hot ranges of the training frame into single numeric columns.
# Positions 15:55 are the Soil_Type flags; 11:15 are the four columns just before them.
reverse_one_hot_encode(
    dataframe=train1, start_loc=15, end_loc=55, numeric_column_name="Soil_Type"
)
reverse_one_hot_encode(
    dataframe=train1, start_loc=11, end_loc=15, numeric_column_name="Widerness_Area_Type"
)

In [45]:
# Same collapse for the test frame, using the same positional ranges and column names.
reverse_one_hot_encode(
    dataframe=test1, start_loc=15, end_loc=55, numeric_column_name="Soil_Type"
)
reverse_one_hot_encode(
    dataframe=test1, start_loc=11, end_loc=15, numeric_column_name="Widerness_Area_Type"
)

In [46]:
# Remove the original one-hot columns (positions 11..54) now that they are encoded numerically.
one_hot_train_columns = train1.columns[11:55]
train1 = train1.drop(columns=one_hot_train_columns)

In [47]:
# Remove the original one-hot columns (positions 11..54) from the test features.
# Rebinding instead of inplace=True means any other reference to the same frame
# keeps its columns.
test1 = test1.drop(columns=test1.columns[11:55])

In [48]:
# Columns removed before modelling:
#   - Horizontal_Distance_To_Hydrology: very strongly correlated with net_hyd_distance
#   - Hillshade_9am: strongly correlated with Hillshade_3pm
#   - Id: row identifier, not useful for prediction
redundant_columns = ['Horizontal_Distance_To_Hydrology', 'Hillshade_9am', 'Id']

train1 = train1.drop(columns=redundant_columns)
test1 = test1.drop(columns=redundant_columns)

for frame in (train1, test1):
    print(frame.shape)

(14988, 15)
(565892, 14)


### Train-Validation split

In [49]:
# Separate the feature matrix from the Cover_Type target (target as a NumPy array).
X = train1.drop(columns=["Cover_Type"])
y = np.array(train1["Cover_Type"])
print(X.shape, y.shape, sep="\n")

(14988, 14)
(14988,)


In [50]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

for part in (X_train, y_train, X_val, y_val):
    print(part.shape)

(11990, 14)
(11990,)
(2998, 14)
(2998,)


In [51]:
# Preview the validation features before they are standardised.
X_val.head()

Unnamed: 0,Elevation,Aspect,Slope,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,net_hyd_distance,mean_distance_horizontal,sqrtHorizontal_Distance_To_Hydrology,Elevation_m_HR,Soil_Type,Widerness_Area_Type
6099,2298,162,19,0,1110,241,128,1055,0.0,721.666667,0.0,2550780,3,4
10088,3101,342,8,4,309,229,163,2100,30.265492,813.0,5.477226,958209,22,2
12849,2329,349,8,50,443,226,160,579,188.743212,401.333333,13.490738,1031747,10,4
6625,2544,307,7,18,577,236,175,1180,242.668498,666.333333,15.556349,1467888,6,4
8990,3041,123,10,0,3376,233,126,2495,0.0,1957.0,0.0,10266416,23,3


In [52]:
# Column dtypes and non-null counts of the validation features before scaling.
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2998 entries, 6099 to 6658
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Elevation                             2998 non-null   int64  
 1   Aspect                                2998 non-null   int64  
 2   Slope                                 2998 non-null   int64  
 3   Vertical_Distance_To_Hydrology        2998 non-null   int64  
 4   Horizontal_Distance_To_Roadways       2998 non-null   int64  
 5   Hillshade_Noon                        2998 non-null   int64  
 6   Hillshade_3pm                         2998 non-null   int64  
 7   Horizontal_Distance_To_Fire_Points    2998 non-null   int64  
 8   net_hyd_distance                      2998 non-null   float64
 9   mean_distance_horizontal              2998 non-null   float64
 10  sqrtHorizontal_Distance_To_Hydrology  2998 non-null   float64
 11  Elevation_m_HR

In [53]:
# Continuous features to standardise; the label-encoded Soil_Type and
# Widerness_Area_Type columns are left unscaled.
cols_for_scaler = [
    'Elevation',
    'Aspect',
    'Slope',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'net_hyd_distance',
    'mean_distance_horizontal',
    'sqrtHorizontal_Distance_To_Hydrology',
    'Elevation_m_HR',
]

# Learn the scaling statistics from the training split only, then standardise that split.
scaler = StandardScaler()
scaler.fit(X_train[cols_for_scaler])
X_train[cols_for_scaler] = scaler.transform(X_train[cols_for_scaler])

### saving the standard scaler

In [23]:
# Persist the fitted scaler next to the notebook so it can be reloaded for inference.
joblib.dump(value=scaler, filename="standard_scaler.pkl")

['standard_scaler.pkl']

In [54]:
# Reload the persisted scaler.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
sc = joblib.load(filename="standard_scaler.pkl")

In [55]:
# Transforming the validation features using the scaler fitted on the training data
X_val[cols_for_scaler] = sc.transform(X_val[cols_for_scaler])

In [56]:
# Preview the validation features after standardisation.
X_val.head()

Unnamed: 0,Elevation,Aspect,Slope,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,net_hyd_distance,mean_distance_horizontal,sqrtHorizontal_Distance_To_Hydrology,Elevation_m_HR,Soil_Type,Widerness_Area_Type
6099,-1.079695,0.055456,0.293635,-0.846355,-0.448118,0.972437,-0.149527,-0.409005,-1.107295,-0.596614,-1.797103,-0.561997,3,4
10088,0.846943,1.694471,-1.0138,-0.780849,-1.061633,0.444714,0.613471,0.627592,-0.965908,-0.463225,-1.051342,-0.931263,22,2
12849,-1.005316,1.75821,-1.0138,-0.027538,-0.958998,0.312783,0.548071,-0.881177,-0.225571,-1.06445,0.039752,-0.914212,10,4
6625,-0.489467,1.375773,-1.132658,-0.551581,-0.856362,0.752552,0.87507,-0.28501,0.026344,-0.677426,0.320999,-0.813085,6,4
8990,0.702985,-0.299664,-0.776085,-0.846355,1.287494,0.620622,-0.193127,1.019416,-1.107295,1.207548,-1.797103,1.227009,23,3


## Load the trained models

In [57]:
# Load the three persisted models from the working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only load model files from a trusted source.
top_classifier = joblib.load(filename="top_classifier_model_xtree.pkl")
second_best_classifier = joblib.load(filename="second_best_classifier_model_lgbm.pkl")
ensemble_classifier = joblib.load(filename="ensemble_classifier_model_5.pkl")

In [58]:
# Predict the validation split with the loaded ensemble and score it.
ensemble_pred = ensemble_classifier.predict(X_val)

# Fraction of validation rows whose predicted cover type matches the label.
ensemble_accuracy = accuracy_score(y_true=y_val, y_pred=ensemble_pred)

print("Ensemble Classifier_Top5:", ensemble_accuracy)

Ensemble Classifier_Top5: 0.8942628418945964


Model checked: the pretrained ensemble loads correctly and reaches about 0.894 accuracy on the validation split.

In [59]:
# Names and order of the feature columns that were passed to the ensemble above.
X_val.columns

Index(['Elevation', 'Aspect', 'Slope', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'net_hyd_distance',
       'mean_distance_horizontal', 'sqrtHorizontal_Distance_To_Hydrology',
       'Elevation_m_HR', 'Soil_Type', 'Widerness_Area_Type'],
      dtype='object')

In [60]:
# Column dtypes and non-null counts of the validation features after scaling.
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2998 entries, 6099 to 6658
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Elevation                             2998 non-null   float64
 1   Aspect                                2998 non-null   float64
 2   Slope                                 2998 non-null   float64
 3   Vertical_Distance_To_Hydrology        2998 non-null   float64
 4   Horizontal_Distance_To_Roadways       2998 non-null   float64
 5   Hillshade_Noon                        2998 non-null   float64
 6   Hillshade_3pm                         2998 non-null   float64
 7   Horizontal_Distance_To_Fire_Points    2998 non-null   float64
 8   net_hyd_distance                      2998 non-null   float64
 9   mean_distance_horizontal              2998 non-null   float64
 10  sqrtHorizontal_Distance_To_Hydrology  2998 non-null   float64
 11  Elevation_m_HR

In [64]:
# Cover_Type labels of the validation split.
y_val

array([4, 1, 3, ..., 5, 5, 5])