In [30]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Make sure we have a gpu to run this large amount of code on 

In [31]:
# do we have a gpu to run on
tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Import Data

In [32]:
df = pd.read_csv('cover_data.csv')
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,class
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [33]:
# check the value counts and see if were dealing with 10 categorical values or thousands of continuous values 
df['class'].value_counts() # has 7 values represending the types of trees the data is looking at
df['Horizontal_Distance_To_Hydrology'].value_counts() # from the 5 rows above it looks like it might be a small range but it contains a large range of values
df['Slope'].value_counts()# from the 5 rows above it looks like it might be a small range but it contains a large range of values

Slope
11    33824
10    33812
12    33217
13    32419
9     32049
      ...  
65        2
58        1
64        1
63        1
66        1
Name: count, Length: 67, dtype: int64

In [34]:
# create a dictionary defineing the type of tree with the number assosicated with it. 
class_dict = {'spruce': 1, 'lodgepole_pine' : 2, 'ponderosa_pine': 3, 'cottonwood_willow': 4, 'aspen': 5, 'douglas_fir': 6, 'krummholz': 7}

In [35]:
# are their any columns that contain NaN values or are their any Dtypes that need to be changed? 
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   Elevation                           581012 non-null  int64
 1   Aspect                              581012 non-null  int64
 2   Slope                               581012 non-null  int64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  int64
 4   Vertical_Distance_To_Hydrology      581012 non-null  int64
 5   Horizontal_Distance_To_Roadways     581012 non-null  int64
 6   Hillshade_9am                       581012 non-null  int64
 7   Hillshade_Noon                      581012 non-null  int64
 8   Hillshade_3pm                       581012 non-null  int64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  int64
 10  Wilderness_Area1                    581012 non-null  int64
 11  Wilderness_Area2                    581012 non-null 

# Data Cleaning
## What do we want to remove? 
We dont have NaN values in any rows and the dtypes look appropriate.  
## Are their any biases in any columns of data? 
-This seems to be a hard question to answer in this dataset.   
--Tree come from very specific climates and soil types so they can survive.   
--One could say the soil types are biased to specific tree types or even a combination of some soil types.   
--Soil types are apart of the tree in my opinion and need to stay. 

-It also seems that Elevation could be bias for the same reason soil could be.  
--If trees grow in a certain elevation is that not concern for bias?  

My point is that looking at the data almost every peice of data in some way point directly to a location.  
But the idea of a bias is that if you get new information from somewhere else that contained a different soil type   
in a differnt elevation and in a different wilderness area you would end up the wrong tree type because the model cant generalize well on biased data. 

Its hard to say that we would end up with a different soil type and different elevation when looking at what trees need to grow properly.   
For example - Trees need specific tempurture ranges, specific watering amounts, specific soil acidity or alkalinity, and nutrients that the trees roots are able to access in that soil type. 

In [36]:
# change the number assosiated with df.class to a range from 0 - 6
new_class = []
for i in df['class']:
    new_labels = i - 1
    new_class.append(new_labels)


In [37]:
# add the new range of number to the class column
df['class'] = new_class
# check and make sure the range of numbers is changed
print(df['class'].value_counts())
# what is the distribution of yes and no in wilderness areas
print(df['Wilderness_Area1'].value_counts())
print(df['Wilderness_Area2'].value_counts())

class
1    283301
0    211840
2     35754
6     20510
5     17367
4      9493
3      2747
Name: count, dtype: int64
Wilderness_Area1
0    320216
1    260796
Name: count, dtype: int64
Wilderness_Area2
0    551128
1     29884
Name: count, dtype: int64


# Seperate Features And Labels 

In [38]:
# condenses the amount of data to make running models quicker
# could cause inbalanced data in the training data - only use if your computer is super slow
# random_seed = 42
# feature_sample = df.sample(n=100000, random_state = random_seed) ## uncomment if you need this code

# drop the wilderness and soil type columns because they dont need to be normalized. - we will bring these columns back
drop_columns = df.columns[10:54]

# create features
features = df.drop(['class'], axis=1)
features = features.drop(drop_columns, axis=1)
feature_names = [i for i in features]

# confirm we have the righe features
print(feature_names)

# create labels 
labels = df['class']

['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']


# Split the Data

In [39]:
from sklearn.model_selection import train_test_split
feature_train, feature_test, label_train, label_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

# Normalize the Data 

### We need to normalize the data so that every value is between 0 - 1. 
### This is how the Keras Model interprets information the easiest. 

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.utils.class_weight import compute_class_weight

# Normalize the features from both the train and test sets 
ct = ColumnTransformer([('normalize', Normalizer(), feature_names)], remainder='passthrough')
feature_train = ct.fit_transform(feature_train)
feature_test = ct.transform(feature_test)
print(feature_train)

[[0.59815361 0.03376521 0.00377265 ... 0.04678086 0.02772898 0.60249216]
 [0.65775272 0.04353564 0.00557599 ... 0.05404424 0.03774518 0.16985333]
 [0.63158638 0.06256104 0.000747   ... 0.04369935 0.0302534  0.42186684]
 ...
 [0.56541143 0.01403631 0.00125967 ... 0.04102922 0.02375376 0.62389606]
 [0.56287816 0.05327365 0.00157718 ... 0.04135717 0.03136837 0.07360175]
 [0.54570083 0.00912882 0.00101431 ... 0.03820582 0.02316017 0.51560952]]


# Model Creation

## We are using a TensorFlow Keras Sequential Model. 


In [41]:
# make a path to a file to hold the best model as it goes through the iterations(epochs)
checkpoint_path = 'dlsp-portfolio-starter-code/model-9-'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# model = tf.keras.models.load_model('dlsp-portfolio-starter-code/model-9-') # uncomment this if you want to load an existing model and weights

# build the model using relu(most common) activation with a high neuron count due to the size and complexity of the dataset. 
model = Sequential()
model.add(layers.Dense(512, activation='relu', input_shape= feature_train.shape[1:]))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(7, activation='softmax'))

# compile the the optimizer, loss and metrics for the model
model.compile(optimizer=tf.keras.optimizers.Adamax(learning_rate=.0068507), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# check the amount of parameters we are working with 
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 512)               5632      
                                                                 
 dense_11 (Dense)            (None, 256)               131328    
                                                                 
 dense_12 (Dense)            (None, 128)               32896     
                                                                 
 dense_13 (Dense)            (None, 64)                8256      
                                                                 
 dense_14 (Dense)            (None, 7)                 455       
                                                                 
Total params: 178,567
Trainable params: 178,567
Non-trainable params: 0
_________________________________________________________________


# Fit the Model

This is a lot of data to look at. It's over 500,000 rows of data with 53 rows 500,000 X 53 is a lot of variables to look at.  
That being said its going to take time to run this code depending on your machine/GPU.  
I am running a quatro p1000 and each epoch at a batch size of 57 takes about 30 seconds or more.  

The smaller the batch size the better a model can generalize to some extent. Also the smaller the batch size the longer the model will take per epoch.  
Lower batch sizes around 30 takes about a minute or more per epoch. The opposite is also true. A batch size of 100 will take about 15 sec. but will under perform and overfit the model.  



In [25]:
model.fit(feature_train, label_train, epochs=10, batch_size=57,  validation_data=(feature_test, label_test), callbacks=[model_checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21e5182b9a0>

Running this model for 300 epochs I got a validation accuracy of 94%. This is a great number to get considering some imbalance in the dataset.   
We could get better scores and predictions if we had a more balanced dataset. The next cell below shows that imbalance.

# Test Prediction 

## Shows the accruacy score of each categoical variable.

In [27]:
# load and test the saved model
model = tf.keras.models.load_model('dlsp-portfolio-starter-code/model-9-') # uncomment if you want to load and predict the saved checkpoint model.
names = ['spruce','lodgepole_pine', 'ponderosa_pine', 'cottonwood_willow', 'aspen', 'douglas_fir', 'krummholz']
y_estimate = model.predict(feature_test)
y_estimate = np.argmax(y_estimate, axis=1)
y_true = label_test
print(classification_report(y_true, y_estimate, target_names= names))

                   precision    recall  f1-score   support

           spruce       0.92      0.95      0.94     42368
   lodgepole_pine       0.95      0.93      0.94     56661
   ponderosa_pine       0.94      0.92      0.93      7151
cottonwood_willow       0.83      0.84      0.83       549
            aspen       0.84      0.83      0.83      1899
      douglas_fir       0.89      0.86      0.88      3473
        krummholz       0.94      0.95      0.94      4102

         accuracy                           0.94    116203
        macro avg       0.90      0.90      0.90    116203
     weighted avg       0.94      0.94      0.94    116203



Looking at the support column we can see that cottonwood_willow has a very very small amount of data to train on compared to spruce or lodgepole_pine.  
If we could collect more data on the smaller datasets we could get more accurate predictions in future models.  
That being said their could be geographical issues collecting that data. Those trees might just be in decline and we just dont have trees to collect data on.  
It's possible changes in climate or deforestation might be a cause of that. 

# Save Model

In [1]:
# run code if you want to save the final model.
model.save('Downloads/dlsp-portfolio-starter-code/model-931')

NameError: name 'model' is not defined

# Random Search Cross Validation 
## Looks for the best parameters within the listed parameter Min/Max 

In [37]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import uniform, randint


x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


# Function to create a Keras model
def create_model(learning_rate=0.001, batch_size=64, activation='relu', optimizer='adam'):
    model = Sequential()
    model.add(Dense(512, activation=activation, input_shape=(x_train.shape[1:])))
    model.add(layers.Dense(256, activation=activation))
    model.add(layers.Dense(128, activation=activation))
    model.add(layers.Dense(64, activation=activation))
    model.add(Dense(7, activation='softmax'))
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Wrap the Keras model in a scikit-learn classifier
keras_model = KerasClassifier(build_fn=create_model, epochs=10, verbose=1, validation_data=(x_test, y_test))

# Specify hyperparameter space for RandomizedSearchCV
param_dist = {
    'learning_rate': uniform(0.001, 0.006),
    'batch_size': randint(32, 64),
    'activation': ['relu'],
    'optimizer' : ['adamax']
    
}

# Define scoring function (you can use any metric suitable for your problem)
scorer = make_scorer(accuracy_score)

# Generate synthetic data (replace with your actual data)


# Create RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=keras_model,
    param_distributions=param_dist,
    scoring='accuracy',
    cv=3,
    n_iter=5,
    verbose=1
)

# Fit the RandomizedSearchCV
random_search.fit(x_train, y_train)

# Print the best parameters found
print("Best Parameters: ", random_search.best_params_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Epoch 1/10


  keras_model = KerasClassifier(build_fn=create_model, epochs=10, verbose=1, validation_data=(x_test, y_test))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
E

# Best Model

In [44]:
# show the best score we got from the models we ran above
print(random_search.best_score_)
# what versions of model did the randomsearch run
print(random_search.cv_results_)

0.8106426783605515
{'mean_fit_time': array([249.8179125 , 327.54888813, 265.5462025 , 448.97699745,
       526.72977042]), 'std_fit_time': array([  0.96093573,   3.46053334,   6.85088201, 157.69357347,
         2.43115394]), 'mean_score_time': array([ 8.04122535,  8.06960646,  8.45916986, 19.24322987, 10.71003962]), 'std_score_time': array([ 0.03160856,  0.05284952,  0.26226119, 12.62113346,  0.20072238]), 'param_activation': masked_array(data=['relu', 'relu', 'relu', 'relu', 'relu'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_batch_size': masked_array(data=[59, 44, 57, 60, 42],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate': masked_array(data=[0.0027053511266219815, 0.005944306073906301,
                   0.006850703871388203, 0.005978753744738821,
                   0.004438415190709082],
             mask=[False, False, False, False, F

# Outcome
The Random Search is picking the most consistantly progressive model based on accuracy. Even though the last model ended up at a better validation accuracy it lost accruacy more times than the 3rd model.   
It had a mean test score of .81 where the last model had a mean test score of .806
To be fair this is only running on 10 epochs so it is possible with more epochs the last model might out perform the 3rd model. 
It is also worth considering that the randomsearch only ran 15 models so their could be a more optimal model in the mix of all those models.  
Running random search a few more times might get me a 95% or higher score if I had days to run a large number of random searches. 

# Base line Model

In [46]:
from sklearn.metrics import make_scorer, accuracy_score
majority_class = np.argmax(np.bincount(label_train))
baseline_predictions = np.full_like(label_test, fill_value=majority_class)
baseline_accuracy = accuracy_score(label_test, baseline_predictions)

print(f'Baseline Accuracy: {baseline_accuracy:.4f}')

Baseline Accuracy: 0.4876


This shows our 94% accuracy is way better than the baseline model which only give an accuracy of 48% trying to guess the majority class. 

# Conclusion

In this project, a robust neural network model was developed using TensorFlow's Keras Sequential architecture to predict tree species from a diverse set of environmental variables. The model demonstrated exceptional performance, achieving precision, recall, and F1-score values above 0.92 for key species like spruce and lodgepole pine. With an overall accuracy of 94%, the model showcased strong generalization capabilities across the dataset. While slightly lower performance was observed for certain species like cottonwood willow and aspen, the findings highlight the model's nuanced understanding of environmental features' relationships with tree species. This work not only contributes to the effective application of machine learning in environmental science but also suggests avenues for refinement and extension, positioning the model as a valuable tool for ecological studies and biodiversity conservation.