# Codecademy portfolio project - Forest Cover Classification

In this notebook I present my solution to the final portfolio project of Codecademy's introductory course on deep learning with TensorFlow. The data, provided by Codecademy, describe geological aspects of the areas studied (obtained from the US Geological Survey and the US Forest Service (USFS)); each observation is a 30x30 meter cell of forest. Labels were determined from the USFS Region 2 Resource Information System data.

Cover types:
- Spruce/Fir
- Lodgepole Pine
- Ponderosa Pine
- Cottonwood/Willow
- Aspen
- Douglas-fir
- Krummholz

581012 observations were provided with 54 features. 

### Import Packages

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

### Import Data

In [6]:
# Read the forest-cover dataset from a CSV file in the working directory.
data = pd.read_csv('cover_data.csv')

### EDA

In [None]:
# (number of rows, number of columns) of the loaded dataset
data.shape

In [None]:
# preview the first rows of the dataset
data.head()

In [None]:
# per-column dtypes and non-null counts
data.info()

In [49]:
# Relative frequency of each cover type; normalize=True yields fractions
# (summing to 1) rather than raw counts.
class_balance = data['class'].value_counts(normalize=True)

In [None]:
# display the per-class proportions
class_balance

In [None]:
# Bar chart of how much of the dataset each cover type makes up.
# `class_balance` holds fractions (value_counts(normalize=True)), so the values
# are scaled by 100 to agree with the "Percentage" axis label.
plt.bar(x=class_balance.index, height=class_balance.values * 100, color='teal', edgecolor='black')
plt.title('Class Imbalance', size=18)
plt.xlabel('Classes')
plt.ylabel('Percentage of all classes in the Dataset')
# no extension given: matplotlib falls back to its default output format
plt.savefig('class_imbalance')

In [None]:
# Pairplot of the first ten columns, coloured by cover type.
# Only a random subsample of 1000 rows is drawn to keep the plot readable and
# fast; the sample is seeded so the saved figure is reproducible across runs.
sns.set_style('whitegrid')
pairplot_cols = list(data.columns[:10]) + ['class']
pairplot_sample = data[pairplot_cols].sample(1000, random_state=23)
sns_pairplot_1 = sns.pairplot(pairplot_sample, hue='class', palette='colorblind', plot_kws={'alpha': .5})
sns_pairplot_1.savefig('pairplot_1.png')

In [None]:
# One count plot per wilderness-area indicator, split by forest cover type.
# Each plot is drawn on its own figure and written to its own PNG file.
wilderness_subset = data[list(data.columns[10:14]) + ['class']]
for area_no in range(1, 5):
    countplot_wilderness = plt.figure()
    sns.countplot(
        data=wilderness_subset,
        x=f'Wilderness_Area{area_no}',
        hue='class',
        palette='colorblind',
    )
    countplot_wilderness.savefig(f'Wilderness_Area_countplot_{area_no}.png')

In [11]:
# Collect the per-class mean of every wilderness-area indicator column.
# The groupby aggregation is computed once, and only for the four columns
# needed, instead of re-aggregating the whole frame on every loop iteration.
wilderness_cols = [f'Wilderness_Area{i+1}' for i in range(4)]
wilderness_class_means = data.groupby('class')[wilderness_cols].mean()

# one Series per wilderness area (indexed by class), in column order
wilderness_class_tables = [wilderness_class_means[col] for col in wilderness_cols]

# rows: wilderness areas, columns: cover-type classes
wilderness_class_df = pd.DataFrame(wilderness_class_tables)

In [None]:
# Annotated heatmap: wilderness areas (rows) against cover-type classes (columns).
wilderness_heatmap = plt.figure()
g = sns.heatmap(data=wilderness_class_df, annot=True, cmap='viridis')

# move the class ticks and the axis label above the heatmap
x_axis = g.axes.xaxis
x_axis.set_ticks_position('top')
x_axis.set_label_position('top')

g.set_title('Proportion of Wilderness Area Types per Class', size=18)

wilderness_heatmap.savefig('wilderness_heatmap.png', bbox_inches='tight')

In [None]:
# Collect the per-class mean of every soil-type indicator column.
# The groupby aggregation is computed once, and only for the 40 soil columns,
# instead of re-aggregating the whole frame on each of the 40 loop iterations.
soiltype_cols = [f'Soil_Type{i+1}' for i in range(40)]
soiltype_class_means = data.groupby('class')[soiltype_cols].mean()

# one Series per soil type (indexed by class), in column order
soiltype_class_tables = [soiltype_class_means[col] for col in soiltype_cols]

# rows: soil types, columns: cover-type classes
soiltype_class_df = pd.DataFrame(soiltype_class_tables)

# creating a heatmap of soil type per forest cover type
soiltype_heatmap = plt.figure(figsize=(10,10))
g = sns.heatmap(soiltype_class_df, cmap='viridis', annot=True)
# move the class ticks and the axis label above the heatmap
g.axes.xaxis.set_ticks_position("top")
g.xaxis.set_label_position('top') 
g.set_title('Proportion of Soil Types per Class', size=18)
soiltype_heatmap.savefig('soiltype_heatmap.png', bbox_inches='tight')

### Preprocessing

Since categorical variables were already dummified, the only preprocessing steps necessary are feature scaling and label encoding.

In [14]:
# separate predictors from the target
# The target is removed by name rather than by position, so the feature matrix
# can never contain 'class' (or lose a real feature) if the column order changes.
y = data['class']
X = data.drop(columns='class')

In [None]:
# Hold out 20% of the observations as a test set.
# stratify=y keeps the class proportions the same in both partitions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)
X_train.shape, X_test.shape

In [68]:
# scaling the features
# The scaler is fitted on the training set ONLY. The test set is transformed
# with the training statistics (mean / std), so no information from the test
# data leaks into preprocessing and both sets live on the same scale.
scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [69]:
# Encode the class labels as consecutive integers (encoder fitted on the
# training labels, then re-used for the test labels) and expand them to
# one-hot vectors for the categorical cross-entropy loss.
le = LabelEncoder()

y_train = to_categorical(le.fit_transform(y_train))
y_test = to_categorical(le.transform(y_test))

### Modelling

Since we are dealing with tabular data, a feed-forward model with Dense hidden layers was chosen.

In [None]:
# Feed-forward classifier: an input layer sized to the number of features,
# a funnel of ReLU Dense layers (with one Dropout layer after the second),
# and a 7-unit softmax output (one unit per cover type).
n_features = X_train.shape[1]

model = tf.keras.models.Sequential([
    # input layer
    layers.InputLayer(input_shape=(n_features,)),
    # hidden layers
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(14, activation='relu'),
    # output layer
    layers.Dense(7, activation='softmax'),
])

# categorical cross-entropy matches the one-hot encoded targets
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.005),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.AUC(),
    ],
)

model.summary()

In [None]:
# Train for 50 epochs in mini-batches of 512; 30% of the training data is
# held back by Keras as a validation set. The returned History object is kept
# for plotting the loss curves.
history = model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=50,
    validation_split=0.3,
)

In [None]:
# Persist the trained model into the current working directory.
model.save(filepath='./')

### Model Evaluation

In [82]:
# load the model
# The model was saved with model.save('./'), i.e. into the current working
# directory, so it has to be loaded from that same location (not from the
# parent directory).
model = tf.keras.models.load_model('./')

In [None]:
# Training vs. validation loss per epoch, taken from the History object
# returned by model.fit.
fig, ax1 = plt.subplots(figsize=(10,5))

# curves are drawn in the same order as the legend entries below
for curve_key in ('loss', 'val_loss'):
    ax1.plot(history.history[curve_key])

ax1.set_title('model loss')
ax1.set_ylabel('loss')
ax1.set_xlabel('epoch')
# loc=[1,0] anchors the legend at the lower-right corner, outside the axes
ax1.legend(['train', 'validation'], loc=[1,0])

fig.tight_layout()
fig.savefig('loss_curve.png')

plt.show()

In [None]:
# computing loss, accuracy and auc for the test set
loss, acc, auc = model.evaluate(X_test, y_test)

# Turn the predicted class probabilities and the one-hot targets back into
# integer class indices so they can be compared label by label.
y_estimate = model.predict(X_test).argmax(axis=1)
y_true = y_test.argmax(axis=1)

# per-class precision / recall / f1
report = classification_report(y_true, y_estimate)
print(report)

In [None]:
# Confusion matrix of the test-set predictions, normalised over the true
# labels so every row sums to 1.
sns.set_style('white')
fig3, ax1 = plt.subplots(figsize=(10,10))
ax1.set_title('Confusion Matrix', size=18)
ConfusionMatrixDisplay.from_predictions(
    y_true,
    y_estimate,
    normalize='true',
    values_format='.2f',
    ax=ax1,
)
fig3.savefig('conf_matrix_plot.png')
plt.show()