In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance

# Get the data
Read the data in as a dataframe using pandas. The data file is called "Africa_data.txt". The data include [bioclimatic variables](https://www.worldclim.org/data/bioclim.html) (the average and standard deviation across a species' range), the minimum and maximum latitude and longitude of the species range, and the area of the species range.

In [None]:
# The data file is tab-delimited, with column names in its first row.
Africa_data = pd.read_table("Africa_data.txt", header=0)
Africa_data.head()

# Change Red List Status Labelling and remove some rows

The code below modifies the response variable to match the categories used in the paper (e.g., the table lists Near Threatened species as LR/nt and vulnerable species as LR/cd, where cd stands for conservation dependent).

We also remove rows with no red list status defined, and rows where there were fewer than four GPS points.

In [None]:
# Relabel the legacy "Lower Risk" codes found in the table so they match the
# categories used in the paper. Each (old, new) pair is applied in turn;
# any status not listed here is left untouched.
legacy_to_paper_status = {
    "LR/nt": "NT",  # near threatened
    "LR/cd": "VU",  # conservation dependent, treated as vulnerable
    "LR/lc": "LC",  # least concern
}
for legacy_label, paper_label in legacy_to_paper_status.items():
    # Boolean mask of the rows that still carry the legacy label
    condition = Africa_data['Red.List.status'] == legacy_label
    Africa_data.loc[condition, 'Red.List.status'] = paper_label

# Keep only rows that have a Red List status and at least 4 GPS points.
# The explicit .copy() makes the result an independent frame, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
filtered_Africa_data = (
    Africa_data
    .dropna(subset=['Red.List.status'])
    .loc[lambda frame: frame['n.gps'] >= 4]
    .copy()
)


# Check how many observations belong in each category

Random Forest Classifiers may not perform well when we have different numbers of observations in each category.

In [None]:
# Tally the species in each Red List category and present the result as a
# two-column table (category label, number of species).
status_counts = (
    filtered_Africa_data['Red.List.status']
    .value_counts()
    .rename_axis('Red List Status')
    .reset_index(name='Count')
)

# Show the table
print(status_counts)

# Add variable indicating whether a species is LC or not LC

Notice that we have the most observations for least concern (LC). The authors of this paper decided to only try to predict whether a species was least concern or not. Below, we add a response column that only includes 'LC' and 'NonLC'.

In [None]:
# Binary response: 'LC' for least-concern species, 'NonLC' for every other status.
# .assign() returns a new frame instead of writing into the filtered one, which
# avoids pandas' SettingWithCopyWarning on a frame produced by row filtering.
filtered_Africa_data = filtered_Africa_data.assign(
    Response=np.where(filtered_Africa_data['Red.List.status'] == 'LC', 'LC', 'NonLC')
)

# Prepare dataset for training

Next, we prepare our dataset for training by removing columns that we won't use, separating out our labels and features, and splitting the data into training and testing sets.

In [None]:
# Columns that are neither features nor responses and are dropped before training
columns_to_remove = ['name', 'n.gps', 'continents', 'dist']

# Pull out the two label vectors first:
#   y      -> binary response ('LC' / 'NonLC')
#   y_full -> original Red List status
y = filtered_Africa_data['Response']
y_full = filtered_Africa_data['Red.List.status']

# Everything that remains after removing the unused columns and both label
# columns is the feature matrix.
X = filtered_Africa_data.drop(
    columns=columns_to_remove + ['Response', 'Red.List.status']
)

# Split the dataset into training (60%), testing (20%), and validation (20%) sets.
# NOTE: the validation split is the hold-out used for the final accuracy
#       measurement at the end of the notebook; do not use it for tuning.
# NOTE: DO NOT CHANGE THESE RANDOM_STATE VARIABLES
(
    X_train, X_test_val,
    y_train, y_test_val,
    y_full_train, y_full_test_val,
) = train_test_split(X, y, y_full, test_size=0.4, random_state=40)
(
    X_test, X_val,
    y_test, y_val,
    y_full_test, y_full_val,
) = train_test_split(X_test_val, y_test_val, y_full_test_val, test_size=0.5, random_state=40)

# Scale

Let's scale our features to put everything on the same scale (this can be important when evaluating feature importance.)

In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to every split (and the full set).
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
X_scaled = scaler.transform(X)

# Train the Random Forest Classifier, and check the out of bag score.

The out of bag score is the accuracy, not the error. To get the error, subtract this from one.

**Change**: Try changing the number of estimators (i.e., trees) and explore how this impacts accuracy.

**6990**: Write a for loop to explore different numbers of estimators, and choose the best number to use in your model.

In [None]:
# Fit the classifier.
# random_state is fixed so that the bootstrap samples and per-split feature
# subsets are the same on every run; without it the out-of-bag score and all
# downstream accuracies and importances change each time the cell is executed.
# oob_score=True makes the fitted model expose `oob_score_`.
randomforest = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=40)
randomforest.fit(X_train_scaled, y_train)

# Print the out-of-bag score (an accuracy; 1 - score gives the OOB error)
print(randomforest.oob_score_)

# Make Predictions on the training data and assess performance

In [None]:
# Predict the labels of the same samples the forest was trained on
y_pred_train = randomforest.predict(X_train_scaled)

# Fraction of training samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print(f'Accuracy: {accuracy:.2f}')

# Confusion matrix of true vs. predicted labels on the training split
ConfusionMatrixDisplay.from_estimator(estimator=randomforest, X=X_train_scaled, y=y_train)

# Make Predictions on the test data and assess performance

**Add**: Add code to make predictions and assess performance on the test data. Include accuracy and a confusion matrix.

# Impurity variable Importance

Below, we calculate feature importance using GINI impurity.

In [None]:
# Pair each impurity-based importance from the fitted forest with its feature
# name, rank from most to least important, and keep the top five.
feature_importances = (
    pd.Series(randomforest.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
    .head(5)
)

# Bar chart of the five highest-ranked features
feature_importances.plot(kind='bar');

# Permutation Variable Importance
We will now calculate Mean Decrease in Accuracy, our permutation measure of importance.

In [None]:
# Permutation importance measured on the test split: 50 repeats per feature,
# seeded so the result is repeatable.
permutation_feature_importances = permutation_importance(
    randomforest,
    X_test_scaled,
    y_test,
    n_repeats=50,
    random_state=0,
)

# Pair the mean importance of each feature with its name (taken from the test
# data columns), rank from most to least important, and keep the top five.
plot_permutation_feature_importances = (
    pd.Series(permutation_feature_importances.importances_mean, index=X_test.columns)
    .sort_values(ascending=False)
    .head(5)
)

# Bar chart of the five highest-ranked features
plot_permutation_feature_importances.plot(kind='bar');

# You try it!

Try building a Random Forest classifier that considers all the classes instead of collapsing them into LC or Non-LC.

I have already prepared the labels for you (y_full_train, y_full_test, y_full_val).

You will need to do the following:
1. Train the Random Forest Classifier, and check the out of bag score.
2. Make Predictions on the training data and assess performance.
3. Make Predictions on the testing data and assess performance.
4. Plot impurity variable importance.
5. Plot permutation variable importance.

# Final measure of accuracy

After you have decided which model you favor, measure the accuracy using the validation dataset.