In [None]:
#from sklearn import datasets
#import pandas as pd
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression
#from sklearn.svm import SVC
#from sklearn.metrics import accuracy_score, confusion_matrix
#import seaborn as sns
#import matplotlib.pyplot as plt
#import numpy as np
#from sklearn.preprocessing import StandardScaler

# Read in the Data

Read the ants data into a dataframe.

**Change**: Change 'filename' to the name of the file.

In [None]:
# Load the ants dataset from the CSV file.
dataframe = pd.read_csv('filename')

# Encode the species names as integers: zeteki -> 0, fovouros -> 1.
# The explicit astype(int) guarantees an integer target column instead of
# relying on pandas silently downcasting the object column after replace
# (deprecated behavior). It also fails right here, with a clear error, if the
# file contains a species name other than the two handled above.
dataframe['species'] = (
    dataframe['species']
    .replace({'zeteki': 0, 'fovouros': 1})
    .astype(int)
)

# Preview the first rows of the data.
dataframe.head()

# Plot the data, and choose variables

Now, we will plot the data. You need to choose two features that you want to use to predict species identity. To learn more about what the features are, see the caption of Figure 1 in the paper: https://peerj.com/articles/11622/#fig-1.

**Change**: Change the strings 'variable1' and 'variable2' to the variables you want to plot. We are just using variable names in this case, not direct pointers to the data.

In [None]:
# Choose your features, and set species as the target label.
label1 = 'variable1'
label2 = 'variable2'
labeltarget = 'species'

# Plot the two chosen features, colored by species (0 = red, 1 = yellow).
# label1 is drawn on the x-axis and label2 on the y-axis so that this plot has
# the same orientation as the decision-boundary plots later in the notebook.
custom_palette = {0: 'red', 1: 'yellow'}
sns.scatterplot(x=label1, y=label2, hue=labeltarget, data=dataframe, palette=custom_palette)

# Prepare the data

The code below prepares your data for the rest of the script. Try to understand what each step of the code is doing.

In [None]:
# Subset the dataframe to only include our features and target.
dataframe_subset = dataframe[[label1, label2, labeltarget]]

# Separate features (X) and target variable (y)
X = dataframe_subset.drop(labeltarget, axis=1)
y = dataframe_subset[labeltarget]
# Only the last expression of a cell is rendered automatically, so show the
# feature preview explicitly.
display(X.head())

# Split the dataset into training, testing, and validation sets.
# The first split keeps half of the rows for training; the second split divides
# the remaining half into testing (75% of it) and validation (25% of it),
# i.e. roughly 50% / 37.5% / 12.5% of the full dataset.
# NOTE: DO NOT CHANGE THESE RANDOM_STATE VARIABLES
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.5, random_state=123)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.25, random_state=123)

# Scale the features.
# The scaler is fit on the training data only and then applied unchanged to the
# testing and validation data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

# Logistic Regression: Training

Train the logistic regression model.

In [None]:
# Build a logistic regression without a penalty term (lbfgs solver, at most
# 1000 iterations), then train it on the scaled training features.
logreg = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    max_iter=1000,
)
logreg.fit(X_train_scaled, y_train)

# Define a function to plot decision boundaries.

Below, I have defined a function to plot our decision boundaries. You do not need to modify this code. We will use this function later in the notebook.

In [None]:
def plot_decision_boundaries(X, Y, predicted_Y, model, label1, label2):
    """Plot a classifier's decision regions together with the labelled points.

    Parameters
    ----------
    X : array-like with two feature columns
        Column 0 is drawn on the x-axis and column 1 on the y-axis. Anything
        np.asarray can convert (NumPy array, DataFrame) is accepted.
    Y : array-like
        True class label of each row of X; colors the points
        (0 -> red, 1 -> blue).
    predicted_Y : array-like
        Not used by the function; kept so existing calls keep working.
    model : fitted classifier
        Must provide predict() for a two-column array of grid points.
    label1, label2 : str
        Axis labels for the x-axis and the y-axis respectively.
    """
    # Positional slicing (X[:, 0]) only works on arrays, so convert up front.
    X = np.asarray(X)

    # Get decision boundaries by creating a grid of potential x1 and x2 points
    # that extends one unit beyond the data in every direction.
    h = 0.02  # Step size in the mesh
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    x1x1, x2x2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h))

    # Get predictions for each point in the mesh, then restore the grid shape
    # so the predictions can be drawn as filled contours.
    Z = model.predict(np.c_[x1x1.ravel(), x2x2.ravel()])
    Z = Z.reshape(x1x1.shape)

    # Plot the true labels as a scatter plot
    custom_palette = {0: 'red', 1: 'blue'}
    scatter = sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=Y, palette=custom_palette)

    # Shade the predicted regions with a colormap that matches the scatter
    # plot colors (red for class 0, blue for class 1).
    plt.contourf(x1x1, x2x2, Z, cmap=plt.cm.RdYlBu, alpha=0.2)

    # Add legend for the scatter plot
    scatter.legend()

    plt.title('Decision Boundaries')
    plt.xlabel(label1)
    plt.ylabel(label2)
    plt.show()

# Evaluate the Logistic Regression on the training data.

Below, we use several approaches to evaluate our classifier on the training data.
* accuracy: The proportion of correctly classified examples.
* confusion matrix: Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class and predicted label being j-th class.
* plot: Shows the true (color of point) and predicted (color of background) values for each item in the training dataset.

In [None]:
# Ask the fitted logistic regression to label the examples it was trained on.
y_pred_train = logreg.predict(X_train_scaled)

# Score those predictions against the true training labels.
conf_matrix_train = confusion_matrix(y_train, y_pred_train)
accuracy_train = accuracy_score(y_train, y_pred_train)

# Report the accuracy (two decimals) followed by the confusion matrix.
print(f'Accuracy: {accuracy_train:.2f}', 'Confusion Matrix:', conf_matrix_train, sep='\n')

# Draw the decision regions with the true training labels on top.
plot_decision_boundaries(
    X=X_train_scaled,
    Y=y_train,
    predicted_Y=y_pred_train,
    model=logreg,
    label1=label1,
    label2=label2,
)

# Evaluate the Logistic Regression on the testing data.

Below, we use several approaches to evaluate our classifier on the testing data.
* accuracy: The proportion of correctly classified examples.
* confusion matrix: Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class and predicted label being j-th class.
* plot: Shows the true (color of point) and predicted (color of background) values for each item in the testing dataset.

In [None]:
# Ask the fitted logistic regression to label the held-out testing examples.
y_pred_test = logreg.predict(X_test_scaled)

# Score those predictions against the true testing labels.
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
accuracy_test = accuracy_score(y_test, y_pred_test)

# Report the accuracy (two decimals) followed by the confusion matrix.
print(f'Accuracy: {accuracy_test:.2f}', 'Confusion Matrix:', conf_matrix_test, sep='\n')

# Draw the decision regions with the true testing labels on top.
plot_decision_boundaries(
    X=X_test_scaled,
    Y=y_test,
    predicted_Y=y_pred_test,
    model=logreg,
    label1=label1,
    label2=label2,
)

# Fit Support Vector Machine and evaluate it on the training data

Now, we will fit a support vector machine and evaluate it on the training data. 

**Change**: Change the following things to fit different models (try a few!)
* the kernel: polynomial (***poly***) and radial basis function (***rbf***) are available.
* degree: When using the polynomial kernel, define the degree of the polynomial as some positive integer >= 1.
* C: Regularization parameter. When C is low, the classifier will look for large margins, and will allow for a lot of misclassification. When C is high, the classifier will find a smaller margin, and will penalize misclassification more.

In [None]:
# Build the support vector machine and train it on the scaled training data.
# You can adjust the kernel and C as needed; to use the polynomial kernel you
# also need to add a degree parameter.
svm_model = SVC(
    kernel='rbf',
    C=100,
)
svm_model.fit(X_train_scaled, y_train)

# Ask the fitted SVM to label the examples it was trained on.
y_pred_train_svm = svm_model.predict(X_train_scaled)

# Score those predictions against the true training labels.
conf_matrix_train_svm = confusion_matrix(y_train, y_pred_train_svm)
accuracy_train_svm = accuracy_score(y_train, y_pred_train_svm)

# Report the accuracy (two decimals) followed by the confusion matrix.
print(f'Accuracy: {accuracy_train_svm:.2f}', 'Confusion Matrix:', conf_matrix_train_svm, sep='\n')

# Draw the decision regions with the true training labels on top.
plot_decision_boundaries(
    X=X_train_scaled,
    Y=y_train,
    predicted_Y=y_pred_train_svm,
    model=svm_model,
    label1=label1,
    label2=label2,
)

# Fit Support Vector Machine and evaluate it on the testing data

There are issues with choosing our model based on its performance on the training data. Here, we repeat what we did above, except we evaluate our model on the testing data instead of the training data.

**Change**: Change the following things to fit different models (try a few!)
* the kernel: polynomial (***poly***) and radial basis function (***rbf***) are available.
* degree: When using the polynomial kernel, define the degree of the polynomial as some positive integer >= 1.
* C: Regularization parameter. When C is low, the classifier will look for large margins, and will allow for a lot of misclassification. When C is high, the classifier will find a smaller margin, and will penalize misclassification more.

**NOTE**: When you finish experimenting in this and the previous block, choose the model you think is the most appropriate, and make sure it is the last model you fit, as we will use it below.

In [None]:
# Build the support vector machine and train it on the scaled training data.
# You can adjust the kernel and C as needed; to use the polynomial kernel you
# also need to add a degree parameter.
svm_model_v2 = SVC(
    kernel='rbf',
    C=100,
)
svm_model_v2.fit(X_train_scaled, y_train)

# Ask the fitted SVM to label the held-out testing examples.
y_pred_test_svm = svm_model_v2.predict(X_test_scaled)

# Score those predictions against the true testing labels.
conf_matrix_test_svm = confusion_matrix(y_test, y_pred_test_svm)
accuracy_test_svm = accuracy_score(y_test, y_pred_test_svm)

# Report the accuracy (two decimals) followed by the confusion matrix.
print(f'Accuracy: {accuracy_test_svm:.2f}', 'Confusion Matrix:', conf_matrix_test_svm, sep='\n')

# Draw the decision regions with the true testing labels on top.
plot_decision_boundaries(
    X=X_test_scaled,
    Y=y_test,
    predicted_Y=y_pred_test_svm,
    model=svm_model_v2,
    label1=label1,
    label2=label2,
)

# Simple Hold Out Cross-Validation (Required for 6990 only)

In the box below, write a for loop that uses cross validation to select the best value for some parameter.

Hints:
* Select a parameter to compare (e.g., the degree of the polynomial or C)
* Define a list with the values you want to consider.
* Write a for loop to iterate over the list, train a model with the current value of the parameter, and record the error on the held out test dataset.
* Find which value of the parameter had the lowest error, and print this value.

In [None]:
# code goes here.

# Evaluate your selected model on the validation dataset

***IMPORTANT: Only run this model once, when you have finalized your decision regarding which model to use! Looking at the results on your validation data and then updating your model is <ins> not cool </ins>***.

In [None]:
# Specify the model (defined above) that you want to use moving forward
my_best_model = svm_model_v2

# Predictions on the validation set.
# This must use the validation split (X_val_scaled / y_val): the testing split
# was already used to compare models, so scoring on it again would not give a
# fresh estimate of how the chosen model performs on unseen data.
y_pred_final_val = my_best_model.predict(X_val_scaled)

# Evaluate the model against the true validation labels
accuracy_final_val = accuracy_score(y_val, y_pred_final_val)
conf_matrix_final_val = confusion_matrix(y_val, y_pred_final_val)

# Print the results
print(f'Accuracy: {accuracy_final_val:.2f}')
print('Confusion Matrix:')
print(conf_matrix_final_val)

# Plot the decision boundaries with the true validation labels on top
plot_decision_boundaries(X=X_val_scaled, Y=y_val, predicted_Y=y_pred_final_val, model=my_best_model, label1=label1, label2=label2)