In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

### 1
Which of the following statements (select all that apply) are true of the confusion matrix report for binary classification?
* a) it reports sizes of each cluster and number of the clusters;
* b) it is a square matrix;
* c) it is symmetrical;
* d) it has dimensions $2\times2$;
* e) it has dimensions $3\times3$;
* f) it reports the quality of classification fit over the test set;
* g) it reports the percentage of correctly classified datapoints over the test set;
* h) it reports the percentage of incorrectly classified datapoints over the test set;

Responses here

### 2
Given the following synthetic data and the plotting function provided for you, you are to implement two classification models: Logistic Regression and SVC.

a. Partition the data into training and testing sets of equal size.

b. Create a logistic regression model with C=10000 and fit the model with the training data

c. Report the coefficients and intercept of the fitted model (consult the API documentation for these statistics)

d. Use the plot_decision_regions function to plot the test data and the classifier

e. Construct the confusion matrix such that you can report TP, FP, TN, and FN. Using those four values, report the accuracy, precision, and recall of your predictions from the logistic regression model

f. Create a support vector classifier with C=10000 and a linear kernel. Fit the model with the training data.

g. Use the plot_decision_regions function to plot the test data and the classifier

h. Construct the confusion matrix such that you can report TP, FP, TN, and FN. Using those four values, report the accuracy, precision, and recall of your predictions from the support vector classifier

In [None]:
# Synthetic dataset for question 2: 1500 samples, two features, two classes.
X, y = make_classification(
    n_samples=1500,
    n_features=2,
    n_redundant=0,
    n_classes=2,
    class_sep=2,
    random_state=0,
)

In [None]:
# Used for plotting decision regions
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions with the samples drawn on top.

    Draws onto the current pyplot figure and returns None.

    Parameters
    ----------
    X : 2-D array
        Sample matrix. Column 0 is drawn on the x-axis and column 1 on the
        y-axis; the classifier is queried with exactly these two columns.
    y : 1-D array
        Class label for each row of ``X``. At most five distinct labels are
        supported, because only five marker/color pairs are defined.
    classifier : object
        Fitted model exposing ``predict`` for an ``(n_points, 2)`` array.
    test_idx : index-like, optional
        Rows of ``X`` to ring as the test set (e.g. a list, ``range`` or
        NumPy index array). ``None`` or an empty index skips the highlight.
    resolution : float, default 0.02
        Step of the grid on which the classifier is evaluated.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface on a grid padded by one unit around the data
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # one scatter call per class so each gets its own marker and legend entry
    for idx, cl in enumerate(classes):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl,
                    edgecolor='black')

    # highlight test samples
    # `if test_idx:` raises ValueError for a multi-element NumPy array, so
    # check for None/empty explicitly instead of relying on truthiness.
    if test_idx is not None and np.size(test_idx) > 0:
        X_test = X[test_idx, :]

        # facecolors='none' draws hollow rings; an empty string is not a
        # valid color specification.
        plt.scatter(X_test[:, 0], X_test[:, 1],
                    facecolors='none', edgecolor='black', alpha=1.0,
                    linewidth=1, marker='o',
                    s=100, label='Test Set')

### (a) Split the data into training and testing

In [None]:
# Write code here

### (b) Create a logistic regression model with C = 10000 (a large C makes the regularization negligible) and fit it to the training data

In [None]:
# Write code here

### (c) Report coefficients and intercept from fitted model

In [None]:
# Write code here

### (d) Use the function plot_decision_regions to plot the test data with the logistic regression classifier

In [None]:
# Write code here

### (e) Construct the confusion matrix and report the model's accuracy, precision, and recall on the test data for the logistic regression classifier

In [None]:
# Write code here

### (f) Create a support vector classifier with C = 10000 and kernel = linear and train it on the training data

In [None]:
# Write code here

### (g) Use the function plot_decision_regions to plot the test data with the support vector classifier

In [None]:
# Write code here

### (h) Construct the confusion matrix and report the model's accuracy, precision, and recall on the test data for the support vector classifier

In [None]:
# Write code here

### 3

Given the countries dataset, we're going to engage in a rather interesting classification exercise. What we will try to do is classify whether a certain region is in Asia. Therefore, our target value will be the region masked as a binary integer: 0 or 1. 

1 will denote that the region is in Asia. 0 will denote otherwise.

You are tasked with preprocessing the data so that it is in a format the machine learning algorithms can consume. You will then implement these models and analyze them. Finally, you are to plot the data on the world map, similar to previous assignments, to show how accurate your classification model is.

In [None]:
# Load the country table, trim whitespace around the country names, and
# discard every row that has a missing value.
countries = (
    pd.read_csv('countries.csv')
    .assign(COUNTRY=lambda frame: frame['COUNTRY'].str.strip())
    .dropna()
)

# Country shapes used for the world-map plots (`gpd` is geopandas, imported
# with the other dependencies at the top of the notebook).
world = gpd.read_file('World_Countries.shp')

### (a) Create a function region_to_binary which transforms the Region column to a binary value based on whether the region is in Asia or not.

In [None]:
def region_to_binary(row):
    """Turn one row's Region value into the binary Asia / not-Asia target.

    Exercise stub: the body is not implemented yet, so it currently
    returns None for every row.

    Called once per row via ``countries.apply(region_to_binary, axis=1)``.

    NOTE(review): the statements after this function drop a 'COUNTRY'
    column from the apply result, so the implementation presumably has to
    return the (modified) row rather than a bare 0/1 -- confirm.
    """
    # implement this function
    pass

# axis=1 hands each row of `countries` to region_to_binary.
binary_countries = countries.apply(region_to_binary, axis=1)
# Remove the COUNTRY column, then drop rows with missing values (both in place).
binary_countries.drop(columns=['COUNTRY'], axis=1, inplace=True)
binary_countries.dropna(inplace=True)

### (b) Split the data into features and target. In this case, the target is the Region column. Afterwards, split the data into training and test sets (let test_size=0.2).

In [None]:
# write code here

### (c) Implement a logistic regression model and train it on the training data

In [None]:
# write code here

### (d) Construct the confusion matrix and report the logistic regression model's accuracy, precision, and recall on the test data 

In [None]:
# write code here

### (e) Plot the predictions from the logistic regression model on the world map to show how accurate you were in predicting the correct regions.

In [1]:
# write code here

### (f) Implement a support vector classifier and train it on the training data

In [None]:
# write code here

### (g) Construct the confusion matrix and report the SVC's accuracy, precision, and recall on the test data

In [None]:
# write code here

### (h) Plot the predictions from the support vector classifier on the world map to show how accurate you were in predicting the correct regions.

In [2]:
# write code here