# CS-513 Midterm 1

## Question 5
___
Wyatt Blair

3/26/24
___


#5 (25 Points)

Load the CANVAS “Admission_v2.CSV” dataset into R/Python. Remove the missing values.  Construct an unweighted knn = (3,5,7) model to classify admission (Admit=’YES’) based on the other variables. Predict admission for a random sample (30%) of the data (test dataset). Measure the accuracy of the model for each K.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import functools

In [2]:
dataset = pd.read_csv("../data/Admission_v2.csv", index_col='Applicant')

dataset['GPA'] = dataset['GPA'].replace(' ', np.nan).astype(np.float64)
dataset['GRE'] = dataset['GRE'].replace(' ', np.nan).astype(np.float64)
dataset['ADMIT'] = dataset['ADMIT'].apply(lambda admit: np.int_(admit == 'YES'))

dataset['GPA'] = dataset['GPA'].fillna(dataset['GPA'].mean())
dataset['GRE'] = dataset['GRE'].fillna(dataset['GRE'].mean())

dataset.isnull().any()

ADMIT    False
GRE      False
GPA      False
RANK     False
dtype: bool

In [3]:
def min_max_normalization(series):

    minimum, maximum = series.min(), series.max()
    normalized_data = (series - minimum) / (maximum - minimum)

    return normalized_data

def z_score_standardization(series):

    mean, std = series.mean(), series.std()
    normalized_data = (series - mean) / std

    return normalized_data


In [4]:
min_max_normalized_dataset = dataset.apply(min_max_normalization, axis=0)
z_score_normalized_dataset = dataset.apply(z_score_standardization, axis=0)

In [5]:
using_dataset = min_max_normalized_dataset

train = using_dataset.sample(frac=0.7)
test = using_dataset.drop(train.index)

In [6]:
# borrowing some code from HW-3-KNN

def minkowski_distance(x, y, p):

    diff = x - y
    power = diff ** p
    total = power.sum(axis=1)
    distance = total ** (1/p)

    return distance

def euclidean_distance(x, y):
    
    return minkowski_distance(x, y, p=2)

def manhattan_distance(x, y):

    diff = x - y
    absolute = abs(diff)
    distance = absolute.sum(axis=1)

    return distance

# ========================================================================

def find_k_nearest_neighbors(point, k, distance_metric=euclidean_distance):

    neighbors = train.copy()
    feat_columns = ['GRE', 'GPA', 'RANK']

    y = np.broadcast_to(point[feat_columns].values, (len(train.index), len(feat_columns)))

    x_data = neighbors[feat_columns].copy()
    x = x_data.values

    neighbors['distance'] = distance_metric(x, y)
    neighbors.sort_values(by='distance', ascending=True, inplace=True)
    nearest_neighbors = neighbors.head(k)

    return nearest_neighbors

def classify(neighbors):

    neighbor_classes = neighbors['ADMIT']
    counts = neighbor_classes.value_counts()
    closest_class = counts.sort_values().index[0]

    return closest_class

def predict(point, k, distance_metric=euclidean_distance):

    nearest_neighbors = find_k_nearest_neighbors(point, k, distance_metric=distance_metric)
    classification = classify(nearest_neighbors)

    return classification

# ========================================================================

def test_knn(distance_metric, k_vals=[1, 3, 5, 7]):
    
    accuracy_dict = {}
    df = test.copy()

    for k in k_vals:

        predict_func = functools.partial(
            predict,
            k=k,
            distance_metric=distance_metric,
        )

        df[f"prediction_k={k}"] = df.apply(predict_func, axis=1)
        df[f"correct_k={k}"] = df['ADMIT'] == df[f'prediction_k={k}']

        percent_correct = df[f"correct_k={k}"].value_counts()[True] / len(df.index)
        accuracy_dict.update({f'accuracy_k={k}' : percent_correct})
    
    return accuracy_dict

In [7]:
test_knn(euclidean_distance)

{'accuracy_k=1': 0.6333333333333333,
 'accuracy_k=3': 0.5166666666666667,
 'accuracy_k=5': 0.425,
 'accuracy_k=7': 0.30833333333333335}

In [8]:
test_knn(manhattan_distance)

{'accuracy_k=1': 0.625,
 'accuracy_k=3': 0.575,
 'accuracy_k=5': 0.36666666666666664,
 'accuracy_k=7': 0.275}

In [9]:
test_knn(functools.partial(minkowski_distance, p=6))

{'accuracy_k=1': 0.6416666666666667,
 'accuracy_k=3': 0.5166666666666667,
 'accuracy_k=5': 0.43333333333333335,
 'accuracy_k=7': 0.325}

In [10]:
test_knn(functools.partial(minkowski_distance, p=4))

{'accuracy_k=1': 0.6333333333333333,
 'accuracy_k=3': 0.5166666666666667,
 'accuracy_k=5': 0.43333333333333335,
 'accuracy_k=7': 0.3}

In [11]:
test_knn(functools.partial(minkowski_distance, p=8))

{'accuracy_k=1': 0.6333333333333333,
 'accuracy_k=3': 0.525,
 'accuracy_k=5': 0.43333333333333335,
 'accuracy_k=7': 0.325}

### Based on these results, the best performing KNN-Model within the parameters of the question is k=3 using any of the three defined distance metrics. I decided to test k=1 to see how it performed, and it out-performed k=3 on the test dataset making it the best model. However, as I said, within the bounds of the midterm question the k=3 model performs the best.