# **Lesson 1: The KNearestNeighbors Algorithm On The Titanic Dataset**

In [1]:
# Import the libraries we will need to use
# For today's lesson, only the basic libraries are necessary

import pandas as pd
import numpy as np

In [2]:
# Read the data into a dataframe object

df = pd.read_csv("/kaggle/input/titanic/train.csv")

# View the first 5 rows 

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Identify the target class and seperate it from the data

targets = df["Survived"]

df.drop(["Survived", "PassengerId"], axis=1, inplace=True)

# The parameter inplace creates and saves changes directly to df variable

In [4]:
# Look at what we are trying to predict -- how balanced is it?

targets.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [5]:
# Machine learning models typically can't handle NaN value
# We can fill them in a few different ways

df.isnull().any()

Pclass      False
Name        False
Sex         False
Age          True
SibSp       False
Parch       False
Ticket      False
Fare        False
Cabin        True
Embarked     True
dtype: bool

In [6]:
# Based on the above results, define a list containing the nan columns

nan_cols = ["Age", "Cabin", "Embarked"]

# Fill these columns in with most frequent value; An alternative would be to remove all rows containing NaN elements

for col in nan_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [7]:
# Check if w.align filled in the nan values

df.isnull().any()

Pclass      False
Name        False
Sex         False
Age         False
SibSp       False
Parch       False
Ticket      False
Fare        False
Cabin       False
Embarked    False
dtype: bool

In [8]:
# Determine the usefulness of the columns -- Are their any we should drop?

df.drop("Name", axis=1, inplace=True)

In [9]:
# Create a list storing the column names of our data, and determine which are categorical
# and numerical

df_cols = df.columns 
cat_cols = [col for col in df_cols if df[col].dtype == "object"]
num_cols = [col for col in df_cols if col not in cat_cols]

print(num_cols)
print(cat_cols)

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Sex', 'Ticket', 'Cabin', 'Embarked']


In [10]:
# Algorithms like numbers, not text. 
# How should we convert the categorical columns to numbers?

for col in cat_cols:
    elements = df[col].unique()
    
    element2idx = {v : k for k, v in enumerate(elements)}
    
    df[col] = df[col].apply(lambda x: element2idx[x])
    
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,0,22.0,1,0,0,7.25,0,0
1,1,1,38.0,1,0,1,71.2833,1,1
2,3,1,26.0,0,0,2,7.925,0,0
3,1,1,35.0,1,0,3,53.1,2,0
4,3,0,35.0,0,0,4,8.05,0,0


In [11]:
# Now that our data is processed, and algorithm friendly, let's model this data
# For today's lesson, we will use the one of the simplest algorithms -- KNearest Neighbors 


class KNearestNeighborsClassifier():
    """
    For any given sample, this algorithm will try to find
    the K most similar samples, and then returns the most 
    frequent class among those samples. For example, let's 
    say that we are looking for '5 neighbors', which respective
    classes 1, 2, 1, 1, 2. The most common amongst them is 1. 
    Our model will predict 1.
    """
    
    ### What parameter(s) should our class take in
    
    def __init__(self, data, labels, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.data = data
        self.labels = labels
        
    ### How should we measure the similarity between two samples?
        
    def compute_similarity_euc(self, sample1, sample2):
        total = 0
        
        for feat1, feat2 in zip(sample1, sample2):
            total += (feat1 - feat2) ** 2
            
        total **= (1/2)
        
        return total
    
    ### Look through all of our data, and extract the most similar samples
    
    def most_similar_samples(self, sample):
        sample_similarities = []
        
        for sample_ in self.data:
            sample_similarities.append(self.compute_similarity_euc(sample, sample_))
            
        indices = np.argsort(sample_similarities)[::-1]
        
        return indices[:self.n_neighbors]
    
    ### Helper function to calculate the most frequent value of a list
    
    def mode(self, values):
        count_dict = {v : 0 for v in values}
        
        for v in values:
            count_dict[v] += 1
            
        return max(count_dict, key=count_dict.get)
    
    ### How can we use our functions above to predict new samples?
    
    def predict(self, X):
        predictions = []
        for sample in X:
            closest_neighbors = self.most_similar_samples(sample)
            neighbor_labels = self.labels[closest_neighbors]
            predictions.append(self.mode(neighbor_labels))
            
        return predictions

In [12]:
# Now that we have our data analyzed and processed, convert to numpy arrays

data = df.values 
targets = targets.values

In [13]:
# Define training data, and testing data
# Determine test size

np.random.seed(42)
np.random.shuffle(data)
np.random.seed(42)
np.random.shuffle(targets)

test_size = 0.2 

train_length = int(len(data)*0.2)

X_train, X_valid = data[:train_length], data[train_length:]
y_train, y_valid = targets[:train_length], targets[train_length:]

In [14]:
# Instantiate our model, and make predictions on new data

model = KNearestNeighborsClassifier(X_train, y_train, 10)
predictions = model.predict(X_valid)

In [15]:
# Helper function to determine how accurate our model is

def accuracy(x, y):
    counter = 0

    for x_, y_ in zip(x, y):
        counter += int(x_ == y_)
        
    return counter/len(x)

In [16]:
# Algorithm Analysis -- How good is our algorithm and how can we improve it

print(f"Baseline Accuracy: {accuracy(y_valid, predictions)}")

Baseline Accuracy: 0.605890603085554


In [17]:
# To improve an algorithm, we must "tune" the algorithm
# This means finding the best number of neighbors based on accuracy in our case

neighbor_candidates = [x for x in range(1, 20)]
results = []


for n_neighbors in neighbor_candidates:
    
    print(f"Training Algorithm With {n_neighbors} Neighbors")
    
    model = KNearestNeighborsClassifier(X_train, y_train, n_neighbors)
    predictions = model.predict(X_valid)
    
    accuracy_score = accuracy(y_valid, predictions)
    
    results.append(accuracy_score)

Training Algorithm With 1 Neighbors
Training Algorithm With 2 Neighbors
Training Algorithm With 3 Neighbors
Training Algorithm With 4 Neighbors
Training Algorithm With 5 Neighbors
Training Algorithm With 6 Neighbors
Training Algorithm With 7 Neighbors
Training Algorithm With 8 Neighbors
Training Algorithm With 9 Neighbors
Training Algorithm With 10 Neighbors
Training Algorithm With 11 Neighbors
Training Algorithm With 12 Neighbors
Training Algorithm With 13 Neighbors
Training Algorithm With 14 Neighbors
Training Algorithm With 15 Neighbors
Training Algorithm With 16 Neighbors
Training Algorithm With 17 Neighbors
Training Algorithm With 18 Neighbors
Training Algorithm With 19 Neighbors


In [18]:
# Identify the best parameters

for n_neighbor in neighbor_candidates:
    print(f"Neighbors: {n_neighbor} | Accuracy: {results[n_neighbor-1]}")

Neighbors: 1 | Accuracy: 0.5021037868162693
Neighbors: 2 | Accuracy: 0.5021037868162693
Neighbors: 3 | Accuracy: 0.4950911640953717
Neighbors: 4 | Accuracy: 0.4950911640953717
Neighbors: 5 | Accuracy: 0.47685834502103785
Neighbors: 6 | Accuracy: 0.4782608695652174
Neighbors: 7 | Accuracy: 0.6143057503506312
Neighbors: 8 | Accuracy: 0.6072931276297335
Neighbors: 9 | Accuracy: 0.5946704067321178
Neighbors: 10 | Accuracy: 0.605890603085554
Neighbors: 11 | Accuracy: 0.6072931276297335
Neighbors: 12 | Accuracy: 0.6086956521739131
Neighbors: 13 | Accuracy: 0.6044880785413744
Neighbors: 14 | Accuracy: 0.605890603085554
Neighbors: 15 | Accuracy: 0.6072931276297335
Neighbors: 16 | Accuracy: 0.6072931276297335
Neighbors: 17 | Accuracy: 0.6016830294530154
Neighbors: 18 | Accuracy: 0.603085553997195
Neighbors: 19 | Accuracy: 0.6072931276297335


In [19]:
# Determine the best neighbor count, and make our final predictions!

model = KNearestNeighborsClassifier(X_train, y_train, 7)
predictions = model.predict(X_valid)