In [15]:
from sklearn.metrics import accuracy_score
from collections import Counter
import pandas as pd 
import numpy as np 
import itertools
import random 
import warnings
warnings.filterwarnings("ignore")
random.seed(0)

# Read the data 
train_ws = pd.read_csv('train.csv')
test_ws =  pd.read_csv('test.csv')

# Join the train and test dataframes so the data preprocessing will be done simultaneously in both datasets 
full_ws = train_ws.append(test_ws, ignore_index=True)

In [16]:
def data_preprocessing(ws):
  
  # Label-encode the sex of a passenger 
  ws['Sex'] = ws['Sex'].replace(['male'],0)
  ws['Sex'] = ws['Sex'].replace(['female'],1)

  # Initialize new columns 
  ws['title'] = np.NaN
  ws['alone'] = np.NaN
  ws['cabin_class'] = np.NaN

  # Identify if a passenger is alone in the ship 
  for i,_ in enumerate(ws['alone']):
    if ws['SibSp'][i] + ws['Parch'][i] == 0:
      ws['alone'][i] = 1
    else:
      ws['alone'][i] = 0 

  # Handle missing values
  cols = ['SibSp','Parch','Fare','Age']
  for col in cols:
    ws[col].fillna(ws[col].median(), inplace = True)
    
  # Feature-engineer the cabin-class 
  for i,row in enumerate(ws['Cabin']):
    # Get cabin class 
    ws['cabin_class'][i] =  str(row)[:1]

  # Count the cabin distribution per class (if available) 
  cabin_distribution = {}
  count = 0 
  for row in ws['cabin_class']:
    if row != 'n':
      count += 1 
      if row not in cabin_distribution:
        cabin_distribution[row] = 1 
      else:
        cabin_distribution[row] +=1 

  # Calculate the probability of being in a sepcific cabin-class  
  cabin_pws = {k:v / count for k, v in cabin_distribution.items()}

  # Calculate the cumulative probability of being in a specific cabin-class 
  keys, vals = cabin_pws.keys(), cabin_pws.values()
  cabin_cws = dict(zip(keys, itertools.accumulate(vals)))
  cabin_cws = sorted(cabin_cws.items(), key=lambda x: x[1])    

  # Randomly assign cabin-classes to passengers that are missing the cabin 
  # field, based on the probabilities calculated above 
  for i,row in enumerate(ws['cabin_class']):
    random_num = random.random()
    if row == 'n':
      if random_num < cabin_cws[0][1]:
        ws['cabin_class'][i] =  cabin_cws[0][0]
      elif cabin_cws[0][1] <= random_num < cabin_cws[1][1]:
        ws['cabin_class'][i] =  cabin_cws[1][0]

      elif cabin_cws[1][1] <= random_num < cabin_cws[2][1]:
        ws['cabin_class'][i] =  cabin_cws[2][0]
      
      elif cabin_cws[2][1] <= random_num < cabin_cws[3][1]:
        ws['cabin_class'][i] =  cabin_cws[2][0]

      elif cabin_cws[3][1] <= random_num < cabin_cws[4][1]:
        ws['cabin_class'][i] =  cabin_cws[3][0]

      elif cabin_cws[3][1] <= random_num < cabin_cws[4][1]:
        ws['cabin_class'][i] =  cabin_cws[4][0]

      elif cabin_cws[4][1] <= random_num < cabin_cws[5][1]:
        ws['cabin_class'][i] =  cabin_cws[4][0]
      
      elif cabin_cws[5][1] <= random_num < cabin_cws[6][1]:
        ws['cabin_class'][i] =  cabin_cws[5][0]

      elif cabin_cws[6][1] <= random_num < cabin_cws[7][1]:
        ws['cabin_class'][i] =  cabin_cws[6][0]
      else:
        ws['cabin_class'][i] = cabin_cws[7][0]

  # Perform feature engineering to obtain additional title-info 
  for i,row in enumerate(ws['Name']):
    # Get person's title 
    ws['title'][i] = row.split(',')[1].split('.')[0]

  # Embarked one-hot encoding 
  embarked_dummies = pd.get_dummies(ws.Embarked, prefix='Embarked')
  ws = pd.concat([ws, embarked_dummies], axis=1)

  # Person's title one-hot encoding 
  title_dummies = pd.get_dummies(ws.title, prefix='title')
  ws = pd.concat([ws, title_dummies], axis=1)

  # Cabin class one-hot encoding 
  cabin_class_dummies = pd.get_dummies(ws.cabin_class, prefix = 'cabin_class')
  ws = pd.concat([ws, cabin_class_dummies], axis = 1)

  #Remove unecessary columns 
  del ws['Name']
  del ws['PassengerId']
  del ws['title']
  del ws['Embarked']
  del ws['Cabin']
  del ws['Ticket']
  del ws['cabin_class']

  return ws 

In [17]:
# Preprocess the data and create the train / test sets 
full_ws = data_preprocessing(full_ws)
X_train = full_ws[:891]
y_train = full_ws['Survived'][:891]
X_test = full_ws[891:]
del X_train['Survived']
del X_test['Survived']


print(f'After preprocessing there are {X_train.shape[0]} rows and {X_train.shape[1]} columns in the training data.\n')
print(f'After preprocessing there are {X_test.shape[0]} rows and {X_test.shape[1]} columns in the test data.')

After preprocessing there are 891 rows and 36 columns in the training data.

After preprocessing there are 418 rows and 36 columns in the test data.


In [18]:
# Since we are not going to be using any fancy modules for our algorithm 
# we will be converting our data from pandas dataframe to python lists. 
X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

In [19]:
class KNearestNeighbors():
    def __init__(self,k):
        self.k = k 
        self.X_train = None  
        self.classes = []
    
    @staticmethod 
    def euclidean_distance(row1,row2):
      sum = 0
      for i,j in zip(row1,row2):
        sum += (i-j) ** 2 
      distance = sum ** 0.5 
      return distance


    def _predict(self,classes,distances,k):
      # Sort in descending order the distances and retrieve the index 
      # which maps to their corresponding class 
      idxs = sorted(range(len(distances)),key = lambda x:distances[x])[:self.k] 
      # Finding the class for each neighbor  
      neighbors = [self.classes[idx] for idx in idxs]
      # Choosing the most ocurring class 
      prediction = Counter(neighbors).most_common(1) # bit of a cheat here :)
      return prediction[0][0]

    def fit(self,X_train,y_train):
        self.X_train = X_train
        predictions = []
        # For each passenger in the training dataset:
        for i in range(len(X_train)):
            distances = []
            classes = []
            # Estimate the Euclidean distance for the current 
            # passenger with all the other passengers 
            for x,y in zip(X_train,y_train):
                distances.append(self.euclidean_distance(X_train[i],x))
                # Append the class that corresponds to the current passenger
                self.classes.append(y)
            # Predict the class for the current passenger 
            prediction = self._predict(self.classes,distances,self.k)
            predictions.append(prediction)
        print(f'Utilizing {self.k} groups of nearest-neighbors the training accuracy is at {round(accuracy_score(predictions,y_train)*100,2)}%.')
        return predictions   

    def predict(self,X_test):
        # Predictions for the test data 
        predictions = []
        for i in range(len(X_test)):
            distances = []
            for x in self.X_train:
                distances.append(self.euclidean_distance(X_test[i],x))
            prediction = self._predict(self.classes,distances,self.k)
            predictions.append(prediction)
        return predictions 

In [20]:
# Choosing an arbritrary number for k to be 4 
KNN = KNearestNeighbors(k=4)
KNN.fit(X_train,y_train)
predictions = KNN.predict(X_test)

Utilizing 4 groups of nearest-neighbors the training accuracy is at 88.78%.
