# HW5

In [1]:
import numpy as np
import pandas as pd

## 5.1

In [2]:
data = pd.read_csv('titanic_data.csv')

data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


## (a)

In [3]:
class KNN:
    def __init__(self, k, dis = None):
        self.k = k
        if dis == None:
            self.dis = self._euclidean
        
    def _euclidean(self, x, y):
        return np.sqrt(np.sum((x - y)**2))
    
    def fit(self, X, y):
        self.feature = X
        self.label = y
        self.uniq = np.unique(y)
        
    def nearest(self, new_single):   ## find nearest indices
        train_dis = self.feature.apply(lambda x: self.dis(new_single, x), axis = 1)
        return train_dis.sort_values().index     #  return the index of nearest value, np.argsort() cannot handle when self.feature is shuffled
        
    def predict(self, new):
        if new.ndim == 1:
            #print(self.nearest(new))
            k_ind = self.nearest(new)[:self.k]
            k_label = self.label[k_ind]
            count = np.bincount(k_label, minlength = self.uniq.shape[0])
            #print(count)
            pre = np.argmax(count)
            return pre
        
        pre = np.ones(new.shape[0])
        for i in range(new.shape[0]):
            #print(new)
            k_ind = self.nearest(new.iloc[i])[:self.k]
            k_label = self.label[k_ind]
            #print(k_ind)
            count = np.bincount(k_label, minlength = self.uniq.shape[0])
            
            pre[i] = np.argmax(count)
            
        return pre    
        

In [4]:
K = KNN(int(round(np.sqrt(data.shape[0]))))
K.fit(data.drop(['Survived'],axis = 1), data.Survived)

## (b)

I use Euclidean distance, I think it's standard if we do not know the exact distribution of the data. The Euclidean distance might not be suitable for discrete data (especially  binary data). I think I can improve my classifier with better distance in future research.

## (c)

This is my own feature:

In [5]:
new = pd.Series({'Pclass':3, 'Sex':0, 'Age': 23, 'Siblings/Spouses Aboard': 1, 'Parents/Children Aboard':0, 'Fare': 0})

In [6]:
K.predict(new)

0

I would not survived the Titanic sinking based on the KNN prediction.

## (d)

I use square root of N as the choice of K, where N is the total number of samples. I implemented KNN with several K and the square root of N gave the best result.

In [7]:
int(round(np.sqrt(data.shape[0])))

30

## (e)

I used cross validation to assess the realiablility of my model. I divided my data into 10 folds and compute the accuracy on test sets 

In [8]:
def KNN_cross_val(data, fold = 10):
        indices = np.random.permutation(list(data.index))
        testsize = data.shape[0]//fold
        acc = []
        for i in range(fold):
            if i == fold - 1:
                test = data.loc[indices[i*testsize:]]
                train = data.loc[data.index.isin(indices[i*testsize:]) == False]
            else:
                test = data.loc[indices[i*testsize : (i+1)*testsize]]
                train = data.loc[data.index.isin(indices[i*testsize : (i+1)*testsize]) == False]
            
            #print(train.shape)
            K = KNN(k = 30)
            K.fit(train.drop(['Survived'],axis = 1), train.Survived)
            
            pre = K.predict(test.drop(['Survived'],axis = 1))
            
            acc.append((pre == test.Survived).sum()/len(test.Survived))
            
        #return np.array(acc).mean()
        return acc

In [11]:
acc = KNN_cross_val(data)
acc

[0.6931818181818182,
 0.6818181818181818,
 0.7386363636363636,
 0.7159090909090909,
 0.7613636363636364,
 0.6931818181818182,
 0.6818181818181818,
 0.7159090909090909,
 0.6931818181818182,
 0.7157894736842105]

In [12]:
np.array(acc).mean()

0.7090789473684211

I got accuracy around 70% in each fold. The model predict farely well on this data. In future work, I may improve the accuracy of my model by specifying the distance of each variable.

## 5.2

### (a)

In [13]:
class NaiveBayes:
    def __init__(self):
        return None
        
    def fit(self, data):
        self.data = data
        self.data0 = data[data.Survived == 0]
        self.data1 = data[data.Survived == 1]
        
        self.py0 = self.data0.shape[0]/self.data.shape[0]
        self.py1 = 1 - self.py0
        
        
        self.y0 = self.data0.Survived
        self.X0 = self.data0.drop(['Survived'],axis = 1)
        
        self.y1 = self.data1.Survived
        self.X1 = self.data1.drop(['Survived'],axis = 1)
        
        n0 = self.data0.shape[0]  ## sample size
        n1 = self.data1.shape[0]  ## sample size
        
        
        # Pclass [1,2,3]
        self.y0_Pclass = [(self.X0.Pclass == 1).sum()/n0]
        self.y0_Pclass.append((self.X0.Pclass == 2).sum()/n0)
        self.y0_Pclass.append((self.X0.Pclass == 3).sum()/n0)
        
        self.y1_Pclass = [(self.X1.Pclass == 1).sum()/n1]
        self.y1_Pclass.append((self.X1.Pclass == 2).sum()/n1)
        self.y1_Pclass.append((self.X1.Pclass == 3).sum()/n1)
        
        # Sex  [0,1]
        self.y0_Sex = [(self.X0.Sex == 0).sum()/n0]
        self.y0_Sex.append((self.X0.Sex == 1).sum()/n0)
        
        self.y1_Sex = [(self.X1.Sex == 1).sum()/n1]
        self.y1_Sex.append((self.X1.Sex == 0).sum()/n1)
        
        # Age [mu,sigma]
        self.y0_Age = [self.X0.Age.mean()]  # mu
        self.y0_Age.append(self.X0.Age.std(ddof = 0)) # sigma
        
        self.y1_Age = [self.X1.Age.mean()]
        self.y1_Age.append(self.X1.Age.std(ddof = 0))
        
        
        # Siblings/Spouses Aboard
        self.y0_Sib = (self.X0['Siblings/Spouses Aboard']).mean()  # lambda
        
        self.y1_Sib = (self.X1['Siblings/Spouses Aboard']).mean()  # lambda
        
        # Parents/Children Aboard
        self.y0_Par = (self.X0['Parents/Children Aboard']).mean()  # lambda
        
        self.y1_Par = (self.X1['Parents/Children Aboard']).mean()  # lambda
        
        # Fare [mu,sigma]
        self.y0_Fare = [self.X0.Fare.mean()]  # mu
        self.y0_Fare.append(self.X0.Fare.std(ddof = 0)) # sigma
        
        self.y1_Fare = [self.X1.Fare.mean()]  # mu
        self.y1_Fare.append(self.X1.Fare.std(ddof = 0)) # sigma
        
    def _predict(self, new = None):
        # compute prob of y0(just numerator)
        y0_prob = [self.py0]
        y0_prob.append(self.y0_Pclass[new.Pclass.astype(int) - 1])  # prob of Pclass
        y0_prob.append(self.y0_Sex[new.Sex.astype(int) - 1])  # prob of Sex
        y0_prob.append(1/self.y0_Age[1] * np.exp( - np.square(new.Age - self.y0_Age[0])/2/np.square(self.y0_Age[1])))  # prob of Age
        y0_prob.append(np.power(self.y0_Sib , new['Siblings/Spouses Aboard']) * np.exp(-self.y0_Sib) / \
                       np.math.factorial(new['Siblings/Spouses Aboard']))  # Siblings/Spouses Aboard
        
        y0_prob.append(np.power(self.y0_Par , new['Parents/Children Aboard']) * np.exp(-self.y0_Par) / \
                       np.math.factorial(new['Parents/Children Aboard']))  # Parents/Children Aboard
        
        y0_prob.append(1/self.y0_Fare[1] * np.exp( - np.square(new.Fare - self.y0_Fare[0])/2/np.square(self.y0_Fare[1])))  # prob of Fare
        
        # compute prob of y1(just numerator)
        y1_prob = [self.py1]
        y1_prob.append(self.y1_Pclass[new.Pclass.astype(int) - 1])  # prob of Pclass
        y1_prob.append(self.y1_Sex[new.Sex.astype(int) - 1])  # prob of Sex
        y1_prob.append(1/self.y1_Age[1] * np.exp( - np.square(new.Age - self.y1_Age[0])/2/np.square(self.y1_Age[1])))  # prob of Age
        y1_prob.append(np.power(self.y1_Sib , new['Siblings/Spouses Aboard']) * np.exp(-self.y1_Sib) / \
                       np.math.factorial(new['Siblings/Spouses Aboard']))  # Siblings/Spouses Aboard
        
        y1_prob.append(np.power(self.y1_Par , new['Parents/Children Aboard']) * np.exp(-self.y1_Par) / \
                       np.math.factorial(new['Parents/Children Aboard']))  # Parents/Children Aboard
        
        y1_prob.append(1/self.y1_Fare[1] * np.exp( - np.square(new.Fare - self.y1_Fare[0])/2/np.square(self.y1_Fare[1])))  # prob of Fare
        
        
        return  0 if np.prod(y0_prob) >= np.prod(y1_prob) else 1
        
    def predict(self, new = None):
        if new.ndim == 1:
            return self._predict(new)
        
        pre = np.ones(new.shape[0])
        for i in range(new.shape[0]):
            pre[i] = self._predict(new.iloc[i])
        return pre 
        

In [14]:
NB = NaiveBayes()
NB.fit(data)

### (b)

There are 3 values in **Pclass**, I assume them as multinomial distribution.

There are binary digits in **Sex**, I assume them as Bernoulli distribution.

The data in **age** range from 0.42 to 80, I assume them as normal distribution.

There are 9 values (0-8) in **Siblings/Spouses Aboard**, I assume them as poisson distribution.

There are 7 values (0-6) in **Parents/Children Aboard**, I assume them as poisson distribution.

The data in **Fare** range from 0 to 512.3, I assume them as normal distribution.

### (c)

This is my own feature:

In [15]:
new = pd.Series({'Pclass':3, 'Sex':0, 'Age': 23, 'Siblings/Spouses Aboard': 1, 'Parents/Children Aboard':0, 'Fare': 0})
new

Pclass                      3
Sex                         0
Age                        23
Siblings/Spouses Aboard     1
Parents/Children Aboard     0
Fare                        0
dtype: int64

In [16]:
NB.predict(new)

0

I would not survived the Titanic sinking based on the Naive Bayes prediction.

### (d)

I used cross validation to assess the realiablility of my model. I divided my data into 10 folds and compute the accuracy on test sets 

In [17]:
def NB_cross_val(data, fold = 10):
        indices = np.random.permutation(list(data.index))
        testsize = data.shape[0]//fold
        acc = []
        NB = NaiveBayes()
        for i in range(fold):
            if i == fold - 1:
                test = data.loc[indices[i*testsize:]]
                train = data.loc[data.index.isin(indices[i*testsize:]) == False]
            else:
                test = data.loc[indices[i*testsize : (i+1)*testsize]]
                train = data.loc[data.index.isin(indices[i*testsize : (i+1)*testsize]) == False]
            
            #print(train.shape)
            
            NB.fit(train)
            
            pre = NB.predict(test.drop(['Survived'],axis = 1))
            
            acc.append((pre == test.Survived).sum()/len(test.Survived))
            
        #return np.array(acc).mean()
        return acc

In [18]:
acc = NB_cross_val(data)
acc

[0.7272727272727273,
 0.625,
 0.6477272727272727,
 0.625,
 0.7045454545454546,
 0.625,
 0.6590909090909091,
 0.6022727272727273,
 0.7272727272727273,
 0.6210526315789474]

In [19]:
np.array(acc).mean()

0.6564234449760765

I got accuracy around 66% in each fold. The model predict farely well on this data. In future work, I may improve the accuracy of my model by specifying the distribution of each variable.

## 5.3

I prefer Random Forest and Decision Tree model, the random forest has the highest accuracy and is robust to different kinds of data. The random forest also does not require any strong assumption on the features. The accuracy of Random Forest can still be improved if we specify each feature carefully instead of just simplifying them into binary digits.

The Logistic regression trains data in a more statistical way, it assumes data as i.i.d Bernoulli distribution, has the explicit formula for log-likelihood, we use gradient descent to compute MLE. In this method, we can predict the probability of each new observation. But each feature has its own property and range, I think it would be better if I standardized each feature before training the data.

The KNN method is a lazy algorithm. The computation complexity grows drastically as the number of features growing (curse of dimensionality). Combining these 2 aspects, the training time would be extremely long. We also need to be careful in choosing the right K.

The Naive Bayes did not work so well on this dataset. It also requires data having independent features. One thing we need to be careful about is if one category in the test set did not show in the train set, the probability will zero out. We can add some regularization to avoid this.

In a nutshell, the choice of methods really depends on the data. We need to know the data well and use the method most suitable for the data.