# Question 1

(20 points) Design and implement an iterative Power Method approach to
determine the first principal component of the PCA transformation.

### Power method is a iterative method that will converge to the largest eigenvalue. 

In [31]:
import numpy as np
from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import preprocessing

def converge(x):
    highest_comp = abs(x).max()
    x_n = x / x.max()
    highest_index = np.where(x==highest_comp)
    return highest_comp, highest_index, x_n


def power_method(a):
    
    x = np.ones(len(a))

    for i in range(10):

        x = np.dot(a, x)
        
        highest_value, highest_index, x = converge(x)
        #highest_value = Eigenvalue; x = Eigenvector
        retX = x

    return highest_value, highest_index, retX


def customPCA(A, debug=False):
    
    V = cov(A.T)

    highest_value, highest_index, vectors = power_method(V)

    P = vectors.T.dot(A.T)

    
    return highest_value, highest_index[0][0], vectors


columns_list = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
iris=pd.read_csv('iris.csv',header=0)
iris.columns=columns_list
iris=iris.drop(columns=['class'])

iris_scaled = preprocessing.scale(iris)
iris_scaled = pd.DataFrame(iris_scaled,columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
highest_value, highest_index, vectors = customPCA(iris_scaled)



print('Thus, it is the first principal component:')
print('The  Eigenvalue:', highest_value)
print('And its Eigenvector:',vectors)


Thus, it is the first principal component:
The  Eigenvalue: 2.9303514956384378
And its Eigenvector: [ 0.8987037  -0.45306463  1.          0.97308834]


# Question 2

(20 points) Design and implement an iterative Power Method
approach to determine the second principal component of the PCA
transformation.

In [35]:
def converge1(x):
    
    highest_comp = abs(x).max()
    
    x_n = x / x.max()
    
    
    return highest_comp, x_n


def power_method(a):

    #a = a- highest_value*np.ones(len(a))
    #print(a)
    
    x = np.ones(len(a))
    
    w,v=eig(a)

    for i in range(10):

        x = np.dot(a, x)
        
        second_value, x = converge1(x)
        #second_value = Eigenvalue; x = Eigenvector
        
        retX = x

    return second_value,  retX,w,v


def customPCA(A, debug=False):
    
    V = cov(A.T)

    second_value,  vectors,w,v = power_method(V)

    P = vectors.T.dot(A.T)

    
    return second_value, vectors,w,v




columns_list = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
iris=pd.read_csv('iris.csv',header=0)
iris.columns=columns_list
iris=iris.drop(columns=['class'])

iris_scaled = preprocessing.scale(iris)
iris_scaled = pd.DataFrame(iris_scaled,columns=['sepal length', 'sepal width', 'petal length', 'petal width'])

second_value,  vectors,w,v = customPCA(iris_scaled)

print('Thus, it is the second principal component:')
print('The  Eigenvalue:', w[1])
print('And its Eigenvector:',v[1])


Thus, it is the second principal component:
The  Eigenvalue: 0.9274036215173421
And its Eigenvector: [-0.26335492 -0.92555649  0.24203288 -0.12413481]


# Problem Number 3

 (10 points) Describe the benefit of a gradient approach w.r.t.
the SVD approach for the PCA transformation

#### There are some reasons:

1. PCA wants to do the eigen-decomposition of X.T * X to get the d's biggest eigen vectors, and SVD can calculate these. In scikit-learn, SVD is a part in PCA process.
2. When the dataset is large, it is computing-comsuming to calculate the covariance matrix in PCA. Using SVD would be much faster than simply calculate the eigrn-decomposition of X.T * X.
3. SVD is more accurate than eigenvalue Decomposition of covariance matrix.

# Question 4

(20 points) Using a binary classifier (logistic regression or
SVM) please implement in python both of the 2 different approaches
(one vs. one, one vs. all) to handle the problem of more
than two different classes (use the iris dataset). For
reference use the following link
https://en.wikipedia.org/wiki/Multiclass_classification


In [36]:
columns_list = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
iris=pd.read_csv('iris.csv',header=0)
iris.columns=columns_list
y = iris['class']
iris=iris.drop(columns=['class'])

scaled = preprocessing.scale(iris)
iris_scaled = pd.DataFrame(scaled,columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
iris_scaled['class'] = y
iris_scaled

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,-0.900681,1.032057,-1.341272,-1.312977,Iris-setosa
1,-1.143017,-0.124958,-1.341272,-1.312977,Iris-setosa
2,-1.385353,0.337848,-1.398138,-1.312977,Iris-setosa
3,-1.506521,0.106445,-1.284407,-1.312977,Iris-setosa
4,-1.021849,1.263460,-1.341272,-1.312977,Iris-setosa
...,...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956,Iris-virginica
146,0.553333,-1.281972,0.705893,0.922064,Iris-virginica
147,0.795669,-0.124958,0.819624,1.053537,Iris-virginica
148,0.432165,0.800654,0.933356,1.447956,Iris-virginica


### One vs. all (Rest)

The One-vs-Rest strategy splits a multi-class classification into multiple binary classification problem per class. 

- Binary Classification Problem 1: class1 vs rest
- Binary Classification Problem 2: class2 vs rest
- Binary Classification Problem 3: class3 vs rest

...

In [37]:
app_df = iris_scaled.copy()
probb={}
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def oneVSAll(df, debug = False):
#     X_list = []
#     y_list = []
    list_class = {}
    for i_class in set(y): # Loop through all class
        
        app_df = df.copy()
        app_df.loc[app_df['class']!=i_class,'class'] = 0
        app_df.loc[app_df['class']==i_class,'class'] = 1

        
        

        clf = svm.SVC(C=1,probability=True)
        X = app_df.drop(columns=['class'])
        Y = app_df['class'].astype('int')
        clf.fit(X, Y)
        y_hat = clf.predict(X)
        
        prob=clf.predict_proba(X)
        probb[i_class]=prob
        
        list_class[i_class]=y_hat
    print(probb)    
    return list_class

result = oneVSAll(iris_scaled)
result


{'Iris-versicolor': array([[9.97171744e-01, 2.82825634e-03],
       [9.92112407e-01, 7.88759347e-03],
       [9.97293845e-01, 2.70615510e-03],
       [9.95686390e-01, 4.31361033e-03],
       [9.97569119e-01, 2.43088091e-03],
       [9.91307238e-01, 8.69276171e-03],
       [9.97449827e-01, 2.55017256e-03],
       [9.96806610e-01, 3.19338998e-03],
       [9.94018414e-01, 5.98158613e-03],
       [9.95046929e-01, 4.95307052e-03],
       [9.95159410e-01, 4.84059015e-03],
       [9.97062175e-01, 2.93782538e-03],
       [9.94848312e-01, 5.15168812e-03],
       [9.97070110e-01, 2.92988958e-03],
       [9.91494142e-01, 8.50585828e-03],
       [9.85442124e-01, 1.45578762e-02],
       [9.94145269e-01, 5.85473076e-03],
       [9.96525408e-01, 3.47459225e-03],
       [9.86141714e-01, 1.38582864e-02],
       [9.96155797e-01, 3.84420255e-03],
       [9.88942239e-01, 1.10577607e-02],
       [9.95613676e-01, 4.38632398e-03],
       [9.98156080e-01, 1.84391967e-03],
       [9.80140616e-01, 1.98593837e-0

{'Iris-versicolor': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'Iris-virginica': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 

#### Thus, combining the three classifiers, we can get the predictied result of multi-classifier.

In [38]:
y_hat=[]
for i in range(len(y)):

    if result['Iris-versicolor'][i]==1:
        
        y_hat.append('Iris-versicolor')

    elif result['Iris-virginica'][i]==1:
        y_hat.append('Iris-virginica')

    elif result['Iris-setosa'][i]==1:
        y_hat.append('Iris-setosa')

pd.DataFrame(y_hat,columns={'y_hat'})

Unnamed: 0,y_hat
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa
...,...
145,Iris-virginica
146,Iris-virginica
147,Iris-virginica
148,Iris-virginica


### One vs. One

The One-vs-One strategy splits a multi-class classification into one binary classification problem per each pair of classes.

Unlike one-vs-rest that splits it into one binary dataset for each class, the one-vs-one approach splits the dataset into one dataset for each class versus every other class.

- Binary Classification Problem 1: class 1 vs. class 2
- Binary Classification Problem 2: class 1 vs. class 3

...

- Binary Classification Problem k-1: class 1 vs. class k
- Binary Classification Problem k: class 2 vs. class 3

...

- Binary Classification Problem k(k-1)/2: class k-1 vs. class k

In [47]:
columns_list = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
iris=pd.read_csv('iris.csv',header=0)
iris.columns=columns_list
y = iris['class']
iris=iris.drop(columns=['class'])

scaled = preprocessing.scale(iris)
iris_scaled = pd.DataFrame(scaled,columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
iris_scaled['class'] = y
iris_scaled


#print(len(set(y)))
y_hat=[]
indexdic={}
kinddic={}
app_df = iris_scaled.copy()
#print(list(set(y))[0])
list_class={}

for i in range(len(set(y))-1):
#     print('i=',i)
    for j in range(i+1,len(set(y))):
#         print('j=',j)
        
        index_i=app_df.loc[app_df['class']==list(set(y))[i]].index
#         print(index_i)
        index_j=app_df.loc[app_df['class']==list(set(y))[j]].index
#         print(index_j)
     #print(index_i.append(index_j))
        
        class1=app_df.loc[index_i]
        class1.loc[class1['class']==list(set(y))[i],'class']=0
        #print(list(set(y))[i],list(set(y))[j])
        class2=app_df.loc[index_j]
        class2.loc[class2['class']==list(set(y))[j],'class']=1
#         print(class1)
#         print(class2)
        
        clf = svm.SVC(C=1,probability=True)
        
        data=pd.concat([class1,class2])
        
        #data=class1.append(class2)
#         print(data)
        
        X = data.drop(columns=['class'])
        
        Y = data['class'].astype('int')
        #print(data)
        clf.fit(X, Y)
        
        #print(index_i.append(index_j))
        y_proba = clf.predict_proba(X)
        
#         X['y_hat']=y_hat
#         testdf = pd.DataFrame(index_i.append(index_j))
#         print(testdf)
#         print(y_hat)
#         testdf["yhat"] = y_hat
#         print(testdf)
        
#         pd.merge(app_df,y_hat,how='left',on=)
        
        list_class[i+j]=y_proba
        indexdic[i+j]=index_i.append(index_j)
        kinddic[i+j]={list(set(y))[i],list(set(y))[j]}
        

print('The class chosed of One vs. One in each binary classifier: ')

print(kinddic)
print('=============================')
print('The index chosed of One vs. One in each binary classifier: ')
print(indexdic)

print('=============================')
print('The probability of each class of One vs. One in each binary classifier: ') 
print(list_class)
       

The class chosed of One vs. One in each binary classifier: 
{1: {'Iris-versicolor', 'Iris-virginica'}, 2: {'Iris-versicolor', 'Iris-setosa'}, 3: {'Iris-setosa', 'Iris-virginica'}}
The index chosed of One vs. One in each binary classifier: 
{1: Int64Index([ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
             63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
             76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
             89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
            102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
            115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
            128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
            141, 142, 143, 144, 145, 146, 147, 148, 149],
           dtype='int64'), 2: Int64Index([50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
            67, 68, 69, 70, 71, 72, 73, 74, 75, 7

#### For this iris case, we combine the results from one vs. one together ( add the probability together) and get the final result.

In [48]:
indexdic[1]

Int64Index([ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
             63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
             76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
             89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
            102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
            115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
            128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
            141, 142, 143, 144, 145, 146, 147, 148, 149],
           dtype='int64')

In [49]:
clf1=pd.DataFrame(list_class[1],columns=kinddic[1],index=indexdic[1])
clf2=pd.DataFrame(list_class[2],columns=kinddic[2],index=indexdic[2])
clf3=pd.DataFrame(list_class[3],columns=kinddic[3],index=indexdic[3])

clf12=pd.merge(clf1,clf2,how='outer',left_index=True,right_index=True)
clf123=pd.merge(clf12,clf3,how='outer',left_index=True, right_index=True)


clf123

Unnamed: 0,Iris-versicolor_x,Iris-virginica_x,Iris-versicolor_y,Iris-setosa_x,Iris-setosa_y,Iris-virginica_y
0,,,0.011472,0.988528,0.014710,0.985290
1,,,0.015713,0.984287,0.017100,0.982900
2,,,0.011735,0.988265,0.016283,0.983717
3,,,0.014149,0.985851,0.017887,0.982113
4,,,0.012427,0.987573,0.014803,0.985197
...,...,...,...,...,...,...
145,0.006669,0.993331,,,0.985758,0.014242
146,0.058772,0.941228,,,0.985482,0.014518
147,0.023150,0.976850,,,0.985432,0.014568
148,0.046941,0.953059,,,0.972237,0.027763


In [50]:
clf123.loc[ pd.isnull(clf123['Iris-versicolor_x'] ),'Iris-versicolor_x']=0
clf123.loc[ pd.isnull(clf123['Iris-virginica_x'] ),'Iris-virginica_x']=0
clf123.loc[ pd.isnull(clf123['Iris-setosa_x'] ),'Iris-setosa_x']=0
clf123.loc[ pd.isnull(clf123['Iris-versicolor_y'] ),'Iris-versicolor_y']=0
clf123.loc[ pd.isnull(clf123['Iris-virginica_y'] ),'Iris-virginica_y']=0
clf123.loc[ pd.isnull(clf123['Iris-setosa_y'] ),'Iris-setosa_y']=0

clf123

Unnamed: 0,Iris-versicolor_x,Iris-virginica_x,Iris-versicolor_y,Iris-setosa_x,Iris-setosa_y,Iris-virginica_y
0,0.000000,0.000000,0.011472,0.988528,0.014710,0.985290
1,0.000000,0.000000,0.015713,0.984287,0.017100,0.982900
2,0.000000,0.000000,0.011735,0.988265,0.016283,0.983717
3,0.000000,0.000000,0.014149,0.985851,0.017887,0.982113
4,0.000000,0.000000,0.012427,0.987573,0.014803,0.985197
...,...,...,...,...,...,...
145,0.006669,0.993331,0.000000,0.000000,0.985758,0.014242
146,0.058772,0.941228,0.000000,0.000000,0.985482,0.014518
147,0.023150,0.976850,0.000000,0.000000,0.985432,0.014568
148,0.046941,0.953059,0.000000,0.000000,0.972237,0.027763


In [51]:
clf123['Iris-versicolor']=clf123['Iris-versicolor_x']+clf123['Iris-versicolor_y']
clf123['Iris-virginica']=clf123['Iris-virginica_x']+clf123['Iris-virginica_y']
clf123['Iris-setosa']=clf123['Iris-setosa_x']+clf123['Iris-setosa_y']
multiclf=clf123.drop(columns={'Iris-versicolor_x','Iris-versicolor_y','Iris-virginica_x',
                              'Iris-virginica_y','Iris-setosa_x','Iris-setosa_y'})


multiclf

Unnamed: 0,Iris-versicolor,Iris-virginica,Iris-setosa
0,0.011472,0.985290,1.003238
1,0.015713,0.982900,1.001386
2,0.011735,0.983717,1.004548
3,0.014149,0.982113,1.003738
4,0.012427,0.985197,1.002376
...,...,...,...
145,0.006669,1.007572,0.985758
146,0.058772,0.955746,0.985482
147,0.023150,0.991418,0.985432
148,0.046941,0.980822,0.972237


In [52]:
# multiclf.loc[ pd.isnull(multiclf['Iris-versicolor'] ),'Iris-versicolor']=0
# multiclf.loc[ pd.isnull(multiclf['Iris-virginica'] ),'Iris-virginica']=0
# multiclf.loc[ pd.isnull(multiclf['Iris-setosa'] ),'Iris-setosa']=0

# multiclf


In [53]:
multiclf['y_hat']=multiclf.idxmax(axis=1)

### The multiclassifer results:

In [54]:
multiclf

Unnamed: 0,Iris-versicolor,Iris-virginica,Iris-setosa,y_hat
0,0.011472,0.985290,1.003238,Iris-setosa
1,0.015713,0.982900,1.001386,Iris-setosa
2,0.011735,0.983717,1.004548,Iris-setosa
3,0.014149,0.982113,1.003738,Iris-setosa
4,0.012427,0.985197,1.002376,Iris-setosa
...,...,...,...,...
145,0.006669,1.007572,0.985758,Iris-virginica
146,0.058772,0.955746,0.985482,Iris-setosa
147,0.023150,0.991418,0.985432,Iris-virginica
148,0.046941,0.980822,0.972237,Iris-virginica


# Question 5

(20 points) Extensively describe the overfitting and
underfitting problem. Use execution examples with a decision
tree and SVM (with or without kernel). Use the scikit
implementations. Show underfitting, good behavior, and overfitting examples.

Solution: 

**Overfitting** occurs when a model is too closely aligned to a limited set of data points. As a result, the model is useful in reference only to its initial data set, and not applicable to any other data sets.

**Underfitting** occurs when a model is unable to learn much from the training data. Therefore, the results of the application of the model in the training data or any other data sets are both not good.

Because iris dataset is too small, I use the titanic data set to show the examples of overfitting, good behavior and underfitting.

In [24]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
df_train = pd.read_csv('train.csv')
df_comb = df_train
X = pd.DataFrame()

def encode_sex(x):
    return 1 if x == 'female' else 0

def family_size(x):
    size = x.SibSp + x.Parch 
    return 4 if size > 3 else size

X['Sex'] = df_comb.Sex.map(encode_sex)
X['Pclass'] = df_comb.Pclass
X['FamilySize'] = df_comb.apply(family_size, axis=1)
fare_median = df_train.groupby(['Sex', 'Pclass']).Fare.median()
fare_median.name = 'FareMedian'

age_mean = df_train.groupby(['Sex', 'Pclass']).Age.mean()
age_mean.name = 'AgeMean'

def join(df, stat):
    return pd.merge(df, stat.to_frame(), left_on=['Sex', 'Pclass'], right_index=True, how='left')

X['Fare'] = df_comb.Fare.fillna(join(df_comb, fare_median).FareMedian)
X['Age'] = df_comb.Age.fillna(join(df_comb, age_mean).AgeMean)
def quantiles(series, num):
    return pd.qcut(series, num, retbins=True)[1]

def discretize(series, bins):
    return pd.cut(series, bins, labels=range(len(bins)-1), include_lowest=True)
    
X['Fare'] = discretize(X.Fare, quantiles(df_comb.Fare, 10))
X['Age'] = discretize(X.Age, quantiles(df_comb.Age, 10))
y=df_train.Survived

## SVM

In [25]:


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=424)

from sklearn import svm
from sklearn.metrics import mean_squared_error,accuracy_score,precision_score,classification_report,confusion_matrix


clf=svm.SVC( C=0.00000001)
clf.fit(X_train,y_train)
y_hat=clf.predict(X_test)
X_hat=clf.predict(X_train)
print('The underfitting situation:')
print('The accuracy in Training data set is:',accuracy_score(y_train,X_hat))
print('The accuracy in Testing data set is:',accuracy_score(y_test,y_hat))
print('The result show that the accuracy in both Training data and Testing data are both low, shows the model is underfitting')


clf=svm.SVC( C=1000)
clf.fit(X_train,y_train)
y_hat=clf.predict(X_test)
X_hat=clf.predict(X_train)
print('The overfitting situation:')
print('The accuracy in Training data set is:',accuracy_score(y_train,X_hat))
print('The accuracy in Testing data set is:',accuracy_score(y_test,y_hat))
print('The result show that the accuracy in Training data is much higher than in test data, that means the model is overfitting')


clf=svm.SVC( C=1)
clf.fit(X_train,y_train)
y_hat=clf.predict(X_test)
X_hat=clf.predict(X_train)
print('The good behavior situation:')
print('The accuracy in Training data set is:',accuracy_score(y_train,X_hat))
print('The accuracy in Testing data set is:',accuracy_score(y_test,y_hat))
print('The result show that the accuracy in Training data is similar to test data, that means the model is in good behavior')

The underfitting situation:
The accuracy in Training data set is: 0.6067415730337079
The accuracy in Testing data set is: 0.6302521008403361
The result show that the accuracy in both Training data and Testing data are both low, shows the model is underfitting
The overfitting situation:
The accuracy in Training data set is: 0.8857677902621723
The accuracy in Testing data set is: 0.8179271708683473
The result show that the accuracy in Training data is much higher than in test data, that means the model is overfitting
The good behavior situation:
The accuracy in Training data set is: 0.8295880149812734
The accuracy in Testing data set is: 0.7815126050420168
The result show that the accuracy in Training data is similar to test data, that means the model is in good behavior


## Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier
clf1 = DecisionTreeClassifier(random_state=0,max_depth=1000,max_leaf_nodes=1000)
clf1.fit(X_train,y_train)
y_hat=clf1.predict(X_test)
X_hat=clf1.predict(X_train)
print('The overfitting situation:')
print('The accuracy in Training data set is:',accuracy_score(y_train,X_hat))
print('The accuracy in Testing data set is:',accuracy_score(y_test,y_hat))
print('The result show that the accuracy in Training data is much higher than in test data, that means the model is overfitting')


clf1 = DecisionTreeClassifier(random_state=0,max_depth=1,max_leaf_nodes=2)
clf1.fit(X_train,y_train)
y_hat=clf1.predict(X_test)
X_hat=clf1.predict(X_train)
print('The underfitting situation:')
print('The accuracy in Training data set is:',accuracy_score(y_train,X_hat))
print('The accuracy in Testing data set is:',accuracy_score(y_test,y_hat))
print('The result show that the accuracy in both Training data and Testing data are both low, shows the model is underfitting')


clf1 = DecisionTreeClassifier(random_state=0,max_depth=10,max_leaf_nodes=25)
clf1.fit(X_train,y_train)
y_hat=clf1.predict(X_test)
X_hat=clf1.predict(X_train)
print('The good behavior situation:')
print('The accuracy in Training data set is:',accuracy_score(y_train,X_hat))
print('The accuracy in Testing data set is:',accuracy_score(y_test,y_hat))
print('The result show that the accuracy in Training data is similar to test data, that means the model is in good behavior')

The overfitting situation:
The accuracy in Training data set is: 0.9194756554307116
The accuracy in Testing data set is: 0.7675070028011205
The result show that the accuracy in Training data is much higher than in test data, that means the model is overfitting
The underfitting situation:
The accuracy in Training data set is: 0.7921348314606742
The accuracy in Testing data set is: 0.7787114845938375
The result show that the accuracy in both Training data and Testing data are both low, shows the model is underfitting
The good behavior situation:
The accuracy in Training data set is: 0.8838951310861424
The accuracy in Testing data set is: 0.8319327731092437
The result show that the accuracy in Training data is similar to test data, that means the model is in good behavior


# Question 6
(10 points) Show examples when the use of
kernel procedure is more efficient in terms of training and
prediction computational time w.r.t. polynomial features
transformation

In [56]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
df_train = pd.read_csv('train.csv')
df_comb = df_train
X = pd.DataFrame()

def encode_sex(x):
    return 1 if x == 'female' else 0

def family_size(x):
    size = x.SibSp + x.Parch 
    return 4 if size > 3 else size

X['Sex'] = df_comb.Sex.map(encode_sex)
X['Pclass'] = df_comb.Pclass
X['FamilySize'] = df_comb.apply(family_size, axis=1)
fare_median = df_train.groupby(['Sex', 'Pclass']).Fare.median()
fare_median.name = 'FareMedian'

age_mean = df_train.groupby(['Sex', 'Pclass']).Age.mean()
age_mean.name = 'AgeMean'

def join(df, stat):
    return pd.merge(df, stat.to_frame(), left_on=['Sex', 'Pclass'], right_index=True, how='left')

X['Fare'] = df_comb.Fare.fillna(join(df_comb, fare_median).FareMedian)
X['Age'] = df_comb.Age.fillna(join(df_comb, age_mean).AgeMean)
def quantiles(series, num):
    return pd.qcut(series, num, retbins=True)[1]

def discretize(series, bins):
    return pd.cut(series, bins, labels=range(len(bins)-1), include_lowest=True)
    
X['Fare'] = discretize(X.Fare, quantiles(df_comb.Fare, 10))
X['Age'] = discretize(X.Age, quantiles(df_comb.Age, 10))
y=df_train.Survived
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=424)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

from sklearn import svm
import time

start = time.time()
clf1=svm.SVC(kernel='poly')
clf1.fit(X_train,y_train)
clf1.predict(X_test)
end = time.time()
print('Time needed for Poly Kernel:',end-start)

# start = time.time()
# clf1=svm.SVC(kernel='rbf')
# clf1.fit(X_train,y_train)
# clf1.predict(X_test)
# end = time.time()
# print(end-start)

# start = time.time()
# clf1=svm.SVC(kernel='linear')
# clf1.fit(X_train,y_train)
# clf1.predict(X_test)
# end = time.time()
# print(end-start)

start = time.time()
clf1=svm.SVC()
clf1.fit(X_train,y_train)
clf1.predict(X_test)
end = time.time()
print('Time needed for none Kernel:',end-start)

Time needed for Poly Kernel: 0.007203102111816406
Time needed for none Kernel: 0.012806177139282227


#### It shows that the use of kernel procedure is more efficient in terms of training and prediction computational time w.r.t. polynomial features transformation.

# Question 7

(10 points) Write a procedure to estimate in the SVC
classifier (in Scikit) the best kernel (RBF, Polynomial,
sigmoid), the best gamma & degree, and the best C. Use the
grid search without implement it. Use the following
reference:

    a. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
    
    b. https://scikit-learn.org/stable/modules/grid_search.html

In [28]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'kernel':( 'rbf','poly','sigmoid'), 
              'C':[0.1,1,10,100],
              'gamma':('scale', 'auto'),
              'degree':[1,5,10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(iris.data, iris.target)


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'degree': [1, 5, 10],
                         'gamma': ('scale', 'auto'),
                         'kernel': ('rbf', 'poly', 'sigmoid')})

In [29]:
#sorted(clf.cv_results_.keys())
clf.best_params_
#help(svm.SVC())

{'C': 10, 'degree': 1, 'gamma': 'scale', 'kernel': 'poly'}

 # Question 8
 (20 points) Create examples to explain the property and the
importance of the following kernels:
    
    a. https://en.wikipedia.org/wiki/Graph_kernel
    
    b. https://en.wikipedia.org/wiki/String_kernel

    c. https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.sigmoid_kernel.html

### Graph Kernel

A graph kernel is a kernel function that computes the inner function of graphs.

### String_kernel

A string kernel is a kernel function that operates on strings.

### sigmoid_kernel

 The sigmoid kernel is also known as hyperbolic tangent, or Multilayer Perceptron, Because it always be used as an activation function.

In [58]:
#X_train and y_train are from titanic dataset.
clf1=svm.SVC(kernel='sigmoid')
clf1.fit(X_train,y_train)
clf1.predict(X_test)

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,