### Import necessary libraries and read data

In [1]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Feature Engineering

#### check if we need data or not (if there is some types of Ticket we should consider this column and it could affect our result)

In [2]:
df['Ticket'].head(20)

0            A/5 21171
1             PC 17599
2     STON/O2. 3101282
3               113803
4               373450
5               330877
6                17463
7               349909
8               347742
9               237736
10             PP 9549
11              113783
12           A/5. 2151
13              347082
14              350406
15              248706
16              382652
17              244373
18              345763
19                2649
Name: Ticket, dtype: object

#### remove columns that have no effect on the final result

In [3]:
df.drop('Name',inplace = True,axis = 1)
df.drop('Ticket',inplace = True,axis = 1)
df.drop('PassengerId',inplace = True,axis = 1)

#### check in which columns we have nan datas and if true , how many they are

In [4]:
df['Survived'].isnull().values.any()

False

In [5]:
df['Pclass'].isnull().values.any()

False

In [6]:
df['Age'].isnull().values.any()

True

##### here we have 177 datas out of 891 without age,i use interpolate to fill them

In [7]:
df['Age'].isna().sum()

177

In [8]:
df['Age'] = df['Age'].interpolate()

In [9]:
df['SibSp'].isnull().values.any()

False

In [10]:
df['Parch'].isnull().values.any()

False

In [11]:
df['Sex'].isnull().values.any()

False

In [12]:
df['Fare'].isnull().values.any()

False

In [13]:
df['Cabin'].isnull().values.any()

True

##### most of cabin values are missing so we should drop this column too

In [14]:
df['Cabin'].isna().sum()

687

In [15]:
df.drop('Cabin',inplace = True,axis = 1)

##### change sexes values to 1 as male and 0 as female

In [16]:
df['Sex'] = df['Sex'].replace({"male": 1, "female": 0})

In [17]:
df['Embarked'].isnull().values.any()

True

##### number of missing values in Embarked column is 2 so i used one-hotting (change categorical data to numeric)

In [18]:
df['Embarked'].isna().sum()

2

In [19]:
df = pd.get_dummies(df)

##### fare range is from 0 to almost 513 so we can't normalize it.

In [20]:
df['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [21]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.2500,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.9250,0,0,1
3,1,1,0,35.0,1,0,53.1000,0,0,1
4,0,3,1,35.0,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,0,0,1
887,1,1,0,19.0,0,0,30.0000,0,0,1
888,0,3,0,22.5,1,2,23.4500,0,0,1
889,1,1,1,26.0,0,0,30.0000,1,0,0


#### initialize target and features data and split our train data to train and valid

In [22]:
X = np.array(df[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked_C','Embarked_Q','Embarked_S']])
y = np.array(df['Survived'])

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.20)

#### read test data and see its format

In [24]:
dftest = pd.read_csv("test.csv")
dftest.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


#### like before we should clean it

In [25]:
dftest.drop('Name',inplace = True,axis = 1)
dftest.drop('Ticket',inplace = True,axis = 1)
dftest.drop('PassengerId',inplace = True,axis = 1)
dftest.drop('Cabin',inplace = True,axis = 1)

In [26]:
dftest['Pclass'].isnull().values.any()

False

In [27]:
dftest['Age'].isnull().values.any()

True

In [28]:
dftest['Age'] = dftest['Age'].interpolate()

In [29]:
dftest['Pclass'].isnull().values.any()

False

In [30]:
dftest['Sex'].isnull().values.any()

False

In [31]:
dftest['SibSp'].isnull().values.any()

False

In [32]:
dftest['Parch'].isnull().values.any()

False

In [33]:
dftest['Fare'].isnull().values.any()

True

In [34]:
avg = dftest['Fare'].mean()
dftest['Fare'] = dftest['Fare'].fillna(avg)

In [35]:
dftest['Embarked'].isnull().values.any()

False

In [36]:
dftest['Sex'] = dftest['Sex'].replace({"male": 1, "female": 0})

In [37]:
dftest = pd.get_dummies(dftest)
dftest

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0000,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1
...,...,...,...,...,...,...,...,...,...
413,3,1,33.5,0,0,8.0500,0,0,1
414,1,0,39.0,0,0,108.9000,1,0,0
415,3,1,38.5,0,0,7.2500,0,0,1
416,3,1,38.5,0,0,8.0500,0,0,1


### KNN

#### make an array with size of train data to use in dist matrix to get the indexes of train data

In [38]:
x = list(range(0,712))
x = np.array(x)

#### define dist function to compute Euclidean distant

In [39]:
import math
def dist(x,y,length):
    d = 0
    for i in range(length):
        d += pow(x[i]-y[i],2)
    return math.sqrt(d)
    

#### define knn function: first make an empty array of size 712 to store distance of each training data and test data.
#### then pass x array as col_train that we made before as indexes .then in a for loop we compute the distance and append it to the col_dist
#### then append col_dist to alldist matrix as distances

In [40]:
def knn(X_train,point,k):
    alldist = np.empty((712, 0), float)
    col_train = x
    alldist = np.append(alldist, np.array([x]).transpose(), axis=1)
    col_dist = []
    for i in range(len(X_train)):
        col_dist.append(dist(X_train[i],point,len(X_train[i])))
        
    alldist = np.append(alldist, np.array([col_dist]).transpose(), axis=1)
    return alldist

#### define X_test with wanted series

In [41]:
X_test = np.array(dftest[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked_C','Embarked_Q','Embarked_S']])
print(len(X_test))
print(len(X_train))

418
712


#### now we want to validate our model,we should use knn function to compute the distances for each X_valid and get the distance matrix.
#### then we should sort our matrix to see the minimum distances (nearest neighbours) at top.
#### then for a wanted k (that it should be odd) we'll find the maximum count answer(that it's y_train for corresponding X_train)
#### we should use method np.bincount to get that(for example: if we have 1 1 0 1 0 as y_train for nearest neighbours then answer is 1 cuz of the most frequent item is 1!)

In [42]:
k = 5
ypredknnval = []
for i in range(len(X_valid)):
    distpoint = knn(X_train,X_valid[i],5)
    sortedArr = distpoint[distpoint[:,1].argsort()] 
    y = []
    y_predknn = 0
    for j in range(k):
        y.append(y_train[int(sortedArr[:k,0][j])])
        
    y_predknn = np.bincount(y).argmax()
    ypredknnval.append(y_predknn)
    print('validpred ',i,' real_val: ',y_valid[i],' , predicted_val: ',y_predknn)


validpred  0  real_val:  1  , predicted_val:  1
validpred  1  real_val:  1  , predicted_val:  0
validpred  2  real_val:  0  , predicted_val:  0
validpred  3  real_val:  1  , predicted_val:  0
validpred  4  real_val:  0  , predicted_val:  0
validpred  5  real_val:  0  , predicted_val:  0
validpred  6  real_val:  0  , predicted_val:  0
validpred  7  real_val:  0  , predicted_val:  0
validpred  8  real_val:  1  , predicted_val:  0
validpred  9  real_val:  1  , predicted_val:  0
validpred  10  real_val:  1  , predicted_val:  0
validpred  11  real_val:  1  , predicted_val:  1
validpred  12  real_val:  0  , predicted_val:  0
validpred  13  real_val:  0  , predicted_val:  0
validpred  14  real_val:  1  , predicted_val:  1
validpred  15  real_val:  0  , predicted_val:  0
validpred  16  real_val:  0  , predicted_val:  0
validpred  17  real_val:  0  , predicted_val:  0
validpred  18  real_val:  1  , predicted_val:  1
validpred  19  real_val:  1  , predicted_val:  0
validpred  20  real_val:  1  ,

#### loss function to check the validation if it's small,it's ok

In [43]:
print(abs(np.mean(y_valid-ypredknnval)))

0.055865921787709494


#### do the same thing like before for test.

In [44]:
k = 5
ypredknntest = []
for i in range(len(X_test)):
    distpoint = knn(X_train,X_test[i],5)
    sortedArr = distpoint[distpoint[:,1].argsort()]  
    y = []
    y_predknn = 0
    for j in range(k):
        y.append(y_train[int(sortedArr[:k,0][j])])
        
    y_predknn = np.bincount(y).argmax()
    ypredknntest.append(y_predknn)
    print('testpred ',i,' : ',y_predknn)


testpred  0  :  0
testpred  1  :  0
testpred  2  :  0
testpred  3  :  0
testpred  4  :  0
testpred  5  :  1
testpred  6  :  0
testpred  7  :  1
testpred  8  :  0
testpred  9  :  1
testpred  10  :  0
testpred  11  :  1
testpred  12  :  1
testpred  13  :  0
testpred  14  :  1
testpred  15  :  1
testpred  16  :  0
testpred  17  :  0
testpred  18  :  1
testpred  19  :  0
testpred  20  :  1
testpred  21  :  0
testpred  22  :  0
testpred  23  :  1
testpred  24  :  1
testpred  25  :  0
testpred  26  :  1
testpred  27  :  0
testpred  28  :  1
testpred  29  :  0
testpred  30  :  1
testpred  31  :  1
testpred  32  :  0
testpred  33  :  1
testpred  34  :  1
testpred  35  :  0
testpred  36  :  0
testpred  37  :  0
testpred  38  :  0
testpred  39  :  1
testpred  40  :  0
testpred  41  :  0
testpred  42  :  0
testpred  43  :  0
testpred  44  :  1
testpred  45  :  0
testpred  46  :  1
testpred  47  :  0
testpred  48  :  1
testpred  49  :  1
testpred  50  :  1
testpred  51  :  0
testpred  52  :  1
tes

#### define a new column to show the predicted values by knn algo

In [45]:
dftest['KnnSurvived'] = np.array(ypredknntest)

In [46]:
dftest

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,KnnSurvived
0,3,1,34.5,0,0,7.8292,0,1,0,0
1,3,0,47.0,1,0,7.0000,0,0,1,0
2,2,1,62.0,0,0,9.6875,0,1,0,0
3,3,1,27.0,0,0,8.6625,0,0,1,0
4,3,0,22.0,1,1,12.2875,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
413,3,1,33.5,0,0,8.0500,0,0,1,0
414,1,0,39.0,0,0,108.9000,1,0,0,1
415,3,1,38.5,0,0,7.2500,0,0,1,0
416,3,1,38.5,0,0,8.0500,0,0,1,0


### SVM

#### from sklearn we import needed functions to fit our model and predict the x_valid at first for validation.
#### compute the loss function for validation
#### then try to know the accuracy of training and validating.(that we see they are almost in a range)

In [47]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train,y_train)
y_pred = model.predict(X_valid)

print('loss for valid data: ')
print(abs(np.mean(y_pred - y_valid)))

print('accuracy score(training): ',model.score(X_train,y_train))
print(f'accuracy score(validation): ',model.score(X_valid,y_valid))

loss for valid data: 
0.24022346368715083
accuracy score(training):  0.6811797752808989
accuracy score(validation):  0.6703910614525139


#### try to predict using predict method in SVM and print it

In [48]:
ypred_test = model.predict(X_test)
print(ypred_test)

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1
 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 1 0 0 0 1 0 0 1 0 0 0]


#### define a new column to show the predicted values by SVM algo

In [49]:
dftest['SvmSurvived'] = ypred_test.tolist()
dftest

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,KnnSurvived,SvmSurvived
0,3,1,34.5,0,0,7.8292,0,1,0,0,0
1,3,0,47.0,1,0,7.0000,0,0,1,0,0
2,2,1,62.0,0,0,9.6875,0,1,0,0,0
3,3,1,27.0,0,0,8.6625,0,0,1,0,0
4,3,0,22.0,1,1,12.2875,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
413,3,1,33.5,0,0,8.0500,0,0,1,0,0
414,1,0,39.0,0,0,108.9000,1,0,0,1,1
415,3,1,38.5,0,0,7.2500,0,0,1,0,0
416,3,1,38.5,0,0,8.0500,0,0,1,0,0
