<a href="https://colab.research.google.com/github/ShivM99/Python/blob/main/Titanic_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
#Importing the training dataset
import pandas as pd
train = pd.read_csv (r"titanic_train.csv")

In [28]:
#Seggregating the features and the target for 'titanic_train'
x = train.iloc[:, [2, 4, 5, 6, 7, 9, 11]].values
x_df = pd.DataFrame (x, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
print ("\nFeatures:\n", x_df)


Features:
     Pclass     Sex   Age SibSp Parch     Fare Embarked
0        3    male  22.0     1     0     7.25        S
1        1  female  38.0     1     0  71.2833        C
2        3  female  26.0     0     0    7.925        S
3        1  female  35.0     1     0     53.1        S
4        3    male  35.0     0     0     8.05        S
..     ...     ...   ...   ...   ...      ...      ...
886      2    male  27.0     0     0     13.0        S
887      1  female  19.0     0     0     30.0        S
888      3  female   NaN     1     2    23.45        S
889      1    male  26.0     0     0     30.0        C
890      3    male  32.0     0     0     7.75        Q

[891 rows x 7 columns]


In [29]:
y = train.iloc[:, 1].values
y_df = pd.DataFrame (y, columns = ['Survived'])
print ("\nTarget:\n", y_df)


Target:
      Survived
0           0
1           1
2           1
3           1
4           0
..        ...
886         0
887         1
888         0
889         1
890         0

[891 rows x 1 columns]


In [30]:
#Handling the missing values
#One hot encoding the categorical features
print ("\nMissing values in different features:\n", x_df.isnull().sum())
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


Missing values in different features:
 Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64


In [31]:
#ColumnTransformer (transformers = [("name_of_object", object_creation, [columns]), ("name_of_object", object_creation, [columns]), ...], remainder = "passthrough")
ct = ColumnTransformer (transformers = [("si_mode", SimpleImputer (missing_values = np.nan, strategy = "most_frequent"), [2]), ("si_mean", SimpleImputer (missing_values = np.nan, strategy = "mean"), [-2]),  ("encoder", OneHotEncoder(), [0, 1])], remainder = "passthrough")
x = ct.fit_transform (x)

In [32]:
#Pipeline (steps = [("name_of_object", object_creation), ("name_of_object", object_creation), ...])
pipe = Pipeline (steps = [("si_mode", SimpleImputer (missing_values = np.nan, strategy = "most_frequent")), ("encoder", OneHotEncoder())])
embarked = ColumnTransformer (transformers = [("pipe", pipe, [-1])], remainder = "passthrough")
x = embarked.fit_transform (x)
x_df_new = pd.DataFrame (x)
print ("\nPreprocessed features:\n", x_df_new)


Preprocessed features:
       0    1    2     3        4    5    6    7    8    9  10 11
0    0.0  0.0  1.0  22.0     7.25  0.0  0.0  1.0  0.0  1.0  1  0
1    1.0  0.0  0.0  38.0  71.2833  1.0  0.0  0.0  1.0  0.0  1  0
2    0.0  0.0  1.0  26.0    7.925  0.0  0.0  1.0  1.0  0.0  0  0
3    0.0  0.0  1.0  35.0     53.1  1.0  0.0  0.0  1.0  0.0  1  0
4    0.0  0.0  1.0  35.0     8.05  0.0  0.0  1.0  0.0  1.0  0  0
..   ...  ...  ...   ...      ...  ...  ...  ...  ...  ... .. ..
886  0.0  0.0  1.0  27.0     13.0  0.0  1.0  0.0  0.0  1.0  0  0
887  0.0  0.0  1.0  19.0     30.0  1.0  0.0  0.0  1.0  0.0  0  0
888  0.0  0.0  1.0  24.0    23.45  0.0  0.0  1.0  1.0  0.0  1  2
889  1.0  0.0  0.0  26.0     30.0  1.0  0.0  0.0  0.0  1.0  0  0
890  0.0  1.0  0.0  32.0     7.75  0.0  0.0  1.0  0.0  1.0  0  0

[891 rows x 12 columns]


In [33]:
#Splitting the dataset into training & test datasets not done as the dataset is already splitted
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size = 0.2, random_state = 0)

In [34]:
#Standardizing the dataset
from sklearn.preprocessing import StandardScaler
sc = StandardScaler ()
x_train[:, [3, 11]] = sc.fit_transform (x_train[:, [3, 11]])
x_test[:, [3, 11]] = sc.transform (x_test[:, [3, 11]])
x_train_df = pd.DataFrame (x_train)
print ("\nStandardised features:\n", x_train_df)


Standardised features:
       0    1    2         3        4    5    6    7    8    9  10        11
0    1.0  0.0  0.0 -0.346969  15.2458  0.0  0.0  1.0  1.0  0.0  0  1.959264
1    0.0  0.0  1.0  0.180174     10.5  0.0  1.0  0.0  0.0  1.0  0  -0.47741
2    1.0  0.0  0.0  0.180174  37.0042  0.0  1.0  0.0  0.0  1.0  1  0.740927
3    1.0  0.0  0.0 -0.648193   4.0125  0.0  0.0  1.0  0.0  1.0  0  -0.47741
4    0.0  0.0  1.0 -0.572887     7.25  0.0  0.0  1.0  0.0  1.0  0  -0.47741
..   ...  ...  ...       ...      ...  ...  ...  ...  ...  ... ..       ...
707  1.0  0.0  0.0  0.782623  83.1583  1.0  0.0  0.0  1.0  0.0  1  0.740927
708  0.0  0.0  1.0 -0.723499   7.8542  0.0  0.0  1.0  1.0  0.0  1  -0.47741
709  0.0  1.0  0.0 -0.346969   7.7333  0.0  0.0  1.0  0.0  1.0  0  -0.47741
710  0.0  0.0  1.0  0.556705     17.4  0.0  0.0  1.0  1.0  0.0  1  -0.47741
711  0.0  0.0  1.0  2.364052     39.0  0.0  1.0  0.0  0.0  1.0  1  0.740927

[712 rows x 12 columns]


In [35]:
#Building the KNN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier (n_neighbors = 5, weights = "uniform", algorithm = "auto", p =2, metric = "minkowski")
knn.fit (x_train, y_train)
y_pred = knn.predict (x_test)

In [36]:
#Checking the accuracy of KNN model
from sklearn.metrics import accuracy_score
print ("\nAccuracy of the KNN model:", accuracy_score (y_test, y_pred)*100)


Accuracy of the KNN model: 75.97765363128491


In [37]:
#Cross-validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = knn, X = x_train, y = y_train, cv = 10)
print ("\nMean accuracy:", accuracies.mean()*100)


Mean accuracy: 73.73826291079813


In [38]:
#Grid search
from sklearn.model_selection import GridSearchCV
h_parameters = [{"n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "weights": ["uniform", "distance"]}]
grid = GridSearchCV (estimator = knn, param_grid = h_parameters, scoring = "accuracy", n_jobs = -1, cv = 10)
grid.fit (x_train, y_train)
best_hparameters = grid.best_params_
best_accuracy = grid.best_score_
print ("\nBest hyper-parameters:\n", best_hparameters)
print ("\nBest accuracy:", best_accuracy*100)


Best hyper-parameters:
 {'n_neighbors': 3, 'weights': 'uniform'}

Best accuracy: 76.53951486697966


In [39]:
#Importing the testing dataset
test = pd.read_csv (r"titanic_test.csv")
x_new = test.iloc[:, [1, 3, 4, 5, 6, 8, 10]].values
x_df = pd.DataFrame (x_new, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
print ("\nFeatures:\n", x_df)


Features:
     Pclass     Sex   Age SibSp Parch     Fare Embarked
0        3    male  34.5     0     0   7.8292        Q
1        3  female  47.0     1     0      7.0        S
2        2    male  62.0     0     0   9.6875        Q
3        3    male  27.0     0     0   8.6625        S
4        3  female  22.0     1     1  12.2875        S
..     ...     ...   ...   ...   ...      ...      ...
413      3    male   NaN     0     0     8.05        S
414      1  female  39.0     0     0    108.9        C
415      3    male  38.5     0     0     7.25        S
416      3    male   NaN     0     0     8.05        S
417      3    male   NaN     1     1  22.3583        C

[418 rows x 7 columns]


In [40]:
print ("\nMissing values in different features:\n", x_df.isnull().sum())


Missing values in different features:
 Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [41]:
#Handling the missing values
#One hot encoding the categorical features
x_new = ct.transform (x_new)

In [42]:
x_new = embarked.transform (x_new)
x_df_new = pd.DataFrame (x_new)
print ("\nPreprocessed features:\n", x_df_new)


Preprocessed features:
       0    1    2     3        4    5    6    7    8    9  10 11
0    0.0  1.0  0.0  34.5   7.8292  0.0  0.0  1.0  0.0  1.0  0  0
1    0.0  0.0  1.0  47.0      7.0  0.0  0.0  1.0  1.0  0.0  1  0
2    0.0  1.0  0.0  62.0   9.6875  0.0  1.0  0.0  0.0  1.0  0  0
3    0.0  0.0  1.0  27.0   8.6625  0.0  0.0  1.0  0.0  1.0  0  0
4    0.0  0.0  1.0  22.0  12.2875  0.0  0.0  1.0  1.0  0.0  1  1
..   ...  ...  ...   ...      ...  ...  ...  ...  ...  ... .. ..
413  0.0  0.0  1.0  24.0     8.05  0.0  0.0  1.0  0.0  1.0  0  0
414  1.0  0.0  0.0  39.0    108.9  1.0  0.0  0.0  1.0  0.0  0  0
415  0.0  0.0  1.0  38.5     7.25  0.0  0.0  1.0  0.0  1.0  0  0
416  0.0  0.0  1.0  24.0     8.05  0.0  0.0  1.0  0.0  1.0  0  0
417  1.0  0.0  0.0  24.0  22.3583  0.0  0.0  1.0  0.0  1.0  1  1

[418 rows x 12 columns]


In [43]:
#Standardizing the dataset
x_new[:, [3, 11]] = sc.transform (x_new[:, [3, 11]])
x_test_df = pd.DataFrame (x_new)
print ("\nStandardised features:\n", x_test_df)


Standardised features:
       0    1    2         3        4    5    6    7    8    9  10        11
0    0.0  1.0  0.0  0.443746   7.8292  0.0  0.0  1.0  0.0  1.0  0  -0.47741
1    0.0  0.0  1.0  1.385072      7.0  0.0  0.0  1.0  1.0  0.0  1  -0.47741
2    0.0  1.0  0.0  2.514664   9.6875  0.0  1.0  0.0  0.0  1.0  0  -0.47741
3    0.0  0.0  1.0  -0.12105   8.6625  0.0  0.0  1.0  0.0  1.0  0  -0.47741
4    0.0  0.0  1.0 -0.497581  12.2875  0.0  0.0  1.0  1.0  0.0  1  0.740927
..   ...  ...  ...       ...      ...  ...  ...  ...  ...  ... ..       ...
413  0.0  0.0  1.0 -0.346969     8.05  0.0  0.0  1.0  0.0  1.0  0  -0.47741
414  1.0  0.0  0.0  0.782623    108.9  1.0  0.0  0.0  1.0  0.0  0  -0.47741
415  0.0  0.0  1.0   0.74497     7.25  0.0  0.0  1.0  0.0  1.0  0  -0.47741
416  0.0  0.0  1.0 -0.346969     8.05  0.0  0.0  1.0  0.0  1.0  0  -0.47741
417  1.0  0.0  0.0 -0.346969  22.3583  0.0  0.0  1.0  0.0  1.0  1  0.740927

[418 rows x 12 columns]


In [44]:
lr = KNeighborsClassifier (n_neighbors = 3, weights = 'uniform', algorithm = 'auto', p =2, metric = 'minkowski')
lr.fit (x, y)
y_pred = lr.predict (x_new)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,