In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
rf = pd.read_csv("Titanic-Dataset.csv")

rf['Sex'] = rf['Sex'].replace({'male': 0, 'female': 1})

rf['Age'].fillna(rf['Age'].median(), inplace=True)


rf = pd.get_dummies(rf, columns=['Embarked'], drop_first=True)

rf.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)


print(rf.isna().sum())

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_Q    0
Embarked_S    0
dtype: int64


In [3]:
rf[['Embarked_Q','Embarked_S']] = rf[['Embarked_Q','Embarked_S']].astype(int)
rf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,1
1,1,1,1,38.0,1,0,71.2833,0,0
2,1,3,1,26.0,0,0,7.925,0,1
3,1,1,1,35.0,1,0,53.1,0,1
4,0,3,0,35.0,0,0,8.05,0,1


In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scle_col = ['Age','Fare']
rf[scle_col] = scaler.fit_transform(rf[scle_col])
rf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,-0.565736,1,0,-0.502445,0,1
1,1,1,1,0.663861,1,0,0.786845,0,0
2,1,3,1,-0.258337,0,0,-0.488854,0,1
3,1,1,1,0.433312,1,0,0.42073,0,1
4,0,3,0,0.433312,0,0,-0.486337,0,1


In [5]:
scle_col2 = ['Pclass','Sex','SibSp','Parch','Age','Fare','Embarked_Q','Embarked_S']
rf[scle_col2] = scaler.fit_transform(rf[scle_col2])
rf.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,0.827377,-0.737695,-0.565736,0.432793,-0.473674,-0.502445,-0.307562,0.619306
1,1,-1.566107,1.355574,0.663861,0.432793,-0.473674,0.786845,-0.307562,-1.61471
2,1,0.827377,1.355574,-0.258337,-0.474545,-0.473674,-0.488854,-0.307562,0.619306
3,1,-1.566107,1.355574,0.433312,0.432793,-0.473674,0.42073,-0.307562,0.619306
4,0,0.827377,-0.737695,0.433312,-0.474545,-0.473674,-0.486337,-0.307562,0.619306


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
X = rf.drop('Survived', axis=1)
Y = rf['Survived']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)       
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
Y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1])

In [7]:
knn.score(X_test,Y_test)

0.8100558659217877

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
clf = RandomizedSearchCV(
    knn,
    param_distributions={'n_neighbors': np.arange(1, 150, 2)},
    n_iter=25,   
    cv=5,
    scoring='accuracy',
    random_state=42
)
clf.fit(X, Y)
print("Best parameters:", clf.best_params_)
print("Best cross-validation score:", clf.best_score_)
print("Best estimator:", clf.best_estimator_)


Best parameters: {'n_neighbors': np.int64(21)}
Best cross-validation score: 0.809189630280585
Best estimator: KNeighborsClassifier(n_neighbors=np.int64(21))


In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
X = rf.drop('Survived', axis=1)
Y = rf['Survived']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)       
knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)


In [11]:
knn.score(X_test, Y_test)

0.7932960893854749