In [154]:
import pandas as pd
import numpy as np

In [155]:
train_df = pd.read_csv(
    "/Users/rahulanil/garchomp/projects/kaggle/titanic/data/train.csv"
)
test_df = pd.read_csv("/Users/rahulanil/garchomp/projects/kaggle/titanic/data/test.csv")

In [156]:
import titanic_preprocessing as tp

test_df_passengeId = test_df["PassengerId"]
train_df, test_df = tp.generic_perprocessing(train_df, test_df)

In [157]:
X = train_df.loc[:, train_df.columns != "Survived"]
y = train_df["Survived"]

In [158]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Survived = y
# OneHotEncoding: Pclass, Sex, Embarked, Initials
# StandardScalar: Age, SibSp, Parch, Fare, Family_size

ct = ColumnTransformer(
    [
        (
            "onehot",
            OneHotEncoder(sparse=False),
            ["Pclass", "Sex", "Embarked", "Initials"],
        ),
        (
            "StandardScaler",
            StandardScaler(),
            ["Age", "SibSp", "Parch", "Fare", "Family_size"],
        ),
    ]
)

ct.fit(X)
X_ct = ct.transform(X)
print(f"X_ct shpae: {X_ct.shape}")

test = ct.transform(test_df)
print(f"X_test shpae: {X_test.shape}")

X_ct shpae: (889, 21)
X_test shpae: (223, 21)


In [159]:
ct.get_feature_names_out()

array(['onehot__Pclass_1', 'onehot__Pclass_2', 'onehot__Pclass_3',
       'onehot__Sex_female', 'onehot__Sex_male', 'onehot__Embarked_C',
       'onehot__Embarked_Q', 'onehot__Embarked_S', 'onehot__Initials_Dr',
       'onehot__Initials_F_N', 'onehot__Initials_M_N',
       'onehot__Initials_Master', 'onehot__Initials_Miss',
       'onehot__Initials_Mr', 'onehot__Initials_Mrs',
       'onehot__Initials_Special', 'StandardScaler__Age',
       'StandardScaler__SibSp', 'StandardScaler__Parch',
       'StandardScaler__Fare', 'StandardScaler__Family_size'],
      dtype=object)

In [160]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_ct, y)
print(f"train score: {clf.score(X_ct, y)}")

train score: 0.8706411698537683


In [161]:
scores = cross_val_score(clf, X_ct, y, cv=5)
print(f"scores: {scores}")
print(f"scores mean: {scores.mean()}")

scores: [0.7752809  0.76966292 0.86516854 0.80898876 0.80225989]
scores mean: 0.8042722021202311


In [162]:
from sklearn.model_selection import GridSearchCV

param_grid = {"n_neighbors": [i for i in range(1, 10)]}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, return_train_score=True)

In [163]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_ct, y, random_state=0)

grid_search.fit(X_train, y_train)
print(f"test scores: {grid_search.score(X_test, y_test)}")
print(f"Best parametesrs: {grid_search.best_params_}")
print(f"best cross validation score: {grid_search.best_score_}")

test scores: 0.7713004484304933
Best parametesrs: {'n_neighbors': 9}
best cross validation score: 0.8393558523173607


In [164]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_ct, y)
print(clf.score(X_ct, y))

y_test = pd.Series(clf.predict(test).astype(int), name="Survived")
result = pd.concat([test_df_passengeId, y_test], axis=1)
result.to_csv("KNN.csv", index=False)

display(result)

0.8706411698537683


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
