<a href="https://colab.research.google.com/github/RP272/Hands-On-ML/blob/main/Classification_Titanic_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unzip the dataset and load the data

In [1]:
!unzip titanic.zip -d /content/titanic
%cd /content/titanic

Archive:  titanic.zip
  inflating: /content/titanic/gender_submission.csv  
  inflating: /content/titanic/test.csv  
  inflating: /content/titanic/train.csv  
/content/titanic


In [6]:
import pandas as pd
import os

def _load_titanic_csv(dataset_path, filename):
  """Read ``filename`` located in ``dataset_path`` into a DataFrame."""
  return pd.read_csv(os.path.join(dataset_path, filename))


def load_titanic_train_dataset(dataset_path):
  """Load the training split of the Titanic dataset.

  Args:
    dataset_path: Directory that contains ``train.csv``.

  Returns:
    A pandas DataFrame with the contents of ``train.csv``.
  """
  return _load_titanic_csv(dataset_path, "train.csv")


def load_titanic_test_dataset(dataset_path):
  """Load the test split of the Titanic dataset.

  Args:
    dataset_path: Directory that contains ``test.csv``.

  Returns:
    A pandas DataFrame with the contents of ``test.csv``.
  """
  return _load_titanic_csv(dataset_path, "test.csv")

In [32]:
# Directory the archive was extracted into by the unzip step.
dir_path = "/content/titanic"

# Read both CSV splits into DataFrames using the loader helpers defined above.
train_set = load_titanic_train_dataset(dir_path)
test_set = load_titanic_test_dataset(dir_path)

# Get data description. Clean the data

In [65]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [33]:
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
train_set.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


The training set is missing the *Age* value for 177 passengers (only 714 of 891 entries are non-null). These missing values will be filled with the median value in the transform pipeline.

In [None]:
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
test_set.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


The test set has the same problem: it has 86 missing values for the *Age* feature and 1 missing value for the *Fare* feature. These missing values will be filled by the transform pipeline.

The other part of data cleaning concerns the text columns. I don't think the *Name* column carries any real value, so it will be dropped. The *Ticket* and *Cabin* columns will also be dropped. The *Sex* column has 2 possible values (male and female) and the *Embarked* column has 3 possible values, so both will be encoded using the one-hot technique.

In [111]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Columns fed to the model. Columns not listed here are not passed to any
# transformer (ColumnTransformer's default is remainder="drop").
num_attribs = ["Pclass", "Age", "Fare"]
cat_attribs = ["Embarked", "Sex"]

# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: the training set has missing Embarked values, so impute
# the most frequent category before one-hot encoding. Without this step the
# encoder either rejects the NaN or encodes it as an extra category,
# depending on the scikit-learn version.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder()),
])

# Apply each branch to its own columns and concatenate the results.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [112]:
# Fit the preprocessing pipeline on the training data and transform it in one step.
titanic_train_prepared = full_pipeline.fit_transform(train_set)
# Target labels: the "Survived" column of the training set.
y_train = train_set["Survived"]

# KNeighborsClassifier

In [113]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Baseline: KNN with default hyperparameters, scored by 10-fold cross-validated accuracy.
knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(knn_clf, titanic_train_prepared, y_train, cv=10, scoring="accuracy")
# Report the mean accuracy over the folds.
print(knn_scores.mean())

0.8204744069912608


In [114]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: every combination of neighbor weighting and neighbor count.
param_grid = [
    {'weights': ["uniform", "distance"], 'n_neighbors': [3, 5, 7, 11]}
]

# NOTE(review): this search uses cv=3 while the baseline cross_val_score call
# uses cv=10, so the accuracies are not directly comparable — confirm intended.
grid_search = GridSearchCV(knn_clf, param_grid, cv=3, scoring="accuracy")
grid_search.fit(titanic_train_prepared, y_train)

cvres = grid_search.cv_results_
print(grid_search.best_params_)

# Print the mean cross-validated accuracy next to each parameter combination.
for accuracy, params in zip(cvres["mean_test_score"], cvres["params"]):
  print(accuracy, params)

{'n_neighbors': 5, 'weights': 'uniform'}
0.797979797979798 {'n_neighbors': 3, 'weights': 'uniform'}
0.7934904601571269 {'n_neighbors': 3, 'weights': 'distance'}
0.8013468013468014 {'n_neighbors': 5, 'weights': 'uniform'}
0.7934904601571269 {'n_neighbors': 5, 'weights': 'distance'}
0.8013468013468014 {'n_neighbors': 7, 'weights': 'uniform'}
0.7934904601571268 {'n_neighbors': 7, 'weights': 'distance'}
0.7923681257014591 {'n_neighbors': 11, 'weights': 'uniform'}
0.7901234567901234 {'n_neighbors': 11, 'weights': 'distance'}


In [116]:
# Best model found by the grid search.
best_knn = grid_search.best_estimator_

# Apply the already-fitted preprocessing to the test set (transform only, no refit).
test_prepared = full_pipeline.transform(test_set)
y_pred = best_knn.predict(test_prepared)

# Build the submission from the test set's own PassengerId column instead of
# assuming the ids start at a fixed number and are contiguous and ordered.
submission = pd.DataFrame({
    "PassengerId": test_set["PassengerId"].to_numpy(),
    "Survived": y_pred,
})
# index=False keeps the file to the two columns "PassengerId,Survived".
submission.to_csv("final.csv", index=False)

The KNN model achieved an accuracy of 0.74401 on the test dataset.