In [1]:
import pandas as pd

In [63]:
import numpy as np

In [2]:
import pathlib as pl

In [3]:
# Folder the Titanic training CSV is read from. The path is relative, so it is
# resolved against the working directory the notebook runs in.
data_folder = pl.Path("data/titanic")

In [10]:
# Load the training data from train.csv inside data_folder.
titanic_train = pd.read_csv(data_folder / "train.csv")

In [12]:
# Drop the "Cabin" column, then discard every row that still has a missing
# value in any of the remaining columns.
# errors="ignore" keeps this cell re-runnable: titanic_train is overwritten
# with its own cleaned version, so on a second run "Cabin" is already gone
# and a plain drop() would raise KeyError.
titanic_train = titanic_train.drop(columns=["Cabin"], errors="ignore").dropna(how="any")

In [14]:
# Recode the numeric survival flag as readable labels: 0 -> "No", 1 -> "Yes".
titanic_train["Survived"] = titanic_train["Survived"].replace(
    to_replace={0: "No", 1: "Yes"}
)

In [26]:
# Expand the one-letter embarkation codes into full port names.
titanic_train["Embarked"] = titanic_train["Embarked"].replace(
    to_replace={
        'C': "Cherbourg",
        'Q': "Queenstown",
        'S': "Southampton",
    }
)

In [35]:
# "Yes" when the passenger's Parch count is above zero, otherwise "No".
titanic_train["Parents or Children on Board"] = (
    (titanic_train["Parch"] > 0).map({False: "No", True: "Yes"})
)

In [36]:
# "Yes" when the passenger's SibSp count is above zero, otherwise "No".
titanic_train["Siblings or Spouse on Board"] = (
    (titanic_train["SibSp"] > 0).map({False: "No", True: "Yes"})
)

In [None]:
# Embarked port codes: C = Cherbourg, Q = Queenstown, S = Southampton

In [33]:
titanic_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Parents on Board
0,1,No,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Southampton,False
1,2,Yes,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,Cherbourg,False
2,3,Yes,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Southampton,False
3,4,Yes,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,Southampton,False
4,5,No,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Southampton,False
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,No,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,Queenstown,True
886,887,No,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Southampton,False
887,888,Yes,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,Southampton,False
889,890,Yes,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,Cherbourg,False


In [29]:
def get_train_sample(df, feature, how_many=20, random_state=None):
    """Return a small random sample of ``df`` showing survival next to one feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "Survived" column and the ``feature`` column.
    feature : str
        Name of the column to show alongside "Survived".
    how_many : int, default 20
        Number of rows to draw. If ``df`` has fewer rows, all of them are
        returned instead of raising.
    random_state : int or numpy random generator, optional
        Forwarded to ``DataFrame.sample`` so that a draw can be reproduced.
        The default (None) gives a fresh random draw on every call.

    Returns
    -------
    pandas.DataFrame
        Two columns ("Survived", ``feature``), sorted by ``feature`` with a
        fresh 0..n-1 index.
    """
    # sample() without replacement raises when asked for more rows than exist,
    # so cap the request at the frame's length.
    n_rows = min(how_many, len(df))
    picked = df.sample(n_rows, random_state=random_state)
    return picked[["Survived", feature]].sort_values(feature).reset_index(drop=True)

In [31]:
get_train_sample(titanic_train, "Age")

Unnamed: 0,Survived,Age
0,Yes,1.0
1,Yes,1.0
2,No,2.0
3,Yes,3.0
4,No,18.0
5,Yes,22.0
6,No,25.0
7,No,26.0
8,No,28.0
9,Yes,28.0


In [37]:
# Folder the per-feature sample CSVs are written to (relative to the
# working directory the notebook runs in).
sample_folder = pl.Path("data/samples")

In [40]:
# Columns used as predictor inputs and exported as sample files.
all_features = [
    "Age",
    "Sex",
    "Fare",
    "Embarked",
    "Parents or Children on Board",
    "Siblings or Spouse on Board",
]

In [38]:
# Write one sorted random sample per feature to sample_folder/<feature>.csv.
# Create the output folder first: to_csv() does not create missing
# directories, so this cell would otherwise fail on a fresh checkout.
sample_folder.mkdir(parents=True, exist_ok=True)
for feature in all_features:
    get_train_sample(titanic_train, feature).to_csv(sample_folder / f"{feature}.csv")

In [43]:
def get_sample_for_predction(df, features, random_state=None):
    """Draw one random row from ``df`` and split it into inputs and label.

    The misspelled function name is kept so existing callers continue to work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``features`` columns and a "Survived" column.
    features : list of str
        Columns to return as predictor inputs.
    random_state : int or numpy random generator, optional
        Forwarded to ``DataFrame.sample`` so that a draw can be reproduced.
        The default (None) gives a fresh random draw on every call.

    Returns
    -------
    tuple
        ``(inputs, label)``: a one-row DataFrame restricted to ``features``
        and the matching one-element "Survived" Series, sharing the same index.
    """
    sample = df.sample(1, random_state=random_state)
    return sample[features], sample["Survived"]
    

In [46]:
# Draw one random passenger. pred_sample is a tuple:
# (one-row frame with the all_features columns, one-element "Survived" Series).
pred_sample = get_sample_for_predction(titanic_train, all_features)

In [56]:
pred_sample[0]["Age"].values[0]

0.83

In [48]:
pred_sample[1]

831    Yes
Name: Survived, dtype: object

In [60]:
class Predictor:
    """Base class for a predictor that looks at a single feature of a sample.

    Subclasses override ``predict`` to turn that feature's value into a
    prediction.
    """

    def __init__(self, feature_name):
        # Name of the column this predictor reads from a sample.
        self.feature_name = feature_name

    def _feature_value(self, sample):
        """Return the value of ``self.feature_name`` in the first row of ``sample``."""
        return sample[self.feature_name].values[0]

    def predict(self, sample):
        """Return the prediction for ``sample``; subclasses must override this.

        Raises
        ------
        NotImplementedError
            Always, on the base class.
        """
        raise NotImplementedError(f"{type(self).__name__} must implement predict()")


class CategoricalPredictor(Predictor):
    """Predictor that maps each category of a feature to a fixed prediction."""

    def __init__(self, feature_name, prediction_dict):
        super().__init__(feature_name)
        # Mapping: category value -> prediction returned for that category.
        self.prediction_dict = prediction_dict

    def predict(self, sample):
        """Return the prediction stored for the sample's category.

        Raises
        ------
        KeyError
            If the sample's category has no entry in ``prediction_dict``.
        """
        return self.prediction_dict[self._feature_value(sample)]


class ContinuousPredictor(Predictor):
    """Predictor that thresholds a numeric feature at a single split point."""

    def __init__(self, feature_name, split_point, lower_pred, higher_pred):
        super().__init__(feature_name)
        self.split_point = split_point
        # Returned when the feature value is <= split_point.
        self.lower_pred = lower_pred
        # Returned when the feature value is > split_point.
        self.higher_pred = higher_pred

    def predict(self, sample):
        """Return ``lower_pred`` if the feature is <= ``split_point``, else ``higher_pred``."""
        if self._feature_value(sample) <= self.split_point:
            return self.lower_pred
        return self.higher_pred
    

In [128]:
# Predictors whose outputs predict_collection() averages into one decision.
# Only the "Sex" predictor is active; the commented-out entries are alternative
# predictors that can be re-enabled by uncommenting them.
predictor_collection = [
    CategoricalPredictor(
        feature_name="Sex",
        prediction_dict={"female": 5/8, "male": 1/12},
    ),
#     CategoricalPredictor(
#         "Embarked",
#         {
#             "Cherbourg": 0.75,
#             "Queenstown": 0.5,
#             "Southampton": 0.5
#         }
#     ),
#     ContinuousPredictor(
#         "Age",
#         split_point=32,
#         lower_pred=0.5,
#         higher_pred=0.33
#     ),
#     ContinuousPredictor(
#         "Fare",
#         split_point=15,
#         lower_pred=0,
#         higher_pred=9/13
#     )
]

In [112]:
def predict_collection(a_pred_collection, a_sample, threshold=0.5):
    """Combine several predictors into one "Yes"/"No" decision for a sample.

    Parameters
    ----------
    a_pred_collection : iterable
        Objects exposing ``predict(features)`` that return a number.
    a_sample : tuple
        ``(features, label)`` pair; only ``a_sample[0]`` is passed to the
        predictors.
    threshold : float, default 0.5
        Minimum mean prediction that is reported as "Yes".

    Returns
    -------
    str
        "Yes" if the mean of all predictions is >= ``threshold``, else "No".

    Raises
    ------
    ValueError
        If ``a_pred_collection`` is empty. The mean of no predictions is NaN,
        which would otherwise be silently reported as "No".
    """
    predictions = [predictor.predict(a_sample[0]) for predictor in a_pred_collection]
    if not predictions:
        raise ValueError("a_pred_collection must contain at least one predictor")
    return "Yes" if np.mean(predictions) >= threshold else "No"

In [125]:
def get_sample_from_row(df, features, i):
    """Return the row at position ``i`` of ``df`` as an ``(inputs, label)`` pair.

    ``inputs`` is a one-row DataFrame limited to the ``features`` columns and
    ``label`` is the matching one-element "Survived" Series.
    """
    # A list indexer keeps the result a one-row DataFrame rather than a Series.
    single_row = df.iloc[[i]]
    inputs = single_row[features]
    label = single_row["Survived"]
    return inputs, label

In [129]:
# Score every row of titanic_train with the predictor collection and count
# how many predicted labels match the recorded "Survived" label.
n_correct = 0
for i in range(len(titanic_train)):
    # sample is an (inputs, label) pair for the row at position i.
    sample = get_sample_from_row(titanic_train, all_features, i)
    prediction = predict_collection(predictor_collection, sample)
    # sample[1] is a one-element Series; compare against its single value.
    if prediction == sample[1].values[0]:
        n_correct += 1

In [131]:
# Fraction of titanic_train rows whose predicted label matched "Survived".
n_correct/len(titanic_train)

0.7794943820224719