In [61]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

In [99]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [100]:
train_data = train_data.set_index("PassengerId")
y = train_data.Survived
train_data = train_data.drop(columns=["Survived", "Ticket"]) #Nu cred ca tichetele conteaza, dar vom vedea
test_data = test_data.drop(columns=["Ticket"])

test_data = test_data.set_index("PassengerId")

In [101]:
train_data["Fare"] = train_data["Fare"].fillna(train_data["Fare"].dropna().median())
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].dropna().median())

train_data["Age"] = train_data["Age"].fillna(train_data["Age"].dropna().median())
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].dropna().median())

train_data.loc[train_data['Sex'] == 'male', 'Sex'] = 0
train_data.loc[train_data['Sex'] == 'female', 'Sex'] = 1

test_data.loc[test_data['Sex'] == 'male', 'Sex'] = 0
test_data.loc[test_data['Sex'] == 'female', 'Sex'] = 1

In [102]:
#feature engineering
def extract_title_from_name(name):
    name = name.strip()
    left, right = name.split(',')
    title = right.split()[0].strip('.')
    if title == 'Ms':
        return 'Miss'
    if title in ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Mlle', 'Mme', 'the']:
        return "Rare"
    return title 

def extract_family_name(name):
    return name.split(',')[0]

extract_title_from_name("Hewlett, Mrs. (Mary D Kingcome) ")
train_data["Title"] = train_data.Name.map(extract_title_from_name)
test_data["Title"] = test_data.Name.map(extract_title_from_name)

train_data["AgeRange"] = pd.cut(train_data.Age, bins = [0, 12, 20, 40, 120], labels = ["Children", "Teenagers", "Adult", "Elder"])
test_data["AgeRange"] = pd.cut(test_data.Age, bins = [0, 12, 20, 40, 120], labels = ["Children", "Teenagers", "Adult", "Elder"])

train_data["FareRange"] = pd.cut(train_data.Fare, bins = [0, 7.91, 14.45, 31, 120], labels = ["Low", "Median", "Average", "High"])
test_data["FareRange"] = pd.cut(test_data.Fare, bins = [0, 7.91, 14.45, 31, 120], labels = ["Low", "Median", "Average", "High"])

train_data["FamilySize"] = train_data["SibSp"] + train_data["Parch"] + 1
test_data["FamilySize"] = test_data["SibSp"] + test_data["Parch"] + 1

train_data['LastName'] = train_data['Name'].apply(lambda x: x.split(',')[0])
test_data['LastName'] = test_data['Name'].apply(lambda x: x.split(',')[0])

train_data['FamilyGroup'] = train_data['LastName'] + "_" + train_data['FamilySize'].astype(str)
test_data['FamilyGroup'] = test_data['LastName'] + "_" + test_data['FamilySize'].astype(str)

family_cabin_mapping = train_data.dropna(subset=['Cabin']).groupby('FamilyGroup')['Cabin'].agg(lambda x:x.value_counts().index[0])
train_data['Cabin'] = train_data.apply(lambda x: family_cabin_mapping.get(x['FamilyGroup'], x['Cabin']), axis=1)
test_data['Cabin'] = test_data.apply(lambda x: family_cabin_mapping.get(x['FamilyGroup'], x['Cabin']), axis=1)

train_data["FamilySizeCateg"] = pd.cut(train_data.FamilySize, bins = [0, 2, 4, 120], labels = ["Small", "Medium", "Large"])
test_data["FamilySizeCateg"] = pd.cut(test_data.FamilySize, bins = [0, 2, 4, 120], labels = ["Small", "Medium", "Large"])




train_data = train_data.drop(columns=["Age", "Fare", "Parch", "SibSp", "Name", "LastName", "FamilySize", "Sex"])
test_data = test_data.drop(columns=["Age", "Fare", "Parch", "SibSp", "Name", "LastName", "FamilySize", "Sex"])

In [66]:
train_data

Unnamed: 0_level_0,Pclass,Cabin,Embarked,Title,AgeRange,FareRange,FamilyGroup
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,,S,Mr,Adult,Low,Braund_2
2,1,C85,C,Mrs,Adult,High,Cumings_2
3,3,,S,Miss,Adult,Median,Heikkinen_1
4,1,C123,S,Mrs,Adult,High,Futrelle_2
5,3,B5,S,Mr,Adult,Median,Allen_1
...,...,...,...,...,...,...,...
887,2,,S,Rare,Adult,Median,Montvila_1
888,1,B42,S,Miss,Teenagers,Average,Graham_1
889,3,,S,Miss,Adult,Average,Johnston_4
890,1,C148,C,Mr,Adult,Average,Behr_1


In [103]:
#la cabin vom inlocui doar cu litera
train_data.Cabin = train_data.Cabin.map(lambda x: x[0], na_action = "ignore")
test_data.Cabin = test_data.Cabin.map(lambda x: x[0], na_action = "ignore")

# train_data.Cabin = train_data.Cabin.fillna('U')
# test_data.Cabin = test_data.Cabin.fillna('U')

In [104]:
#Sex, Cabin, Embarked -> SimpleImputer -> OneHotEncoded
n_estimators = 2000
learning_rate = 0.01

categorical_columns = [x for x in train_data.columns if train_data[x].dtype == "object"]
categorical_transformer = Pipeline(steps=[("imputing", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore"))])
column_transformer = ColumnTransformer(transformers=[("cat_trans", categorical_transformer, categorical_columns)])
#my_pipeline = Pipeline(steps=[("preprocessing", column_transformer), ("model", XGBRegressor(n_estimators = n_estimators, learning_rate = learning_rate))])
#my_pipeline = Pipeline(steps=[("preprocessing", column_transformer), ("model", MultinomialNB())])
my_pipeline = Pipeline(steps=[("preprocessing", column_transformer), ("model", MLPClassifier(hidden_layer_sizes=500, learning_rate="adaptive", solver="lbfgs", activation="identity"))])
#my_pipeline = Pipeline(steps=[("preprocessing", column_transformer), ("model", MLPClassifier(hidden_layer_sizes=500, activation="identity"))])

In [105]:
X_train, X_test, y_train, y_test = train_test_split(train_data, y)
my_pipeline.fit(X_train, y_train)

In [34]:
predictions = my_pipeline.predict(X_test)
bool_predictions = []
for pred in predictions:
    if pred < 0.5:
        bool_predictions.append(0)
    else:
        bool_predictions.append(1)
score = accuracy_score(y_test, predictions)
score

0.9910313901345291

In [106]:
cross_score = cross_val_score(my_pipeline, train_data, y)
mean = sum(list(cross_score))/len(list(cross_score))
mean

0.8171050153788212

In [107]:
my_pipeline.fit(train_data, y)
predictions = my_pipeline.predict(test_data)
# bool_predictions = []
# for pred in predictions:
#     if pred < 0.5:
#         bool_predictions.append(0)
#     else:
#         bool_predictions.append(1)
raspuns = pd.Series(data=predictions, index=test_data.index, name="Survived")
raspuns.to_csv("raspuns.csv")

In [91]:
train_data

Unnamed: 0_level_0,Pclass,Cabin,Embarked,Title,AgeRange,FareRange,FamilyGroup
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,,S,Mr,Adult,Low,Braund_2
2,1,C,C,Mrs,Adult,High,Cumings_2
3,3,,S,Miss,Adult,Median,Heikkinen_1
4,1,C,S,Mrs,Adult,High,Futrelle_2
5,3,B,S,Mr,Adult,Median,Allen_1
...,...,...,...,...,...,...,...
887,2,,S,Rare,Adult,Median,Montvila_1
888,1,B,S,Miss,Teenagers,Average,Graham_1
889,3,,S,Miss,Adult,Average,Johnston_4
890,1,C,C,Mr,Adult,Average,Behr_1
