<a href="https://www.kaggle.com/code/andrejzuba/titanic-competition-solution-quick-0-8-result?scriptVersionId=109841333" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#importing custom library
!pip install git+https://github.com/Vrboska/mofr@master

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import random
import mofr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             f1_score, precision_score,
                             recall_score, roc_auc_score)
from xgboost import XGBClassifier, plot_tree

import xgboost as xgb

In [None]:
seed=1234

In [None]:
# Load the three competition files from the Kaggle input mount.
# These absolute paths only exist inside a Kaggle kernel with the `titanic` dataset attached.
train=pd.read_csv("/kaggle/input/titanic/train.csv")
test=pd.read_csv("/kaggle/input/titanic/test.csv")
# Loaded for inspection only; not used by the modelling cells visible in this notebook.
gender_submission=pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [None]:
train.head()

In [None]:
test.head()

In [None]:
gender_submission.head()

# Data Exploration

In [None]:
train.info()
print('--------------------')
test.info()

In [None]:
train.describe()

In [None]:
test.describe()

In [None]:
train.describe(include=['O'])

In [None]:
test.describe(include=['O'])

# Predictor checking

In [None]:
train['Survived'].value_counts(dropna=False, normalize=True)

In [None]:
train.columns

In [None]:
train['Survived'].value_counts()

In [None]:
train['Embarked'].value_counts()

In [None]:
from mofr.basic_evaluators.HistogramContinuous import HistogramContinuousEvaluator

hcoe=HistogramContinuousEvaluator()

In [None]:
# Histogram of Age restricted to rows where Age is present:
# filling NaN with -1 and keeping rows != -1 drops the missing ages.
# NOTE(review): .d() / .pc() presumably set the data frame and predictor column on the
# evaluator — confirm against the mofr documentation.
hcoe.d(train[train['Age'].fillna(-1)!=-1]).pc('Age')
hcoe.get_graph()
hcoe.get_table()
# Last expression of the cell, so the evaluator's table is what gets displayed.
hcoe.table


In [None]:
# Same histogram for Fare, again keeping only rows where the value is present
# (NaN is mapped to -1 and then filtered out). Reuses the `hcoe` evaluator instance.
hcoe.d(train[train['Fare'].fillna(-1)!=-1]).pc('Fare')
hcoe.get_graph()

In [None]:
from mofr.basic_evaluators.TargetAssociationContinuous import TargetAssociationContinuousEvaluator

# Constant column of ones, passed to the evaluators below alongside the target.
# NOTE(review): it looks like mofr uses it as a weight / observation-count column — confirm.
# This mutates `train` in place; the column is reused by later cells.
train['one']=1
tacoe=TargetAssociationContinuousEvaluator()
# Missing values are replaced by -1 before the data is handed to the evaluator,
# and the relationship between Parch and Survived is plotted.
tacoe.d(train.fillna(-1)).t([('Survived', 'one')]).pc('Parch').tc('one')
tacoe.get_graph()

In [None]:
from mofr.basic_evaluators.ROCCurve import ROCCurveEvaluator

# ROC curve using the raw Fare value as the score for Survived.
# Relies on train['one'] created in the previous cell; NaN is replaced by -1 first.
rce=ROCCurveEvaluator()
rce.d(train.fillna(-1)).t([('Survived', 'one')]).s(['Fare'])
rce.get_graph()

# Data transformations

In [None]:
#extracting title from name
def name_trans(x):
    """Reduce a passenger name to one of five title buckets.

    The name is scanned for the substrings 'Mr.', 'Mrs.', 'Miss.' and 'Master.'
    in that order; the first one found is returned. Names containing none of
    them (Dr., Rev., Don., ...) fall into 'Other'.

    Parameters
    ----------
    x : str
        Full passenger name, e.g. 'Braund, Mr. Owen Harris'.

    Returns
    -------
    str
        'Mr.', 'Mrs.', 'Miss.', 'Master.' or 'Other'.
    """
    # Order matters only for names that contain more than one title; it mirrors
    # the priority of the original if/elif chain.
    for title in ('Mr.', 'Mrs.', 'Miss.', 'Master.'):
        if title in x:
            return title
    return 'Other'

In [None]:
train['Name'].apply(name_trans).value_counts()

In [None]:
# Attach the coarse title bucket derived from Name to both data sets.
for frame in (train, test):
    frame['Title'] = frame['Name'].apply(name_trans)

In [None]:
# Numeric sex indicator: 1.0 for 'male', 0.0 for anything else (including missing).
train['Male_flag'] = (train['Sex'] == 'male').astype(float)
test['Male_flag'] = (test['Sex'] == 'male').astype(float)

In [None]:
# Ticket count and Cabin count predictors: how many passengers of the same data set
# share the row's ticket / cabin.
def _add_group_count(frame, key, count_col):
    """Return `frame` with `count_col` = number of rows in `frame` sharing the row's `key`.

    The count is taken over the 'Sex' column, as in the original feature definition
    (assumes Sex is fully populated — TODO confirm). Rows whose `key` is missing get
    NaN: groupby drops NaN keys, so the left join finds no match for them.

    Any existing `count_col` is dropped first so re-running the cell does not create
    suffixed duplicate columns.
    """
    counts = (
        frame.groupby(key)['Sex']
        .count()
        .reset_index()
        .rename(columns={'Sex': count_col})
    )
    base = frame.drop(columns=[count_col], errors='ignore')
    # Explicit join key instead of relying on "all common columns"; row order of the
    # left frame is preserved by how='left'.
    return base.merge(counts, on=key, how='left')


# Counts are computed separately per data set (train rows never see test rows).
# The original cell merged Ticket_count into train twice; once is enough.
train = _add_group_count(train, 'Ticket', 'Ticket_count')
test = _add_group_count(test, 'Ticket', 'Ticket_count')
train = _add_group_count(train, 'Cabin', 'Cabin_count')
test = _add_group_count(test, 'Cabin', 'Cabin_count')

In [None]:
# Interaction predictor: Age multiplied by Fare (NaN wherever either input is missing).
train['Age_Fare'] = train['Age'].mul(train['Fare'])
test['Age_Fare'] = test['Age'].mul(test['Fare'])

In [None]:
train.columns

## Categorical transformations

In [None]:
import category_encoders as ce

In [None]:
cat_preds = ['Title', 'Embarked']

In [None]:
# Bayesian (smoothed) target encoding of the categorical predictors.
# The encoder is fitted on the training labels only and then applied to both sets.
# Note that training rows are encoded with statistics derived from their own labels,
# so the BAYES_* columns are in-sample on train.
encoder = ce.TargetEncoder(min_samples_leaf=1, smoothing=1.0)
# Only the fitted state is needed here; the transformed output of fit_transform was
# previously computed and thrown away.
encoder.fit(train[cat_preds], train['Survived'])

# Write the encoded columns by name instead of concatenating new columns, so that
# re-running this cell overwrites BAYES_* rather than appending duplicates.
for frame in (train, test):
    encoded = encoder.transform(frame[cat_preds])
    for col in cat_preds:
        frame['BAYES_' + col] = encoded[col]

# Fitting models

In [None]:
# Constant column of ones on both sets. It is appended to the predictor list of the
# logistic regression further down, where it acts as the intercept term.
# (train['one'] already exists from the exploration section; this re-sets it.)
train['one']=1
test['one']=1

In [None]:
train.columns

In [None]:
# Predictors fed to the models: raw numeric columns plus the engineered ones
# (group counts, sex flag, target-encoded categoricals, Age*Fare interaction).
col_preds=[
 'Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Ticket_count',
 'Cabin_count',
 'Male_flag',
 'BAYES_Title',
 'BAYES_Embarked',
 'Age_Fare'    
]

In [None]:
# Pairwise correlations between the predictors on the training set.
corr_figure, ax = plt.subplots(figsize=(10, 10))
predictor_corr = train[col_preds].corr()
fig = sns.heatmap(predictor_corr, ax=ax, annot=True, cbar=True, cmap="Blues")
ax.set_title("Correlations - training set")
plt.show()

In [None]:
# Univariate strength of every predictor: absolute gini of the raw column against
# Survived, with missing values replaced by the sentinel -999.
results = [
    (col, np.abs(mofr.metrics.gini(train['Survived'], train[col].fillna(-999))))
    for col in col_preds
]

gini_table = pd.DataFrame(results, columns=['Predictor', 'GINI'])
gini_table.sort_values(by='GINI', ascending=False)

## xgb_model

In [None]:
# Approach taken: refit XGBoost on many different random train/validation splits of the
# development rows and average the predicted probabilities into FINAL_SCORE_AVG.
# (Author's note: this did not bring much success.)
whole_lifts=[]
whole_ginis=[]

train_lifts=[]
train_ginis=[]

valid_lifts=[]
valid_ginis=[]

DEV_ROWS = 700   # rows [0:DEV_ROWS) are split into train/valid; later rows are never fitted on
N_SPLITS = 25    # number of random splits / models that get averaged

# n = number of models already folded into the running mean. It must start at 0:
# starting at 1 over a zero-initialised column yields sum/(N_SPLITS+1) instead of the
# true mean, which biases every averaged score downwards.
n = 0
train['FINAL_SCORE_AVG'] = 0.0
test['FINAL_SCORE_AVG'] = 0.0

for random_seed in range(N_SPLITS):
    X_train, X_valid, y_train, y_valid = train_test_split(
        train[0:DEV_ROWS], train['Survived'][0:DEV_ROWS], test_size=0.2, random_state=random_seed
    )
    # Work on explicit copies: columns are written to the split frames below.
    X_train = X_train.copy()
    X_valid = X_valid.copy()

    xgb_model = XGBClassifier(max_depth=3, seed=random_seed, colsample_bytree=1, gamma=1, min_child_weight=5, n_estimators=20)
    xgb_model.fit(X_train.loc[:, col_preds], X_train['Survived'])

    # Score each data set once per model and reuse the result for the running mean.
    train['FINAL_SCORE'] = xgb_model.predict_proba(train[col_preds])[:, 1]
    test['FINAL_SCORE'] = xgb_model.predict_proba(test[col_preds])[:, 1]

    train['FINAL_SCORE_AVG'] = (train['FINAL_SCORE_AVG']*n + train['FINAL_SCORE'])/(n+1)
    test['FINAL_SCORE_AVG'] = (test['FINAL_SCORE_AVG']*n + test['FINAL_SCORE'])/(n+1)
    n += 1

    # The split frames are row subsets of `train`, so their scores are looked up by index.
    # FINAL_SCORE_AVG on a split frame is therefore the average over ALL models fitted so
    # far, restricted to the rows of the current split (this is what the previous code
    # did implicitly, because the column travelled along in the train[0:700] slice).
    for part in (X_train, X_valid):
        part['FINAL_SCORE'] = train.loc[part.index, 'FINAL_SCORE']
        part['FINAL_SCORE_AVG'] = train.loc[part.index, 'FINAL_SCORE_AVG']

    # Per-model metrics (single-model FINAL_SCORE, not the average).
    whole_lifts.append(mofr.metrics.lift(train['Survived'], train['FINAL_SCORE']))
    whole_ginis.append(mofr.metrics.gini(train['Survived'], train['FINAL_SCORE']))

    train_lifts.append(mofr.metrics.lift(X_train['Survived'], X_train['FINAL_SCORE']))
    train_ginis.append(mofr.metrics.gini(X_train['Survived'], X_train['FINAL_SCORE']))

    valid_lifts.append(mofr.metrics.lift(X_valid['Survived'], X_valid['FINAL_SCORE']))
    valid_ginis.append(mofr.metrics.gini(X_valid['Survived'], X_valid['FINAL_SCORE']))

In [None]:
np.std(valid_ginis)

In [None]:
# Metrics of the seed-averaged XGBoost score (FINAL_SCORE_AVG).
# X_train / X_valid are the frames left over from the LAST split of the loop above.
# Their rows were part of the fitting sample for other seeds, so the "valid" figures
# on the averaged score are optimistic; the hold-out ("oot") figures are the honest ones.
print('The Lift on the train set is: '+ str(mofr.metrics.lift(X_train['Survived'], X_train['FINAL_SCORE_AVG'])))
print('The gini on the train set is: '+ str(mofr.metrics.gini(X_train['Survived'], X_train['FINAL_SCORE_AVG'])))
print('\n')
print('The Lift on the valid set is: '+ str(mofr.metrics.lift(X_valid['Survived'], X_valid['FINAL_SCORE_AVG'])))
print('The gini on the valid set is: '+ str(mofr.metrics.gini(X_valid['Survived'], X_valid['FINAL_SCORE_AVG'])))
print('\n')
# Rows from position 700 onward were never used for fitting. Slice to the end of the
# frame: the previous [700:-1] silently dropped the last hold-out row.
oot = train[700:]
print('The Lift on the oot set is: '+ str(mofr.metrics.lift(oot['Survived'], oot['FINAL_SCORE_AVG'])))
print('The gini on the oot set is: '+ str(mofr.metrics.gini(oot['Survived'], oot['FINAL_SCORE_AVG'])))

In [None]:
# Turn the averaged probability into a 0/1 label with a 0.5 cut-off
# and show the resulting class shares.
test['Survived'] = (test['FINAL_SCORE_AVG'] > 0.5).astype(int)
test['Survived'].value_counts(normalize=True)

In [None]:
# Persist the XGBoost (seed-averaged) predictions.
# NOTE: test['Survived'] is overwritten further down by the logistic-regression labels.
test[['PassengerId', 'Survived']].to_csv('Prediction_xgb_avg.csv', index=False)

In [None]:
# Horizontal bar chart of the feature importances of `xgb_model`, i.e. of the model
# fitted in the last loop iteration only, sorted ascending.
sorted_idx = xgb_model.feature_importances_.argsort()
order_ = [col_preds[i] for i in sorted_idx]

plt.figure(figsize=(5, 9))
fig = plt.barh(order_, xgb_model.feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
plt.show()

## Logistic regression model

In [None]:
col_preds

In [None]:
import statsmodels.api as sm

# Logistic regression predictors: the model predictors plus the constant column
# 'one', which serves as the intercept.
lr_cols=col_preds+['one']

# Simple mean imputation. The fill value is the TRAIN mean of each column, computed
# once before anything is filled and then applied to both sets, so test is imputed
# with exactly the same value as train (no dependence on statement order).
# This mutates train/test in place, so it must run after the XGBoost section,
# which is fitted on the columns with their missing values intact.
for col in lr_cols:
    col_mean = train[col].mean()
    train[col] = train[col].fillna(col_mean)
    test[col] = test[col].fillna(col_mean)

In [None]:
# Same repeated random-split scheme as for XGBoost, but the per-split scores are not
# averaged; only the per-split lift/gini values are collected.
whole_lifts, whole_ginis = [], []
train_lifts, train_ginis = [], []
valid_lifts, valid_ginis = [], []

for random_seed in range(100):
    X_train, X_valid, y_train, y_valid = train_test_split(
        train, train['Survived'], test_size=0.2, random_state=random_seed
    )

    # `result` is rebound on every pass; after the loop it holds the fit of the last split.
    result = sm.Logit(y_train, X_train[lr_cols]).fit()

    # Score the whole set and both split parts, then record the metrics of each.
    scored_frames = (
        (train, whole_lifts, whole_ginis),
        (X_train, train_lifts, train_ginis),
        (X_valid, valid_lifts, valid_ginis),
    )
    for frame, lifts, ginis in scored_frames:
        frame['FINAL_SCORE'] = result.predict(frame[lr_cols])
        lifts.append(mofr.metrics.lift(frame['Survived'], frame['FINAL_SCORE']))
        ginis.append(mofr.metrics.gini(frame['Survived'], frame['FINAL_SCORE']))

In [None]:
np.std(valid_ginis)

In [None]:
# Average lift / gini over all random splits of the logistic regression.
# (whole_lifts / whole_ginis are collected as well but intentionally not reported.)
print('The Lift on the train set is: ', np.mean(train_lifts), sep='')
print('The gini on the train set is: ', np.mean(train_ginis), sep='')
print('\n')
print('The Lift on the valid set is: ', np.mean(valid_lifts), sep='')
print('The gini on the valid set is: ', np.mean(valid_ginis), sep='')

In [None]:
# Score the test set with `result`, which at this point is the model fitted on the
# last random split of the loop above (random_seed=99); the 100 fits are not averaged.
test['FINAL_SCORE'] = result.predict(test[lr_cols])

In [None]:
# Probability cut-off for predicting "survived".
threshold=0.6577 #found after some experimenting and submitting a few times

In [None]:
(test['FINAL_SCORE']>threshold).value_counts(normalize=True)

In [None]:
# Final 0/1 label from the logistic-regression score; replaces the XGBoost labels
# previously stored in test['Survived'].
test['Survived'] = (test['FINAL_SCORE'] > threshold).astype(int)

In [None]:
# Persist the logistic-regression predictions under a descriptive name.
test[['PassengerId', 'Survived']].to_csv('Prediction_lr.csv', index=False)

In [None]:
# Same logistic-regression predictions again, written as submission.csv
# (presumably the file name picked up for the competition submission — confirm).
test[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)