In [2]:
import pandas as pd
import os
import sklearn
from sklearn import ensemble 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
from sklearn import feature_selection
from sklearn import neighbors
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from feature_engine.selection import DropDuplicateFeatures
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
sklearn.set_config(display="diagram")
#pip install feature_engine

In [3]:
dir = "C:/Users/pc/Downloads/ai-level1/titanic"
titanic_train = pd.read_csv(os.path.join(dir, "train.csv"))
titanic_train.shape
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
titanic_train['Dummy'] = 10

In [5]:
#create title feature from name
def extract_title(name):
     return name.split(',')[1].split('.')[0].strip()
titanic_train['Title'] = titanic_train['Name'].map(extract_title)
titanic_train['Title'] = titanic_train['Title'].replace(['Lady', 'the Countess', 'Countess','Capt', 'Col','Don', 
                                                 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
titanic_train['Title'] = titanic_train['Title'].replace('Mlle', 'Miss')
titanic_train['Title'] = titanic_train['Title'].replace('Ms', 'Miss')
titanic_train['Title'] = titanic_train['Title'].replace('Mme', 'Mrs')

In [6]:
#create family size feature from sibsp, parch
titanic_train['FamilySize'] = titanic_train['SibSp'] +  titanic_train['Parch'] + 1
def convert_familysize(size):
    if(size == 1): 
        return 'Single'
    elif(size <= 4):
        return 'Small'
    elif(size <= 6):
        return 'Medium'
    else: 
        return 'Large'
titanic_train['FamilyGroup'] = titanic_train['FamilySize'].map(convert_familysize)

In [7]:
cat_features = ['Embarked', 'Sex', 'Pclass', 'Title', 'FamilyGroup']
steps = [('cat_imp', impute.SimpleImputer(strategy="most_frequent")), ('ohe', preprocessing.OneHotEncoder())]
cat_pipe = pipeline.Pipeline(steps)

In [8]:
ord_features = ['Age', 'Fare']
steps = [('cont_imp', impute.SimpleImputer()),('binning', preprocessing.KBinsDiscretizer(n_bins=7, encode='ordinal', strategy='quantile'))]
ord_pipe = pipeline.Pipeline(steps)

In [9]:
pass_features = ['Dummy']
pre_pipe = compose.ColumnTransformer(
    [ ("categorical", cat_pipe, cat_features), ("ordinal", ord_pipe, ord_features), ("pass", "passthrough", pass_features) ]
)

In [10]:
features_pipe = pipeline.FeatureUnion([
                    ('stats_selector', feature_selection.SelectKBest() ),
                    ('rf_selector', feature_selection.SelectFromModel(ensemble.RandomForestClassifier()) )
                ])

In [11]:
pipe = pipeline.Pipeline([  
                    ('preprocess', pre_pipe),
                    ('zv_filter', feature_selection.VarianceThreshold()),
                    ('features', features_pipe),
                    ('de-duplicated', DropDuplicateFeatures()),
                    ('bst', ensemble.GradientBoostingClassifier())
                ])
pipe

In [12]:
X_train = titanic_train
y_train = titanic_train['Survived']
pipe_grid = {'features__stats_selector__k': [3,4,5], 'features__rf_selector__threshold': [0.02,0.03], 
             'bst__n_estimators':[10, 20] }
cv = model_selection.KFold(10)
clf = model_selection.GridSearchCV(pipe, pipe_grid, cv=cv, scoring='accuracy',return_train_score=True)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.best_index_)
print(clf.best_estimator_)

{'bst__n_estimators': 20, 'features__rf_selector__threshold': 0.02, 'features__stats_selector__k': 4}
0.8305118601747814
7
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('cat_imp',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder())]),
                                                  ['Embarked', 'Sex', 'Pclass',
                                                   'Title', 'FamilyGroup']),
                                                 ('ordinal',
                                                  Pipeline(steps=[('cont_imp',
                                                                   SimpleImputer()),
                                                   

In [12]:
f = clf.best_estimator_[0].get_feature_names_out()
print(f)
f = clf.best_estimator_[1].get_feature_names_out(input_features = f)
print(f)
f = clf.best_estimator_[2].get_feature_names_out(input_features = f)
print(f)
f = clf.best_estimator_[3].get_feature_names_out(input_features = f)
print(f)

['categorical__Embarked_C' 'categorical__Embarked_Q'
 'categorical__Embarked_S' 'categorical__Sex_female'
 'categorical__Sex_male' 'categorical__Pclass_1' 'categorical__Pclass_2'
 'categorical__Pclass_3' 'categorical__Title_Master'
 'categorical__Title_Miss' 'categorical__Title_Mr'
 'categorical__Title_Mrs' 'categorical__Title_Rare'
 'categorical__FamilyGroup_Large' 'categorical__FamilyGroup_Medium'
 'categorical__FamilyGroup_Single' 'categorical__FamilyGroup_Small'
 'ordinal__Age' 'ordinal__Fare' 'pass__Dummy']
['categorical__Embarked_C' 'categorical__Embarked_Q'
 'categorical__Embarked_S' 'categorical__Sex_female'
 'categorical__Sex_male' 'categorical__Pclass_1' 'categorical__Pclass_2'
 'categorical__Pclass_3' 'categorical__Title_Master'
 'categorical__Title_Miss' 'categorical__Title_Mr'
 'categorical__Title_Mrs' 'categorical__Title_Rare'
 'categorical__FamilyGroup_Large' 'categorical__FamilyGroup_Medium'
 'categorical__FamilyGroup_Single' 'categorical__FamilyGroup_Small'
 'ordinal__

In [23]:
titanic_test = pd.read_csv(os.path.join(dir, "test.csv"))
print(titanic_test.shape)
titanic_test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [24]:
titanic_test['Dummy'] = 10

In [25]:
titanic_test['Title'] = titanic_test['Name'].map(extract_title)
titanic_test['Title'] = titanic_test['Title'].replace(['Lady', 'the Countess', 'Countess','Capt', 'Col','Don', 
                                                 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
titanic_test['Title'] = titanic_test['Title'].replace('Mlle', 'Miss')
titanic_test['Title'] = titanic_test['Title'].replace('Ms', 'Miss')
titanic_test['Title'] = titanic_test['Title'].replace('Mme', 'Mrs')

In [26]:
titanic_test['FamilySize'] = titanic_test['SibSp'] +  titanic_test['Parch'] + 1
titanic_test['FamilyGroup'] = titanic_test['FamilySize'].map(convert_familysize)

In [27]:
X_test = titanic_test
titanic_test['Survived'] = clf.predict(X_test)
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Dummy,Title,FamilySize,FamilyGroup,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,10,Mr,1,Single,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,10,Mrs,2,Small,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,10,Mr,1,Single,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,10,Mr,1,Single,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,10,Mrs,3,Small,0


In [180]:
titanic_test.to_csv(os.path.join(dir, "submission1.csv"), columns=["PassengerId", "Survived"], index=False)