In [5]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


In [6]:
import sklearn
print(sklearn.__version__)

0.21.2


In [7]:
data_path = 'D:/Datasets/Titanic/train_test_cabin.csv'

In [8]:
data = pd.read_csv(data_path).round()
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.0,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.0,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,8.0,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.0,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0,,S


In [9]:
# Drop column Name and Ticket 
data_1 = data.drop(['Name', 'Ticket'], axis=1)
data_1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0.0,3,male,22.0,1,0,7.0,,S
1,2,1.0,1,female,38.0,1,0,71.0,C85,C
2,3,1.0,3,female,26.0,0,0,8.0,,S
3,4,1.0,1,female,35.0,1,0,53.0,C123,S
4,5,0.0,3,male,35.0,0,0,8.0,,S


In [10]:
# Split up the NaN Cabin from the Cabin with values 

nan_cabin = data_1[data_1['Cabin'].isnull()]
nan_cabin.isnull().sum()

PassengerId       0
Survived        327
Pclass            0
Sex               0
Age             240
SibSp             0
Parch             0
Fare              1
Cabin          1014
Embarked          0
dtype: int64

# Preprocess/Clean Train data 

In [11]:
train_data = data_1[data_1['Cabin'].notnull()]
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,2,1.0,1,female,38.0,1,0,71.0,C85,C
3,4,1.0,1,female,35.0,1,0,53.0,C123,S
6,7,0.0,1,male,54.0,0,0,52.0,E46,S
10,11,1.0,3,female,4.0,1,1,17.0,G6,S
11,12,1.0,1,female,58.0,0,0,27.0,C103,S


In [12]:
train_data.isnull().sum()

PassengerId     0
Survived       91
Pclass          0
Sex             0
Age            23
SibSp           0
Parch           0
Fare            0
Cabin           0
Embarked        2
dtype: int64

In [13]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,295.0,204.0,295.0,272.0,295.0,295.0,295.0
mean,654.854237,0.666667,1.186441,36.915441,0.481356,0.444068,81.932203
std,371.45003,0.472564,0.510921,15.592251,0.632583,0.766712,79.01478
min,2.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,331.0,0.0,1.0,25.0,0.0,0.0,30.0
50%,660.0,1.0,1.0,36.0,0.0,0.0,57.0
75%,963.0,1.0,1.0,48.0,1.0,1.0,92.5
max,1306.0,1.0,3.0,80.0,3.0,4.0,512.0


In [14]:
train_data.Survived.median()

1.0

In [15]:
train_data['Age'].median()

36.0

In [16]:
# I only want the letter in Cabin and not the numeric value that follows. So I want the section without the room number. 

train_data.Cabin.sort_values().unique()

array(['A10', 'A11', 'A14', 'A16', 'A18', 'A19', 'A20', 'A21', 'A23',
       'A24', 'A26', 'A29', 'A31', 'A32', 'A34', 'A36', 'A5', 'A6', 'A7',
       'A9', 'B10', 'B101', 'B102', 'B11', 'B18', 'B19', 'B20', 'B22',
       'B24', 'B26', 'B28', 'B3', 'B30', 'B35', 'B36', 'B37', 'B38',
       'B39', 'B4', 'B41', 'B42', 'B45', 'B49', 'B5', 'B50',
       'B51 B53 B55', 'B52 B54 B56', 'B57 B59 B63 B66', 'B58 B60', 'B61',
       'B69', 'B71', 'B73', 'B77', 'B78', 'B79', 'B80', 'B82 B84', 'B86',
       'B94', 'B96 B98', 'C101', 'C103', 'C104', 'C105', 'C106', 'C110',
       'C111', 'C116', 'C118', 'C123', 'C124', 'C125', 'C126', 'C128',
       'C130', 'C132', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C28',
       'C30', 'C31', 'C32', 'C39', 'C45', 'C46', 'C47', 'C49', 'C50',
       'C51', 'C52', 'C53', 'C54', 'C55 C57', 'C6', 'C62 C64', 'C65',
       'C68', 'C7', 'C70', 'C78', 'C80', 'C82', 'C83', 'C85', 'C86',
       'C87', 'C89', 'C90', 'C91', 'C92', 'C93', 'C95', 'C97', 'C99', 'D',
       'D

In [17]:
# A, B, C, D, E, F, G, T 

def cabin_letter(cabin):
    """Remove the numbers from the letter in cabin"""
    list_a = [i for i in cabin['Cabin'] if i.startswith('A')]
    list_b = [i for i in cabin['Cabin'] if i.startswith('B')]
    list_c = [i for i in cabin['Cabin'] if i.startswith('C')]
    list_d = [i for i in cabin['Cabin'] if i.startswith('D')]
    list_e = [i for i in cabin['Cabin'] if i.startswith('E')]
    list_f = [i for i in cabin['Cabin'] if i.startswith('F')]
    list_g = [i for i in cabin['Cabin'] if i.startswith('G')]
    list_t = [i for i in cabin['Cabin'] if i.startswith('T')]

    cabin['Cabin_Section'] = cabin['Cabin'].replace(list_a, 'A').replace(list_b, 'B').replace(list_c, 'C').replace(list_d, 'D').replace(list_e, 'E').replace(list_f, 'F').replace(list_g, 'G').replace(list_t, 'T')
    return cabin.drop(['Cabin'], axis=1) 

In [18]:
train = cabin_letter(train_data)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Section
1,2,1.0,1,female,38.0,1,0,71.0,C,C
3,4,1.0,1,female,35.0,1,0,53.0,S,C
6,7,0.0,1,male,54.0,0,0,52.0,S,E
10,11,1.0,3,female,4.0,1,1,17.0,S,G
11,12,1.0,1,female,58.0,0,0,27.0,S,C


In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 295 entries, 1 to 1305
Data columns (total 10 columns):
PassengerId      295 non-null int64
Survived         204 non-null float64
Pclass           295 non-null int64
Sex              295 non-null object
Age              272 non-null float64
SibSp            295 non-null int64
Parch            295 non-null int64
Fare             295 non-null float64
Embarked         293 non-null object
Cabin_Section    295 non-null object
dtypes: float64(3), int64(4), object(3)
memory usage: 25.4+ KB


In [20]:
train.isnull().sum()

PassengerId       0
Survived         91
Pclass            0
Sex               0
Age              23
SibSp             0
Parch             0
Fare              0
Embarked          2
Cabin_Section     0
dtype: int64

In [21]:
X = train.drop(['Cabin_Section'], axis=1)
y = train['Cabin_Section']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Create pipeline 

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

num_attribs = list(['Age', 'Survived'])
cat_attribs = list(['Embarked', 'Sex'])

# Survived and Age 
num_pipeline  = Pipeline([
    ('num_imputer', SimpleImputer(strategy='median')),
    ('stadard_scale', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False)),
])

full_pipeline = ColumnTransformer([
    ('num_attribs', num_pipeline, num_attribs),
    ('cat_attribs', cat_pipeline, cat_attribs)

])

In [23]:
X_trained = full_pipeline.fit_transform(X_train)

# Select models 

In [24]:
X_tested = full_pipeline.transform(X_test)

In [27]:
from sklearn.svm import LinearSVC

lin_svc = LinearSVC(C=1, loss='hinge', random_state=42)
lin_svc.fit(X_trained, y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=42, tol=0.0001, verbose=0)

In [28]:
from sklearn.ensemble import RandomForestClassifier

forest_clf_1 = RandomForestClassifier(n_estimators=500, 
max_leaf_nodes=16,
n_jobs=-1,
max_depth=3,
random_state=42)

forest_clf_1.fit(X_trained, y_train)

  return f(*args, **kwds)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [30]:
forest_clf_2 = RandomForestClassifier(n_estimators=500, 
min_samples_leaf=3,
n_jobs=-1,
random_state=42)

forest_clf_2.fit(X_trained, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [33]:
# OneVsRestClassifier 

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# 8 classes 

svc_clf = SVC(C=1, kernel='poly')
ovr_clf = OneVsRestClassifier(svc_clf, n_jobs=-1)
ovr_clf.fit(X_trained, y_train)

OneVsRestClassifier(estimator=SVC(C=1, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma='auto_deprecated',
                                  kernel='poly', max_iter=-1, probability=False,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=False),
                    n_jobs=-1)

# Metrics/Eval

## <u>Metrics for lin_svc classifier</u> 

In [34]:
from sklearn.metrics import classification_report

print(classification_report(y_test, lin_svc.predict(X_tested)))

              precision    recall  f1-score   support

           A       0.00      0.00      0.00         4
           B       0.17      0.07      0.10        15
           C       0.50      0.06      0.11        16
           D       0.00      0.00      0.00        14
           E       0.07      0.33      0.11         6
           F       0.00      0.00      0.00         4
           G       0.00      0.00      0.00         0
           T       0.00      0.00      0.00         0

    accuracy                           0.07        59
   macro avg       0.09      0.06      0.04        59
weighted avg       0.18      0.07      0.07        59

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [35]:
from sklearn.model_selection import cross_val_score

svc_score = cross_val_score(lin_svc, X_trained, y_train, cv=10)
svc_score.mean()



0.19366993232210622

## <u>Metrics for RandomForestClassifier_1</u> 

In [36]:
print(classification_report(y_test, forest_clf_1.predict(X_tested)))

              precision    recall  f1-score   support

           A       0.00      0.00      0.00         4
           B       0.67      0.13      0.22        15
           C       0.27      0.94      0.42        16
           D       0.00      0.00      0.00        14
           E       0.00      0.00      0.00         6
           F       0.00      0.00      0.00         4
           G       0.00      0.00      0.00         0

    accuracy                           0.29        59
   macro avg       0.13      0.15      0.09        59
weighted avg       0.24      0.29      0.17        59

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [37]:
forest_score = cross_val_score(forest_clf_1, X_trained, y_train, cv=10)
forest_score.mean()



0.3295812754073624

## <u>Metrics for RandomForestClassifier_2</u> 

In [41]:
print(classification_report(y_test, forest_clf_2.predict(X_tested)))


              precision    recall  f1-score   support

           A       0.00      0.00      0.00         4
           B       0.40      0.27      0.32        15
           C       0.27      0.62      0.38        16
           D       0.60      0.21      0.32        14
           E       0.50      0.33      0.40         6
           F       0.00      0.00      0.00         4
           G       0.00      0.00      0.00         0

    accuracy                           0.32        59
   macro avg       0.25      0.21      0.20        59
weighted avg       0.37      0.32      0.30        59

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [42]:
forest_score_2 = cross_val_score(forest_clf_2, X_trained, y_train, cv=10)
forest_score_2.mean()



0.2780946589642242

## <u>ovr_clf Metrics/Score</u>

In [38]:
print(classification_report(y_test, ovr_clf.predict(X_tested)))

              precision    recall  f1-score   support

           A       0.00      0.00      0.00         4
           B       0.26      0.33      0.29        15
           C       0.26      0.38      0.31        16
           D       0.00      0.00      0.00        14
           E       0.22      0.33      0.27         6
           F       0.00      0.00      0.00         4
           G       0.00      0.00      0.00         0
           T       0.00      0.00      0.00         0

    accuracy                           0.22        59
   macro avg       0.09      0.13      0.11        59
weighted avg       0.16      0.22      0.19        59

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [39]:
ovr_score = cross_val_score(ovr_clf, X_trained, y_train)
ovr_score.mean()



0.2039506172839506

In [40]:
ovr_clf.classes_

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'], dtype='<U1')

According to Data Science Stack Exchange the scores are too low to continue since lack of data and attributes. 