In [139]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder

%matplotlib inline

In [43]:
# Read the train and test splits. `combine` holds references to both frames
# so that later cells can apply the same step to each of them in a loop.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)
combine = [train_data, test_data]

In [77]:
# Peek at the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,1,0,22.0,1,0,A/5 21171,7.25,0,S
1,2,1,1,3,1,38.0,1,0,PC 17599,71.2833,1,C
2,3,1,3,2,1,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,3,1,35.0,1,0,113803,53.1,1,S
4,5,0,3,1,0,35.0,0,0,373450,8.05,0,S


In [27]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts chosen columns of a DataFrame as an array.

    Parameters
    ----------
    attr_name : column label or list of column labels
        Passed unchanged to ``X[...]`` when :meth:`transform` is called.
    """

    def __init__(self, attr_name):
        self.attr_name = attr_name

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return the underlying values of the configured column(s) of ``X``."""
        selected = X[self.attr_name]
        return selected.values

In [44]:
# Reduce each passenger's full name to its title: the run of letters that
# follows a space and is terminated by a period (e.g. "Mr", "Miss").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being parsed as an invalid Python string escape,
# which triggers a warning on recent Python versions.
for dataset in combine:
    dataset['Name'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

In [48]:
# Shrink the title vocabulary in both frames: the listed rare titles are
# pooled into 'Other', and alternate spellings are folded into 'Miss' / 'Mrs'.
for dataset in combine:
    dataset['Name'] = (
        dataset['Name']
        .replace(
            ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
             'Sir', 'Jonkheer', 'Dona'],
            'Other',
        )
        .replace('Mlle', 'Miss')
        .replace('Ms', 'Miss')
        .replace('Mme', 'Mrs')
    )

In [61]:
# Binary-encode Sex on the training frame: 1 for 'female', 0 for anything else.
# NOTE(review): only train_data is converted here; test_data is still displayed
# with 'male'/'female' strings near the end of this notebook, so it presumably
# needs the same step before being passed to a model -- confirm.
train_data['Sex'] = train_data['Sex'].eq('female').astype(int)

In [63]:
# Replace Cabin with a "cabin is recorded" indicator on the training frame:
# 1 when a value is present, 0 when it is missing.
# `notna()` states this directly instead of comparing `isna()` against False.
# NOTE(review): only train_data is converted here; test_data still shows empty
# Cabin values near the end of this notebook, so it presumably needs the same
# step before being passed to a model -- confirm.
train_data['Cabin'] = train_data['Cabin'].notna().astype(int)

In [67]:
# Frequency of each title in the training frame.
train_data.Name.value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Other      23
Name: Name, dtype: int64

In [70]:
# Integer code (1..5) for each of the five title groups.
mapping = dict(zip(['Mr', 'Miss', 'Mrs', 'Master', 'Other'], range(1, 6)))

# Swap the title strings for their codes in both frames. Values that are not
# keys of `mapping` come out of `Series.map` as NaN.
for dataset in combine:
    dataset['Name'] = dataset['Name'].map(mapping)

In [72]:
# Per-column dtypes and non-null counts, to see which columns still contain
# missing values after the encoding steps.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null int64
Sex            891 non-null int32
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null int32
Embarked       889 non-null object
dtypes: float64(2), int32(2), int64(6), object(2)
memory usage: 76.6+ KB


In [91]:
# Column groups handled by the preprocessing pipeline.
num_attr = ['Age', 'Fare']
cat_attr = ['Embarked']

# Numeric columns: select them, fill gaps with the column median, standardise.
num_pipe = Pipeline(steps=[
    ('selector', DataFrameSelector(attr_name=num_attr)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical column: select it, label gaps with the constant 'missing',
# then encode each category as an ordinal number.
cat_pipe = Pipeline(steps=[
    ('selector', DataFrameSelector(attr_name=cat_attr)),
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('encoder', OrdinalEncoder()),
])

# Route each column group to its own sub-pipeline.
# NOTE(review): ColumnTransformer already hands each sub-pipeline only its
# listed columns, so the 'selector' steps look redundant here; they are kept
# so that num_pipe / cat_pipe still work when applied to a full frame directly.
transformer = ColumnTransformer(transformers=[
    ('num', num_pipe, num_attr),
    ('cat', cat_pipe, cat_attr),
])

# Single-step pipeline wrapping the column transformer.
pipe = Pipeline(steps=[('prepro', transformer)])

In [92]:
# Fit the preprocessing pipeline on the training frame and transform it in one
# call. Given the transformer order (num, then cat), the result columns are
# presumably Age, Fare, Embarked -- the cell that later reads columns 0/1/2
# relies on that order.
prepare = pipe.fit_transform(train_data)

In [95]:
# Wrap the transformed array in a DataFrame; its columns keep the positional
# labels 0, 1, 2 that the next cell reads.
# The frame reuses train_data's index because assigning these columns back to
# train_data aligns rows by index label: with a fresh 0..n-1 index the rows
# would silently misalign if train_data's index were anything else.
df = pd.DataFrame(prepare, index=train_data.index)

In [99]:
# Overwrite the raw Age and Fare columns with their preprocessed versions
# (positional columns 0 and 1 of the pipeline output).
train_data['Age'], train_data['Fare'] = df[0], df[1]
# Column 2 holds the encoded Embarked values; shift them up by one.
train_data['Embarked'] = df[2] + 1

In [114]:
# Copy of the training frame without PassengerId and Ticket, used for the
# correlation table in the next cell.
temp = train_data.drop(columns=['PassengerId', 'Ticket'])

In [115]:
# Pairwise correlation between the remaining columns (pandas' default method).
temp.corr()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
Survived,1.0,-0.338481,0.407753,0.543351,-0.06491,-0.035322,0.081629,0.257307,0.316912,-0.163517
Pclass,-0.338481,1.0,-0.173929,-0.1319,-0.339898,0.083081,0.018443,-0.5495,-0.725541,0.157112
Name,0.407753,-0.173929,1.0,0.502713,-0.091211,0.269623,0.315784,0.13631,0.13339,-0.058549
Sex,0.543351,-0.1319,0.502713,1.0,-0.081163,0.114631,0.245489,0.182333,0.140391,-0.104057
Age,-0.06491,-0.339898,-0.091211,-0.081163,1.0,-0.233296,-0.172482,0.096688,0.240314,-0.014205
SibSp,-0.035322,0.083081,0.269623,0.114631,-0.233296,1.0,0.414838,0.159651,-0.04046,0.066654
Parch,0.081629,0.018443,0.315784,0.245489,-0.172482,0.414838,1.0,0.216225,0.036987,0.038322
Fare,0.257307,-0.5495,0.13631,0.182333,0.096688,0.159651,0.216225,1.0,0.482075,-0.221226
Cabin,0.316912,-0.725541,0.13339,0.140391,0.240314,-0.04046,0.036987,0.482075,1.0,-0.154457
Embarked,-0.163517,0.157112,-0.058549,-0.104057,-0.014205,0.066654,0.038322,-0.221226,-0.154457,1.0


In [127]:
# Show only the ten rows with the highest Fare instead of rendering the whole
# sorted frame. Fare was overwritten with the pipeline's scaled output in an
# earlier cell, so the values shown are standardised, not raw fares.
train_data.sort_values(by='Fare', ascending=False).head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
258,259,1,1,2,1,0.433312,0,0,PC 17755,9.667167,0,1.0
737,738,1,1,1,0,0.433312,0,0,PC 17755,9.667167,1,1.0
679,680,1,1,1,0,0.510161,0,1,PC 17755,9.667167,1,1.0
88,89,1,1,2,1,-0.488887,3,2,19950,4.647001,1,3.0
27,28,0,1,1,0,-0.796286,3,2,19950,4.647001,1,3.0
341,342,1,1,2,1,-0.412037,3,2,19950,4.647001,1,3.0
438,439,0,1,1,0,2.661957,1,4,19950,4.647001,1,3.0
311,312,1,1,2,1,-0.873136,2,2,PC 17608,4.634417,1,1.0
742,743,1,1,2,1,-0.642586,2,2,PC 17608,4.634417,1,1.0
118,119,0,1,1,0,-0.412037,0,1,PC 17558,4.335332,1,1.0


In [135]:
# Feature matrix: every column except Ticket, PassengerId, the Survived target,
# and Embarked.
X = train_data.drop(columns=['Ticket', 'PassengerId', 'Survived', 'Embarked'])
# Target vector, copied so it is independent of train_data.
y = train_data.Survived.copy()

In [136]:
# Baseline classifier: logistic regression with the library's default
# settings, fitted on the full training set. `fit` returns the estimator,
# so the cell displays its parameters.
log_reg = LogisticRegression()
log_reg.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [137]:
# Score the model on the same data it was fitted on, so this figure is an
# optimistic estimate; the cross-validation cell below gives held-out scores.
log_reg.score(X,y)

0.813692480359147

In [141]:
# Ten-fold cross-validation of the logistic regression: one score per fold.
cross_val_score(estimator=log_reg, X=X, y=y, cv=10)



array([0.83333333, 0.81111111, 0.76404494, 0.84269663, 0.79775281,
       0.76404494, 0.83146067, 0.82022472, 0.83146067, 0.84090909])

In [142]:
# Preview the first ten test rows rather than rendering the whole frame.
# NOTE(review): Sex and Cabin appear here in their raw form, unlike train_data,
# which was encoded in earlier cells -- confirm the test frame is encoded the
# same way before prediction.
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,1,male,34.5,0,0,330911,7.8292,,Q
1,893,3,3,female,47.0,1,0,363272,7.0000,,S
2,894,2,1,male,62.0,0,0,240276,9.6875,,Q
3,895,3,1,male,27.0,0,0,315154,8.6625,,S
4,896,3,3,female,22.0,1,1,3101298,12.2875,,S
5,897,3,1,male,14.0,0,0,7538,9.2250,,S
6,898,3,2,female,30.0,0,0,330972,7.6292,,Q
7,899,2,1,male,26.0,1,1,248738,29.0000,,S
8,900,3,3,female,18.0,0,0,2657,7.2292,,C
9,901,3,1,male,21.0,2,0,A/4 48871,24.1500,,S
