
### 1. Data Loading

In [36]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()


In [37]:
def print_info(train_df, test_df):
    print('Number of Training Examples = {}'.format(train_df.shape[0]))
    print('Number of Test Examples = {}\n'.format(test_df.shape[0]))
    print('Training X Shape = {}'.format(train_df.shape))
    print('Training y Shape = {}\n'.format(train_df['Survived'].shape[0]))
    print('Test X Shape = {}'.format(test_df.shape))
    print('Test y Shape = {}\n'.format(test_df.shape[0]))
    print(train_df.columns)
    print(test_df.columns)

train_df = pd.read_csv("C:/Users/tsunt/titanic-hse/data/raw/train.csv")
test_df = pd.read_csv("C:/Users/tsunt/titanic-hse/data/raw/test.csv")

train_df.name = 'Training Set'
test_df.name = 'Test Set'

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
## запомним вывода результатов
passenger_id=test_df['PassengerId']

## фича PassengerID бесполезна для дальнейшего анализа, выбросим
train_df.drop(['PassengerId'], axis=1, inplace=True)
test_df.drop(['PassengerId'], axis=1, inplace=True)

## всего одно пропущенное значение - заполним средним
test_df.Fare.fillna(test_df.Fare.mean(), inplace=True)
data_df = train_df.append(test_df)


train_df=train_df[train_df['Fare']<400]

train_df['Sex'] = train_df.Sex.apply(lambda x: 0 if x == "female" else 1)
test_df['Sex'] = test_df.Sex.apply(lambda x: 0 if x == "female" else 1)

test_df['Fare'].fillna(test_df['Fare'].mean(), inplace=True)

In [39]:
## заполним пропуски в возрасте средним по классифицированным Titles
for name_string in data_df['Name']:
    data_df['Title'] = data_df['Name'].str.extract('([A-Za-z]+)\.', expand=True)

mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr',
           'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
           'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr',
           'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
data_df.replace({'Title': mapping}, inplace=True)

data_df['Title'].value_counts()
train_df['Title'] = data_df['Title'][:891]
test_df['Title'] = data_df['Title'][891:]

titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Rev', 'Dr']
for title in titles:
    age_to_impute = data_df.groupby('Title')['Age'].mean()[titles.index(title)]
    data_df.loc[(data_df['Age'].isnull()) & (data_df['Title'] == title), 'Age'] = age_to_impute
data_df.isnull().sum()

train_df['Age'] = data_df['Age'][:891]
test_df['Age'] = data_df['Age'][891:]
test_df.isnull().sum()

train_df.to_csv("C:/Users/tsunt/titanic-hse/data/processed/train.csv")
test_df.to_csv("C:/Users/tsunt/titanic-hse/data/processed/test.csv")

### 2. Analysis

In [40]:
train_df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,888.0,888.0,888.0,888.0,888.0,888.0,888.0
mean,0.381757,2.313063,0.647523,30.383851,0.524775,0.381757,30.582164
std,0.486091,0.834007,0.478011,14.875196,1.104186,0.806949,41.176366
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.0,0.0,0.0,7.8958
50%,0.0,3.0,1.0,30.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,43.571429,1.0,0.0,30.77185
max,1.0,3.0,1.0,80.0,8.0,6.0,263.0


<a id="section401"></a>
#### 4.1 Correlation Matrix and Heatmap

In [15]:
train_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
Survived,1.0,-0.334068,-0.545899,-0.175255,-0.033395,0.082157,0.261742
Pclass,-0.334068,1.0,0.132881,-0.289341,0.080937,0.018212,-0.60496
Sex,-0.545899,0.132881,1.0,0.27653,-0.114799,-0.247003,-0.222361
Age,-0.175255,-0.289341,0.27653,1.0,-0.26227,-0.203432,0.056003
SibSp,-0.033395,0.080937,-0.114799,-0.26227,1.0,0.415141,0.211816
Parch,0.082157,0.018212,-0.247003,-0.203432,0.415141,1.0,0.26391
Fare,0.261742,-0.60496,-0.222361,0.056003,0.211816,0.26391,1.0


In [None]:
plt.subplots(figsize = (15,8))
sns.heatmap(train_df.corr(), annot=True,cmap="PiYG")
plt.title("Correlations Among Features", fontsize = 18)

**Positive Correlation Features:**
- Fare and Survived: 0.26.


**Negative Correlation Features:**
- Fare and Pclass: -0.55

- Gender and Survived: -0.54

- Pclass and Survived: -0.34

<a id="section5"></a>
### 3. Feature Engineering

In [41]:
train_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,Mrs
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,Mr


<a id="section501"></a>
#### 5.1 family_size feature

In [42]:
train_df['family_size'] = train_df.SibSp + train_df.Parch+1
test_df['family_size'] = test_df.SibSp + test_df.Parch+1


In [43]:
def family_group(size):
    a = ''
    if (size <= 1):
        a = 'loner'
    elif (size <= 4):
        a = 'small'
    else:
        a = 'large'
    return a

train_df['family_group'] = train_df['family_size'].map(family_group)
test_df['family_group'] = test_df['family_size'].map(family_group)

<a id="section502"></a>
#### 5.2 Is_alone feature

In [44]:
train_df['is_alone'] = [1 if i<2 else 0 for i in train_df.family_size]
test_df['is_alone'] = [1 if i<2 else 0 for i in test_df.family_size]

<a id="section503"></a>

#### 5.3 Child feature

In [45]:

train_df['child'] = [1 if i<16 else 0 for i in train_df.Age]
test_df['child'] = [1 if i<16 else 0 for i in test_df.Age]
train_df.child.value_counts()

0    769
1    119
Name: child, dtype: int64


#### 5.4 Calculated_fare feature

In [46]:
train_df['calculated_fare'] = train_df.Fare/train_df.family_size
test_df['calculated_fare'] = test_df.Fare/test_df.family_size


In [47]:
train_df.calculated_fare.mean()

18.54129191972815

In [48]:
train_df.calculated_fare.mode()

0    13.0
dtype: float64

In [49]:
def fare_group(fare):
    a= ''
    if fare <= 4:
        a = 'Very_low'
    elif fare <= 10:
        a = 'low'
    elif fare <= 20:
        a = 'mid'
    elif fare <= 45:
        a = 'high'
    else:
        a = "very_high"
    return a


In [50]:
train_df['fare_group'] = train_df['calculated_fare'].map(fare_group)
test_df['fare_group'] = test_df['calculated_fare'].map(fare_group)


#### 5.5 Age_group feature

In [51]:
def age_group_fun(age):
    a = ''
    if age <= 1:
        a = 'infant'
    elif age <= 4: 
        a = 'toddler'
    elif age <= 13:
        a = 'child'
    elif age <= 18:
        a = 'teenager'
    elif age <= 35:
        a = 'Young_Adult'
    elif age <= 45:
        a = 'adult'
    elif age <= 55:
        a = 'middle_aged'
    elif age <= 65:
        a = 'senior_citizen'
    else:
        a = 'old'
    return a
        

In [52]:
train_df['age_group'] = train_df['Age'].map(age_group_fun)
test_df['age_group'] = test_df['Age'].map(age_group_fun)

In [53]:
train_df = pd.get_dummies(train_df, columns=['Title',"Pclass",'Embarked', 'family_group', 'fare_group'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Title',"Pclass",'Embarked', 'family_group', 'fare_group'], drop_first=True)
train_df.drop(['Cabin', 'family_size','Ticket','Name', 'Fare'], axis=1, inplace=True)
test_df.drop(['Ticket','Name','family_size',"Fare",'Cabin'], axis=1, inplace=True)

In [54]:
train_df = pd.get_dummies(train_df,columns=['age_group'], drop_first=True)
test_df = pd.get_dummies(test_df,columns=['age_group'], drop_first=True)
#Lets try all after dropping few of the column.
train_df.drop(['Age','calculated_fare'],axis=1,inplace=True)
test_df.drop(['Age','calculated_fare'],axis=1,inplace=True)

train_df.drop(['Title_Rev','age_group_old','age_group_teenager','age_group_senior_citizen','Embarked_Q'],axis=1,inplace=True)
test_df.drop(['Title_Rev','age_group_old','age_group_teenager','age_group_senior_citizen','Embarked_Q'],axis=1,inplace=True)

### 4. Model Creation

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [66]:
X = train_df.drop('Survived', 1)
y = train_df['Survived']

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
testframe = std_scaler.fit_transform(test_df)

  X = train_df.drop('Survived', 1)



#### LogisticRegression

In [67]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1000)

logreg = LogisticRegression(solver='liblinear', penalty='l1')
logreg.fit(X_train,y_train)

predict=logreg.predict(X_test)
print(accuracy_score(y_test,predict))
print(confusion_matrix(y_test,predict))
print(precision_score(y_test,predict))
print(recall_score(y_test,predict))

0.8258426966292135
[[91 11]
 [20 56]]
0.835820895522388
0.7368421052631579


**Grid Search on Logistic Regression**

In [57]:
C_vals = [0.0001, 0.001, 0.01, 0.1,0.13,0.2, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 4.0,4.5,5.0,5.1,5.5,6.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

param = {'penalty': penalties, 'C': C_vals, }
grid = GridSearchCV(logreg, param,verbose=False, cv = StratifiedKFold(n_splits=5,random_state=10,shuffle=True), n_jobs=1,scoring='accuracy')

In [58]:
grid.fit(X_train,y_train)
print (grid.best_params_)
print (grid.best_score_)
print(grid.best_estimator_)

{'C': 0.2, 'penalty': 'l2'}
0.8309859154929577
LogisticRegression(C=0.2, solver='liblinear')


In [64]:

logreg_grid = LogisticRegression(penalty=grid.best_params_['penalty'], C=grid.best_params_['C'])
logreg_grid.fit(X_train,y_train)
y_pred = logreg_grid.predict(X_test)
print(X_test.shape)
print(test_df.shape)

logreg_accy = round(accuracy_score(y_test, y_pred), 3)
print (logreg_accy)
print(confusion_matrix(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))

(178, 23)
(418, 23)
0.837
[[93  9]
 [20 56]]
0.8615384615384616
0.7368421052631579


### 5. Submit test predictions


In [71]:
y_predict=logreg.predict(testframe)
print(y_predict)

temp = pd.DataFrame(pd.DataFrame({
        "PassengerId": passenger_id,
        "Survived": y_predict
    }))


temp.to_csv("C:/Users/tsunt/titanic-hse/data/result/res.csv", index = False)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 1]
