# Titanic passengers

#### Goal: predict whether a passenger survived

#### Metrics: accuracy

In [None]:

# import libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Mount Google Drive (this cell only runs on Colab), read the train and test
# CSV files from it, then print the column/dtype/non-null summary of train.
from google.colab import drive

drive.mount('/content/drive')

train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/train.csv',
        '/content/drive/My Drive/test.csv',
    )
)
train.info()


In [None]:
# Preview only the first rows instead of rendering the whole frame.
train.head()

In [None]:
# get summary statistics of the training set
train.describe()

In [None]:
# Overall survival rate: mean of the Survived column, rounded to 3 decimals.
survival_rate = train['Survived'].mean()
print('Overall Survival Rate:', round(survival_rate, 3))

## Data cleaning and feature selection

In [None]:
# get_dummies function
def dummies(col, train, test):
    """One-hot encode ``col`` in both frames and drop the original column.

    The dummy columns of ``test`` are aligned to those produced for ``train``,
    so both returned frames expose the same encoded columns in the same order:

    * a category that occurs in ``train`` but not in ``test`` becomes an
      all-zero column in ``test``;
    * a category that occurs only in ``test`` is discarded (the row is then
      all-zero for this feature), because a model fitted on ``train`` has no
      matching column for it.

    Parameters
    ----------
    col : column label present in both frames.
    train, test : pandas.DataFrame
        Neither input frame is modified; new frames are returned.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Copies without ``col`` and with the dummy columns appended at the end.
    """
    train_dum = pd.get_dummies(train[col])
    # Encoding the two frames independently can yield different column sets;
    # force test onto train's columns so the feature matrices stay compatible.
    test_dum = pd.get_dummies(test[col]).reindex(columns=train_dum.columns, fill_value=0)
    # Columns created by reindex carry the fill value's dtype; match train's.
    test_dum = test_dum.astype(train_dum.dtypes.to_dict())
    train = pd.concat([train.drop(columns=col), train_dum], axis=1)
    test = pd.concat([test.drop(columns=col), test_dum], axis=1)
    return train, test

# Columns that are not used as features: remove them from both splits.
dropping = ['PassengerId', 'Name', 'Ticket']
for frame in (train, test):
    frame.drop(columns=dropping, inplace=True)

In [None]:
# pclass
# ensure no NA contained (dropna=False lists NaN as its own row if present)
print(train.Pclass.value_counts(dropna=False))
# factorplot was renamed to catplot and later removed from seaborn; x/y must be
# passed by keyword. kind='point' reproduces factorplot's default point plot.
sns.catplot(x='Pclass', y='Survived', data=train, kind='point', order=[1, 2, 3])
# according to the graph, there are huge differences between the pclass
# groups, so keep the feature (one-hot encoded)
train, test = dummies('Pclass', train, test)

In [None]:
# sex
# dropna=False lists NaN as its own row if any value is missing
print(train.Sex.value_counts(dropna=False))
# catplot(kind='point') replaces the removed factorplot; x/y go by keyword.
sns.catplot(x='Sex', y='Survived', data=train, kind='point')
# female survival rate is way better than the male; one-hot encode the column
train, test = dummies('Sex', train, test)

#train.drop('male',axis=1,inplace=True)
#test.drop('male',axis=1,inplace=True)

In [None]:
# Age: count the missing entries in the training set before imputing them.
nan_num = train['Age'].isna().sum()
print(nan_num)

In [None]:
# there are 177 missing values: fill them with random integers drawn from
# [mean - std, mean + std) of the observed ages
age_mean = train['Age'].mean()
age_std = train['Age'].std()
age_missing = train['Age'].isnull()
# Dedicated, seeded generator so Restart & Run All reproduces the same fill
# values without touching NumPy's global random state.
rng = np.random.RandomState(0)
# randint expects integer bounds; truncate the float statistics explicitly.
filling = rng.randint(int(age_mean - age_std), int(age_mean + age_std), size=age_missing.sum())
# One .loc assignment instead of chained indexing (train['Age'][mask] = ...),
# which writes to a temporary and is a no-op under pandas Copy-on-Write.
train.loc[age_missing, 'Age'] = filling
# sanity check: no missing ages should remain
nan_num = train['Age'].isnull().sum()
print(nan_num)

In [None]:
# Age (test set): count the missing entries that still have to be filled.
nan_num = test['Age'].isna().sum()

In [None]:
# 86 null: fill them with random integers drawn from [mean - std, mean + std)
# of the ages observed in the test set
age_mean = test['Age'].mean()
age_std = test['Age'].std()
# Recompute the mask here so the cell does not depend on nan_num left over
# from another cell.
age_missing = test['Age'].isnull()
# Dedicated, seeded generator so the imputed values are reproducible.
rng = np.random.RandomState(1)
# randint expects integer bounds; truncate the float statistics explicitly.
filling = rng.randint(int(age_mean - age_std), int(age_mean + age_std), size=age_missing.sum())
# One .loc assignment instead of chained indexing (test['Age'][mask] = ...),
# which writes to a temporary and is a no-op under pandas Copy-on-Write.
test.loc[age_missing, 'Age'] = filling
# sanity check: no missing ages should remain
nan_num = test['Age'].isnull().sum()
print(nan_num)

In [None]:
#family
# value distribution of both family-related columns (NaN listed if present)
print(train['SibSp'].value_counts(dropna=False))
print(train['Parch'].value_counts(dropna=False))

# catplot(kind='point') replaces the removed factorplot; the former `size`
# argument is called `height` in catplot.
sns.catplot(x='SibSp', y='Survived', data=train, kind='point', height=5)
sns.catplot(x='Parch', y='Survived', data=train, kind='point', height=5)


In [None]:
# Through the plots, we suggest that with more family members the survival
# rate drops. Create a new column that adds up Parch and SibSp to check this
# theory.
train['family'] = train['SibSp'] + train['Parch']
test['family'] = test['SibSp'] + test['Parch']
# catplot(kind='point') replaces the removed factorplot; `size` is now `height`.
sns.catplot(x='family', y='Survived', data=train, kind='point', height=5)

# The two source columns are now represented by `family`; drop them.
train = train.drop(columns=['SibSp', 'Parch'])
test = test.drop(columns=['SibSp', 'Parch'])

In [None]:
# Fare: number of missing values in train, then in test.
for frame in (train, test):
    print(frame['Fare'].isna().sum())

In [None]:
# Mean fare per survival outcome. catplot(kind='point') replaces the removed
# factorplot; x/y go by keyword and `size` is now `height`.
sns.catplot(x='Survived', y='Fare', data=train, kind='point', height=5)
# Fill missing test fares with the test median. Assign the result back:
# an in-place fillna on test['Fare'] acts on a temporary Series and does not
# update the frame under pandas Copy-on-Write.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

In [None]:
# Cabin: count the missing values.
# 687 of the 891 rows have no cabin recorded, so the column gets dropped.
train['Cabin'].isna().sum()

In [None]:
# Remove the Cabin column from both splits.
for frame in (train, test):
    frame.drop(columns='Cabin', inplace=True)

In [None]:
# Embarked: number of missing values in train (2 rows).
print(train['Embarked'].isna().sum())
# Frequency of each embarkation port among the non-missing rows.
train['Embarked'].value_counts()

In [None]:
# fill the missing values with the majority value, 'S'. Assign the result
# back: an in-place fillna on train['Embarked'] acts on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
train['Embarked'] = train['Embarked'].fillna('S')

# catplot(kind='point') replaces the removed factorplot; `size` is now `height`.
sns.catplot(x='Embarked', y='Survived', data=train, kind='point', height=6)
# one-hot encode the port of embarkation in both splits
train, test = dummies('Embarked', train, test)


## Model and prediction

In [None]:
# import machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Features are every column except the target.
X = train.drop('Survived', axis=1)
# get_dummies labels each dummy column with the category value itself, so the
# frame can mix non-string labels (e.g. the Pclass values) with string ones.
# Recent scikit-learn versions reject mixed label types, hence the cast.
X.columns = X.columns.astype(str)
y = train['Survived']

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)


# check classification scores of logistic regression
# The features are unscaled, so the default of 100 iterations may stop before
# convergence (and the warning is silenced at the top of the notebook).
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


print('Train/Test split results:')
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y_test, y_pred))


## Cross-validation (scikit-learn iris example)


In [None]:
import numpy as np
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split

# NOTE: X and y are rebound here to the iris data set returned by
# load_iris; from this point on they no longer hold the Titanic features.
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

In [None]:
# Hold out 40% of the samples for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Shapes of the training split, then of the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)


# Fit a linear-kernel SVC on the training split and score it on the test split.
linear_svc = svm.SVC(kernel='linear')
clf = linear_svc.fit(X_train, y_train)
clf.score(X_test, y_test)


In [None]:
from sklearn.model_selection import cross_val_score

# 15-fold cross-validation of a linear-kernel SVC on the full X, y.
clf = svm.SVC(random_state=42, kernel='linear')
scores = cross_val_score(estimator=clf, X=X, y=y, cv=15)
scores

In [None]:
# Summarise the fold scores by their mean and standard deviation.
mean_score, std_score = scores.mean(), scores.std()
print("%0.2f accuracy with a standard deviation of %0.2f" % (mean_score, std_score))

In [None]:
from sklearn import metrics

# Same estimator, 5 folds, scored with the 'recall_macro' scorer instead of
# the estimator's default score.
scores = cross_val_score(clf, X, y, scoring='recall_macro', cv=5)
scores
