In [4]:
%%html
<style>
/* Let output areas size to their content, capped at max-height. */
.output_wrapper, .output {
    height:auto !important;
    max-height:300px;  /* your desired max-height here */
}
/* Remove the box shadow from elements carrying the output_scroll class. */
.output_scroll {
    box-shadow:none !important;
    /* Vendor-prefixed form needs the leading dash; without it the
       declaration is not a valid property and is ignored. */
    -webkit-box-shadow:none !important;
}
</style>

In [5]:
from IPython.core.interactiveshell import InteractiveShell
# "all": echo the value of every top-level expression in a cell,
# not only the last one (class-level setting, so it applies to the whole session).
InteractiveShell.ast_node_interactivity = "all"

### Import Pandas

In [6]:
import pandas as pd

### Read in the Dataset

In [24]:
# Load the passenger table; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer='data-titanic.csv')

In [25]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Dataset Info

In [26]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 107.4+ KB


### Data Cleanup - Dropping columns

In [27]:
# Remove six columns from the working frame; cabin, boat, body and home.dest
# have many missing values according to the info() summary.
data = data.drop(
    ['name', 'ticket', 'cabin', 'body', 'boat', 'home.dest'],
    axis='columns',
)

In [28]:
# Confirm which columns remain after the drop.
data.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked'],
      dtype='object')

### Data Cleanup - Dropping rows with missing values

In [29]:
# Keep only rows that have a value in every remaining column.
data = data.dropna(axis=0, how='any')

In [30]:
# Non-null count per column; equal counts across columns mean no gaps remain.
data.count()

pclass      1043
survived    1043
sex         1043
age         1043
sibsp       1043
parch       1043
fare        1043
embarked    1043
dtype: int64

### Non-numeric data

In [32]:
# List column dtypes to spot the non-numeric (object) columns.
data.dtypes

pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object

In [33]:
# Distinct values present in the 'sex' column.
data['sex'].unique()

array(['female', 'male'], dtype=object)

In [35]:
# Distinct values present in the 'embarked' column.
data['embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

### Encode non-numeric categorical data

In [38]:
from sklearn import preprocessing

# Encode on a separate deep copy so `data` keeps its original string values.
encoded_data = data.copy(deep=True)
# Encoder instance used by the following cell.
le = preprocessing.LabelEncoder()

In [39]:
# Use one encoder per column so that both label <-> integer mappings remain
# available afterwards (via classes_ / inverse_transform). Refitting a single
# shared encoder would leave it fitted on 'embarked' only and discard the
# mapping that was used for 'sex'.
sex_encoder = preprocessing.LabelEncoder()
embarked_encoder = le  # existing encoder; ends up fitted on 'embarked' as before

# Replace the string categories with their integer codes.
encoded_data['sex'] = sex_encoder.fit_transform(encoded_data['sex'])
encoded_data['embarked'] = embarked_encoder.fit_transform(encoded_data['embarked'])

### Encoded data

In [40]:
# Preview the first five rows after encoding.
encoded_data.head(n=5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,0,29.0,0,0,211.3375,2
1,1,1,1,0.9167,1,2,151.55,2
2,1,0,0,2.0,1,2,151.55,2
3,1,0,1,30.0,1,2,151.55,2
4,1,0,0,25.0,1,2,151.55,2


In [41]:
# Check the dtypes after encoding: 'sex' and 'embarked' should now be integers.
encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1308
Data columns (total 8 columns):
pclass      1043 non-null int64
survived    1043 non-null int64
sex         1043 non-null int32
age         1043 non-null float64
sibsp       1043 non-null int64
parch       1043 non-null int64
fare        1043 non-null float64
embarked    1043 non-null int32
dtypes: float64(2), int32(2), int64(4)
memory usage: 65.2 KB


### Separate out features and labels

In [42]:
# Target vector: the 'survived' column as a NumPy array.
labels = encoded_data['survived'].values
# Feature matrix: all other columns, in their existing order.
features = encoded_data.drop('survived', axis='columns').values

In [43]:
# Display the feature matrix (NumPy abbreviates large arrays with "...").
features

array([[   1.    ,    0.    ,   29.    , ...,    0.    ,  211.3375,    2.    ],
       [   1.    ,    1.    ,    0.9167, ...,    2.    ,  151.55  ,    2.    ],
       [   1.    ,    0.    ,    2.    , ...,    2.    ,  151.55  ,    2.    ],
       ..., 
       [   3.    ,    1.    ,   26.5   , ...,    0.    ,    7.225 ,    0.    ],
       [   3.    ,    1.    ,   27.    , ...,    0.    ,    7.225 ,    0.    ],
       [   3.    ,    1.    ,   29.    , ...,    0.    ,    7.875 ,    2.    ]])

In [44]:
# Display the label vector taken from the 'survived' column.
labels

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)