## Exploring and Processing Data - Part 1

In [46]:
# imports
import pandas as pd
import numpy as np
import matplotlib
import os

### Import Data

In [3]:
# locate the raw CSV files relative to the notebook's parent directory
raw_data_path = os.path.join(os.path.pardir, 'data', 'raw')
train_file_path, test_file_path = (
    os.path.join(raw_data_path, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# load both CSV files, using the PassengerId column as the row index
# (every other read_csv parameter is left at its default)
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [5]:
# confirm that read_csv handed back a pandas DataFrame
type(train_df)

pandas.core.frame.DataFrame

### Basic Structure

In [7]:
# use .info() to get brief information about the dataframe:
# index range, non-null count and dtype per column, and memory usage
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [20]:
# preview the first rows of the training data
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# same structural summary for the test set, to compare its columns and
# missing-value counts with the training set
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
Pclass      418 non-null int64
Name        418 non-null object
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Ticket      418 non-null object
Fare        417 non-null float64
Cabin       91 non-null object
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [9]:
# The test set is loaded without a Survived column; add one filled with the
# sentinel -888 so test_df carries the same columns as train_df before the
# two frames are concatenated. -888 is a placeholder, not a real outcome.
test_df['Survived'] = -888 # Adding Survived with a default value

In [10]:
# check that the Survived column was added and holds the placeholder value
test_df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,-888
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,-888
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,-888
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,-888
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,-888


In [14]:
# Stack the train and test rows into a single frame for joint exploration.
# The two frames hold the same columns in a different order (Survived comes
# first in train_df but last in test_df), so the column axis is not aligned.
# Pass sort=True explicitly: it keeps the alphabetical column order that the
# label-range ('Age':'Pclass') and positional (iloc) selections in this
# notebook rely on, and it silences the FutureWarning about the default
# switching to sort=False.
df = pd.concat((train_df, test_df), axis=0, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [13]:
# verify the combined frame: row count should equal train rows + test rows,
# and Survived should have no nulls thanks to the placeholder in the test rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    1309 non-null int64
Ticket      1309 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB


In [16]:
# column selection using dot (attribute) notation; the result is a Series
# indexed by PassengerId
df.Name.head()

PassengerId
1                              Braund, Mr. Owen Harris
2    Cumings, Mrs. John Bradley (Florence Briggs Th...
3                               Heikkinen, Miss. Laina
4         Futrelle, Mrs. Jacques Heath (Lily May Peel)
5                             Allen, Mr. William Henry
Name: Name, dtype: object

In [18]:
# selecting multiple columns using a list of column name strings;
# the columns come back in the order given in the list
df[['Name','Age']].head()

Unnamed: 0_level_0,Name,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",22.0
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
3,"Heikkinen, Miss. Laina",26.0
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
5,"Allen, Mr. William Henry",35.0


In [19]:
# label-based indexing with loc: rows whose PassengerId label runs from 5
# through 10 (both ends included), with every column selected
df.loc[5:10, :]

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0,373450
6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,0,330877
7,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,1,male,0,0,17463
8,2.0,,S,21.075,"Palsson, Master. Gosta Leonard",1,3,male,3,0,349909
9,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3,female,0,1,347742
10,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2,female,1,1,237736


In [23]:
# selecting column range: a label slice includes both end labels and follows
# the frame's current column order, so 'Age' must come before 'Pclass'
df.loc[5:10, 'Age':'Pclass']

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3
6,,,Q,8.4583,"Moran, Mr. James",0,3
7,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,1
8,2.0,,S,21.075,"Palsson, Master. Gosta Leonard",1,3
9,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3
10,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2


In [24]:
# selecting discrete columns: pass a list of labels; the result keeps the
# order of the list rather than the frame's column order
df.loc[5:10,['Survived','Fare','Embarked']]

Unnamed: 0_level_0,Survived,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,0,8.05,S
6,0,8.4583,Q
7,0,51.8625,S
8,0,21.075,S
9,1,11.1333,S
10,1,30.0708,C


In [25]:
# indexing : use iloc for position based indexing
# positions are zero-based and the end of each slice is excluded, so this
# returns row positions 5-9 and column positions 3-7
df.iloc[5:10,3:8]

Unnamed: 0_level_0,Fare,Name,Parch,Pclass,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,8.4583,"Moran, Mr. James",0,3,male
7,51.8625,"McCarthy, Mr. Timothy J",0,1,male
8,21.075,"Palsson, Master. Gosta Leonard",1,3,male
9,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3,female
10,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2,female


In [34]:
# keep only the rows whose Sex equals 'male' (boolean mask given to loc),
# then report how many rows matched
male_passengers = df.loc[df['Sex'].eq('male'), :]
num_male_passengers = male_passengers.shape[0]
print("Number of male passengers:{0}".format(num_male_passengers))

Number of male passengers:843


In [35]:
# combine boolean masks element-wise with & (and) or | (or);
# each comparison needs its own parentheses because & binds tighter than ==
male_passengers_first_class = df.loc[(df['Sex'] == 'male') & (df['Pclass'] == 1), :]
print('Number of male passengers in first class: {0}'.format(male_passengers_first_class.shape[0]))

Number of male passengers in first class: 179


### Summary Statistics

In [36]:
# use describe() to get statistics for all numeric columns
# NOTE: the Survived statistics are not meaningful here, because the test
# rows carry the -888 placeholder and drag mean/min/quartiles with them
df.describe()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0
mean,29.881138,33.295479,0.385027,2.294882,0.498854,-283.301757
std,14.413493,51.758668,0.86556,0.837836,1.041658,414.337413
min,0.17,0.0,0.0,1.0,0.0,-888.0
25%,21.0,7.8958,0.0,2.0,0.0,-888.0
50%,28.0,14.4542,0.0,3.0,0.0,0.0
75%,39.0,31.275,0.0,3.0,1.0,1.0
max,80.0,512.3292,9.0,3.0,8.0,1.0


In [37]:
# numerical feature: measures of central tendency for Fare
print('\n'.join((
    'Mean fare : {0}'.format(df['Fare'].mean()),      # arithmetic mean
    'Median fare :{0}'.format(df['Fare'].median()),   # middle value
)))

Mean fare : 33.2954792813
Median fare :14.4542


In [38]:
# dispersion measures for Fare: extremes, range, quartiles, variance and
# standard deviation, printed one per line
print('\n'.join([
    'Min fare: {0}'.format(df['Fare'].min()),
    'Max fare: {0}'.format(df['Fare'].max()),
    'Fare range: {0}'.format(df['Fare'].max() - df['Fare'].min()),
    '25 percentile:{0}'.format(df['Fare'].quantile(.25)),
    '50 percentile:{0}'.format(df['Fare'].quantile(.5)),
    '75 percentile:{0}'.format(df['Fare'].quantile(.75)),
    'Variance fare:{0}'.format(df['Fare'].var()),
    'Standard deviation fare:{0}'.format(df['Fare'].std()),
]))

Min fare: 0.0
Max fare: 512.3292
Fare range: 512.3292
25 percentile:7.8958
50 percentile:14.4542
75 percentile:31.275
Variance fare:2678.95973789
Standard deviation fare:51.7586682392


In [47]:
# ask IPython to render matplotlib figures inline, in the cell output
%matplotlib inline

In [50]:
# box-whisker plot of Fare, with a title and axis label so the figure can be
# read on its own; the trailing ';' suppresses the Axes repr in the output.
# NOTE(review): the saved output of this cell is "ImportError: matplotlib is
# required for plotting" even though matplotlib is imported in the first cell.
# That looks like stale kernel state (the execution counts are out of order),
# so restart the kernel and run all cells to confirm the plot renders.
df.Fare.plot(kind='box', title='Distribution of passenger fares').set_ylabel('Fare');

ImportError: matplotlib is required for plotting.