In [99]:
import numpy as np
import IPython
import timeit
import random
import time
from IPython import display
import scipy as sp
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegressionCV,LinearRegression
from sklearn.linear_model.coordinate_descent import ConvergenceWarning
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import warnings
# Installs a blanket "ignore" filter: from this point on every warning category
# is silenced, including pandas SettingWithCopyWarning / FutureWarning messages.
# NOTE(review): consider narrowing this (e.g. to ConvergenceWarning, which is
# imported above) so that genuine data-handling warnings stay visible — confirm intent.
warnings.filterwarnings('ignore')

In [100]:
# Load the training and test/validation CSVs (paths are relative to the
# current working directory).
data_raw = pd.read_csv('train.csv')
data_val = pd.read_csv('test.csv')

# Work on a deep copy so data_raw stays untouched for reference.
data1 = data_raw.copy(deep=True)

# The list holds references to the two frames, so in-place changes made while
# iterating over it are visible through data1 / data_val as well.
data_cleaner = [data1, data_val]

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
data1.info()
data1.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
115,116,0,3,"Pekoniemi, Mr. Edvard",male,21.0,0,0,STON/O 2. 3101294,7.925,,S
295,296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C
463,464,0,2,"Milling, Mr. Jacob Christian",male,48.0,0,0,234360,13.0,,S
646,647,0,3,"Cor, Mr. Liudevit",male,19.0,0,0,349231,7.8958,,S
772,773,0,2,"Mack, Mrs. (Mary)",female,57.0,0,0,S.O./P.P. 3,10.5,E77,S
517,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q
734,735,0,2,"Troupiansky, Mr. Moses Aaron",male,23.0,0,0,233639,13.0,,S
749,750,0,3,"Connaghton, Mr. Michael",male,31.0,0,0,335097,7.75,,Q
303,304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S


In [101]:
# Report the per-column count of missing values for each frame, followed by a
# ten-dash separator line.
null_reports = (
    ('Train columns with null values:\n', data1),
    ('Test/Validation columns with null values:\n', data_val),
)
for heading, frame in null_reports:
    missing_per_column = frame.isnull().sum()
    print(heading, missing_per_column)
    print("-" * 10)

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test/Validation columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
----------


In [102]:
# Summary statistics for the training frame; include='all' covers the
# non-numeric (object) columns in addition to the numeric ones.
data1.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",male,,,,1601.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,,1.0,0.0,,31.0,,


In [103]:
# Fill missing values in both frames. Each frame is imputed from its own
# statistics (the median / mode are computed on `dataset` itself).
for dataset in data_cleaner:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # dataset[col]: that form mutates an intermediate Series and is not
    # guaranteed to write through to the frame (it is a no-op under pandas
    # Copy-on-Write).
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # mode() returns the most frequent value(s); take the first one
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # complete missing fare with median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Drop these columns from the training frame only. The drop stays in place on
# purpose: data_cleaner holds a reference to this exact object, so rebinding
# data1 to a new frame would detach it from the list.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
drop_column = ['PassengerId','Cabin', 'Ticket']
data1.drop(drop_column, axis=1, inplace=True, errors='ignore')

print(data1.isnull().sum())
print("-"*10)
print(data_val.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
----------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [104]:
# Feature engineering applied to both frames in place.
for dataset in data_cleaner:
    #Discrete variables
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    # now update to no/0 if family size is greater than 1.
    # One .loc call on the frame itself: the chained form
    # dataset['IsAlone'].loc[mask] = 0 assigns into an intermediate Series and
    # is silently ineffective under pandas Copy-on-Write.
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    #quick and dirty code split title from name: http://www.pythonforbeginners.com/dictionary/python-split
    # "Surname, Title. Given names" -> text between the first ", " and the next "."
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    #Continuous variable bins; qcut vs cut: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut
    # NOTE: both binnings are computed per frame, so the interval edges of the
    # train and test frames are derived independently and may differ.
    #Fare Bins/Buckets using qcut or frequency bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

    #Age Bins/Buckets using cut or value bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)


#cleanup rare title names
stat_min = 10 #while small is arbitrary, we'll use the common minimum in statistics: http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/
title_names = (data1['Title'].value_counts() < stat_min) #this will create a true false series with title name as index

# Collapse every title seen fewer than stat_min times in the training frame
# into 'Misc' (vectorized membership test instead of a per-row lambda lookup).
# NOTE(review): only data1 is regrouped here; data_val keeps its raw titles —
# confirm this is handled wherever Title is encoded.
rare_titles = title_names[title_names].index
data1.loc[data1['Title'].isin(rare_titles), 'Title'] = 'Misc'
print(data1['Title'].value_counts())
print("-"*10)


#preview data again
data1.info()
data_val.info()
data1.sample(10)

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
FareBin       891 non-null category
AgeBin        891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null 

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
513,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54.0,1,0,59.4,C,2,0,Mrs,"(31, 512.329]","(48, 64]"
300,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,28.0,0,0,7.75,Q,1,1,Miss,"[0, 7.91]","(16, 32]"
543,1,2,"Beane, Mr. Edward",male,32.0,1,0,26.0,S,2,0,Mr,"(14.454, 31]","(16, 32]"
506,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33.0,0,2,26.0,S,3,0,Mrs,"(14.454, 31]","(32, 48]"
609,1,1,"Shutes, Miss. Elizabeth W",female,40.0,0,0,153.4625,S,1,1,Miss,"(31, 512.329]","(32, 48]"
598,0,3,"Boulos, Mr. Hanna",male,28.0,0,0,7.225,C,1,1,Mr,"[0, 7.91]","(16, 32]"
408,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21.0,0,0,7.775,S,1,1,Mr,"[0, 7.91]","(16, 32]"
54,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,61.9792,C,2,0,Mr,"(31, 512.329]","(64, 80]"
766,0,1,"Brewe, Dr. Arthur Jackson",male,28.0,0,0,39.6,C,1,1,Misc,"(31, 512.329]","(16, 32]"
741,0,1,"Cavendish, Mr. Tyrell William",male,36.0,1,0,78.85,S,2,0,Mr,"(31, 512.329]","(32, 48]"


In [124]:
# pd.cut demo: place each age into one of the hand-picked intervals whose
# edges are listed in `bins` (intervals are closed on the right by default).
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(type(cats))
print(cats)
# Series.value_counts() replaces the deprecated top-level pd.value_counts();
# like the original call it lists the bins by descending frequency.
print(pd.Series(cats).value_counts())

<class 'pandas.core.categorical.Categorical'>
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64


In [127]:
# pd.qcut demo: split 1000 standard-normal draws into 4 quantile-based bins,
# so each bin receives the same number of samples.
# A locally seeded RandomState makes the printed bin edges reproducible on
# re-run without touching NumPy's global random state.
data = np.random.RandomState(0).randn(1000)
cats = pd.qcut(data, 4)
print(cats)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(pd.Series(cats).value_counts())

[[-3.493, -0.608], (0.0532, 0.81], (0.0532, 0.81], [-3.493, -0.608], (0.81, 2.916], ..., (-0.608, 0.0532], (0.81, 2.916], [-3.493, -0.608], [-3.493, -0.608], (0.0532, 0.81]]
Length: 1000
Categories (4, object): [[-3.493, -0.608] < (-0.608, 0.0532] < (0.0532, 0.81] < (0.81, 2.916]]
(0.81, 2.916]       250
(0.0532, 0.81]      250
(-0.608, 0.0532]    250
[-3.493, -0.608]    250
dtype: int64
