In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated training data into a DataFrame.
train = pd.read_csv("data/train.tsv", sep="\t")

In [4]:
train.head()

Unnamed: 0,id,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,3,1,1,female,35.0,1,0,53.1,S
1,4,0,3,male,35.0,0,0,8.05,S
2,7,0,3,male,2.0,3,1,21.075,S
3,9,1,2,female,14.0,1,0,30.0708,C
4,11,1,1,female,58.0,0,0,26.55,S


In [5]:
# Read the tab-separated test data into a DataFrame.
test = pd.read_csv("data/test.tsv", sep='\t')

In [6]:
test.head()

Unnamed: 0,id,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,2,3,female,26.0,0,0,7.925,S
3,5,3,male,,0,0,8.4583,Q
4,6,1,male,54.0,0,0,51.8625,S


In [7]:
test.shape

(446, 8)

In [8]:
type(train)

pandas.core.frame.DataFrame

In [9]:
train.isnull().sum()

id           0
survived     0
pclass       0
sex          0
age         85
sibsp        0
parch        0
fare         0
embarked     2
dtype: int64

ageに欠損が多数(85件)。あとembarked(出港地)にも欠損が2件。欠損レコードを出力してみたいが……

In [10]:
train.isnull().any()

id          False
survived    False
pclass      False
sex         False
age          True
sibsp       False
parch       False
fare        False
embarked     True
dtype: bool

とりあえず平均値で補完することにする

In [11]:
# Replace missing ages with the mean of the ages that are present.
# This builds a new Series; `train` itself is not modified here.
newage = train['age'].fillna(value=train['age'].mean())

In [12]:
type(newage)

pandas.core.series.Series

In [13]:
# Write the imputed ages back into the training frame (overwrites the column).
train['age']=newage

In [14]:
train.isnull().any()

id          False
survived    False
pclass      False
sex         False
age         False
sibsp       False
parch       False
fare        False
embarked     True
dtype: bool

In [15]:
# Target vector: the `survived` label column.
y_train = train.loc[:, 'survived']

In [16]:
# Baseline design matrix: the five numeric columns only.
X_train = train.loc[:, ['pclass', 'age', 'sibsp', 'parch', 'fare']]

ロジスティック回帰分析してみる

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic-regression classifier created with the library's default settings.
lr = LogisticRegression()

In [19]:
# Fit the baseline model on the numeric-only features.
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Accuracy on the same rows the model was fitted on (not a held-out estimate).
lr.score(X_train,y_train)

0.71011235955056184

# 説明変数に性別を加える。

ダミー変数化して、元のX_trainに連結

In [21]:
# One-hot encode sex: one indicator column per category.
dsex = pd.get_dummies(data=train['sex'])

In [22]:
# Preview only the first rows instead of rendering the whole indicator frame.
dsex.head(10)

Unnamed: 0,female,male
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0
5,0,1
6,0,1
7,1,0
8,0,1
9,1,0


In [23]:
# Append the sex indicator columns to the right of the numeric features.
X2_train = pd.concat(objs=[X_train, dsex], axis='columns')

In [24]:
X2_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,female,male
0,1,35.0,1,0,53.1,1,0
1,3,35.0,0,0,8.05,0,1
2,3,2.0,3,1,21.075,0,1
3,2,14.0,1,0,30.0708,1,0
4,1,58.0,0,0,26.55,1,0


In [25]:
# Refit a fresh classifier on the features extended with the sex indicators.
lr = LogisticRegression()
lr.fit(X2_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Training-set accuracy of the model that includes the sex indicators.
lr.score(X2_train,y_train)

0.80449438202247192

## 正解率(accuracy)が0.8まで上昇。出港地を加えることを考えることにする

出港地が欠けていたのは2レコードだけなので、最も多い出港地で補完してみることにする。

In [27]:
train['embarked'].describe()

count     443
unique      3
top         S
freq      325
Name: embarked, dtype: object

topがSといっているので、Sで補完する

In [28]:
# Fill the missing embarkation ports with the most frequent port. The value is
# taken from the column's mode instead of being hard-coded, so the imputation
# follows the data (for this data set the mode is 'S').
emb=train['embarked'].fillna(train['embarked'].mode().iloc[0])

In [29]:
emb.describe()

count     445
unique      3
top         S
freq      327
Name: embarked, dtype: object

In [30]:
emb.isnull().sum()

0

ダミー変数化しなくちゃいけなかった。

In [31]:
# One-hot encode the imputed embarkation port.
demb = pd.get_dummies(data=emb)

In [32]:
demb.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,0,0,1
2,0,0,1
3,1,0,0
4,0,0,1


In [33]:
# Full feature set: numeric columns, then the sex indicators, then the
# embarkation-port indicators, joined side by side in a single concat.
X3_train = pd.concat([X_train, dsex, demb], axis=1)

# Refit a fresh classifier on the full feature set.
lr = LogisticRegression()
lr.fit(X3_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# Training-set accuracy of the model that also includes the port indicators.
lr.score(X3_train, y_train)

0.80674157303370786

In [35]:
X3_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,female,male,C,Q,S
0,1,35.0,1,0,53.1,1,0,0,0,1
1,3,35.0,0,0,8.05,0,1,0,0,1
2,3,2.0,3,1,21.075,0,1,0,0,1
3,2,14.0,1,0,30.0708,1,0,1,0,0
4,1,58.0,0,0,26.55,1,0,0,0,1


##### 教師データで評価してみる

In [36]:
# Predict labels for the training rows themselves, to check the score by hand.
y_train_predicted=lr.predict(X3_train)

In [37]:
# Fraction of training rows whose predicted label matches the true label.
(y_train_predicted == y_train).mean()

0.80674157303370786

## このモデルで投稿してみることにする

In [36]:
# Reload the test data so that preprocessing starts from the raw file.
test = pd.read_csv("data/test.tsv", sep='\t')

In [37]:
test.describe()

Unnamed: 0,id,pclass,age,sibsp,parch,fare
count,446.0,446.0,354.0,446.0,446.0,446.0
mean,434.022422,2.320628,30.194915,0.5,0.331839,30.452381
std,257.820954,0.838873,14.89914,1.002805,0.756823,47.186192
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,215.75,2.0,21.0,0.0,0.0,7.8958
50%,424.5,3.0,28.25,0.0,0.0,13.5
75%,653.75,3.0,39.0,1.0,0.0,30.0
max,890.0,3.0,74.0,8.0,6.0,512.3292


In [38]:
test.isnull().sum()

id           0
pclass       0
sex          0
age         92
sibsp        0
parch        0
fare         0
embarked     0
dtype: int64

In [39]:
# Fill missing test ages with the mean of the test ages that are present.
# NOTE(review): the training rows were imputed with the training mean, so the
# two splits use different fill values; consider reusing the training statistic.
test['age'] = test['age'].fillna(value=test['age'].mean())

In [40]:
test.isnull().sum()

id          0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

In [41]:
train.describe()

Unnamed: 0,id,survived,pclass,age,sibsp,parch,fare
count,445.0,445.0,445.0,445.0,445.0,445.0,445.0
mean,456.002247,0.402247,2.296629,29.211583,0.546067,0.431461,33.959971
std,256.703351,0.490903,0.834024,12.72753,1.195247,0.850489,52.079492
min,3.0,0.0,1.0,0.67,0.0,0.0,0.0
25%,227.0,0.0,2.0,22.0,0.0,0.0,7.925
50%,463.0,0.0,3.0,29.211583,0.0,0.0,15.0
75%,679.0,1.0,3.0,35.0,1.0,1.0,31.3875
max,888.0,1.0,3.0,80.0,8.0,5.0,512.3292


In [42]:
test.head()

Unnamed: 0,id,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,2,3,female,26.0,0,0,7.925,S
3,5,3,male,30.194915,0,0,8.4583,Q
4,6,1,male,54.0,0,0,51.8625,S


In [43]:
X3_train.head(2)

Unnamed: 0,pclass,age,sibsp,parch,fare,female,male,C,Q,S
0,1,35.0,1,0,53.1,1,0,0,0,1
1,3,35.0,0,0,8.05,0,1,0,0,1


In [44]:
# Build the test design matrix with the same steps used for X3_train:
# numeric columns, then one-hot sex, then one-hot embarkation port.
X_test=test[['pclass','age','sibsp','parch','fare']]
dsex=pd.get_dummies(test['sex'])
demb=pd.get_dummies(test['embarked'])
# The dummies are derived from the test data alone, so a category that is
# absent from the test split would be missing here, and the column order is
# not guaranteed to match. Align to the training columns explicitly: an
# indicator missing from test is added as all zeros, and the order is forced
# to be the one the model was fitted with.
X_test=pd.concat([X_test,dsex,demb],axis=1).reindex(columns=X3_train.columns,fill_value=0)

In [45]:
# Predict survival for the test rows.
# NOTE(review): this rebinds `test` from the input DataFrame to the array of
# predictions; the cells below rely on `test` being that array.
test=lr.predict(X_test)

In [46]:
# First attempt at the submission file. Because the index is written by
# default, the first column is the positional row index, not the `id` column.
pd.DataFrame(test).to_csv("submit.tsv",sep='\t',header=False)

一列目はIDにしないといけなかった。。。

In [47]:
X_test.head(2)

Unnamed: 0,pclass,age,sibsp,parch,fare,female,male,C,Q,S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0


In [48]:
type(test)

numpy.ndarray

In [49]:
# Wrap the prediction array in a one-column DataFrame (the column is named 0).
Y_ans=pd.DataFrame(test)

In [50]:
# Re-read the raw test file to recover the `id` column; `test` now holds the
# predictions rather than the input frame.
test_org = pd.read_csv("data/test.tsv", sep='\t')

In [51]:
test_org.head()

Unnamed: 0,id,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,2,3,female,26.0,0,0,7.925,S
3,5,3,male,,0,0,8.4583,Q
4,6,1,male,54.0,0,0,51.8625,S


In [52]:
X_test.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,female,male,C,Q,S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1
3,3,30.194915,0,0,8.4583,0,1,0,1,0
4,1,54.0,0,0,51.8625,0,1,0,0,1


In [53]:
Y_ans.head()

Unnamed: 0,0
0,0
1,1
2,1
3,0
4,0


In [54]:
# Pair each test id with its prediction; concat lines the two up by row index.
ans = pd.concat(objs=[test_org['id'], Y_ans], axis='columns')

In [55]:
ans.head(30)

Unnamed: 0,id,0
0,0,0
1,1,1
2,2,1
3,5,0
4,6,0
5,8,1
6,10,1
7,12,0
8,14,1
9,15,1


In [56]:
# Write the final submission: tab-separated, no header row and no index
# column, so each line holds the id followed by the predicted label.
ans.to_csv("submit.tsv",sep='\t',header=False,index=False)

In [57]:
type(X_train)

pandas.core.frame.DataFrame