In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torchvision import transforms
from tqdm import *
import matplotlib.pyplot as plt
import copy
from torch.autograd.gradcheck import zero_gradients
import pandas  as pd 
import seaborn as sns
import re
import torch.utils.data as data

In [4]:
test = pd.read_csv("./titanic/test.csv")
train = pd.read_csv("./titanic/train.csv")

In [5]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [6]:
test['Embarked'].fillna(
    test.Embarked.mode().values[0], inplace=True)

In [7]:
train['Embarked'].fillna(
    test.Embarked.mode().values[0], inplace=True)

In [8]:
train['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [9]:
test['Age'].fillna(29.699118, inplace=True)
train['Age'].fillna(29.699118, inplace=True)

In [10]:
test['Fare'].fillna(32.204208, inplace=True)

In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [12]:
dummy_fields=['Pclass','Sex','Embarked']
for each in dummy_fields:
    dummies= pd.get_dummies(train[each], prefix= each, drop_first=False)
    train = pd.concat([train, dummies], axis=1)
train.head()    

fields_to_drop=['PassengerId', 'Cabin', 'Pclass', 'Name', 'Sex', 'Ticket', 'Embarked']
df=train.drop(fields_to_drop,axis=1)
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,0,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [13]:
dummy_fields=['Pclass', 'Sex', 'Embarked']
for each in dummy_fields:
    dummies= pd.get_dummies(test[each], prefix= each, drop_first=False)
    test = pd.concat([test, dummies], axis=1)
# train.head()  

fields_to_drop=['PassengerId','Cabin', 'Pclass', 'Name', 'Sex', 'Ticket', 'Embarked']
df_test=test.drop(fields_to_drop,axis=1)
df_test.head()



Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,62.0,0,0,9.6875,0,1,0,0,1,0,1,0
3,27.0,0,0,8.6625,0,0,1,0,1,0,0,1
4,22.0,1,1,12.2875,0,0,1,1,0,0,0,1


In [14]:
to_normalize=['Age','Fare']
for each in to_normalize:
    mean, std= df[each].mean(), df[each].std()
    df.loc[:, each]=(df[each]-mean)/std

df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,-0.592148,1,0,-0.502163,0,0,1,0,1,0,0,1
1,1,0.63843,1,0,0.786404,1,0,0,1,0,1,0,0
2,1,-0.284503,0,0,-0.48858,0,0,1,1,0,0,0,1
3,1,0.407697,1,0,0.420494,1,0,0,1,0,0,0,1
4,0,0.407697,0,0,-0.486064,0,0,1,0,1,0,0,1


In [15]:
to_normalize=['Age','Fare']
for each in to_normalize:
    mean, std= df_test[each].mean(), df_test[each].std()
    df_test.loc[:, each]=(df_test[each]-mean)/std

df_test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.343872,0,0,-0.497662,0,0,1,0,1,0,1,0
1,1.333057,1,0,-0.512511,0,0,1,1,0,0,0,1
2,2.520079,0,0,-0.464383,0,1,0,0,1,0,1,0
3,-0.249639,0,0,-0.482739,0,0,1,0,1,0,0,1
4,-0.645313,1,1,-0.417822,0,0,1,1,0,0,0,1


In [16]:
titanic_train_data_X = df.drop(['Survived'], axis=1)
titanic_train_data_Y = df['Survived']
titanic_test_data = df_test

In [17]:
train_data = torch.from_numpy(titanic_train_data_X.values).float()
train_label = torch.from_numpy(titanic_train_data_Y.values).float()
test_data = torch.from_numpy(titanic_test_data.values).float()

In [45]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# 进行数据封装，用于批训练
train_dataset = TensorDataset(train_data, train_label)
trainLoader = DataLoader(train_dataset, batch_size=4,
                         shuffle=True, num_workers=2)


import torch.nn as nn
import torch.nn.functional as F

# 定义模型
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(titanic_train_data_X.shape[1], 64),
            nn.ReLU(),
            nn.Linear(64, 100),
            nn.ReLU(),
            nn.Linear(100, 2),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.fc(x)


net = Net()

In [46]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()  # 定义损失函数
optimizer = optim.Adam(net.parameters(), lr=0.001)  # 定义优化器

import time
start = time.time()
for epoch in range(200):
    running_loss = 0.0
    for i, data in enumerate(trainLoader, 0):
        inputs, labels = data  # 获取数据
        optimizer.zero_grad()  # 清空梯度缓存

        outputs = net(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()  # 反向传播
        optimizer.step()  # 调整模型
        running_loss += loss.item()
        # if i % 20 == 19:
        #     # 每 20 次迭代打印一次信息
        #     print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss/2000))
        #     running_loss = 0.0
print('Finish Traning! Total cost time: ', time.time()-start)

Finish Traning! Total cost time:  130.4761345386505


In [47]:
# 初始化数值
correct = 0
total = 0
# evaluating时不计算导数
with torch.no_grad():
    for data in trainLoader:
        inputs, labels = data
        outputs = net(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %d %%' %
      (correct / total * 100))

Accuracy of the network on the 10000 test images: 88 %


In [48]:
output = torch.max(net(test_data),1)[1]

submission = pd.read_csv('./titanic/gender_submission.csv')
submission['Survived'] = output
submission.to_csv('./titanic/gender_submission.csv', index=False)

Epoach 200 lr =0.001<br>
*   64  50 2 88%   0.74641
*   40 100 2 87%.  0.76076
*   64 100 2 87%.  0.77033
*  80 100 2 88%.  0.74880
*   64 110 2 87%.  0.76555
*   64 125 2 87%.  0.75837
*   64 150 2 88%.  0.73684
<br>

Epoach 200 lr =0.015<br>
*   64 100 2 61%.  0.62200

