"""
Filename: experimentation8_dataset.ipynb
Author: SPTAU
"""

对数据集进行处理

导入库

In [1]:
import os

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


设置 dataset 地址

In [14]:
# Directory that holds every Titanic CSV referenced below; edit this one value
# to point the notebook at a different dataset location.
DATASET_DIR = "./dataset/titanic"

# Training split, read into `train_data` further down.
TRAIN_PATH = f"{DATASET_DIR}/train.csv"
# The next two paths are declared here but not used by the cells visible in this notebook.
TEST_PATH = f"{DATASET_DIR}/test.csv"
SAMPLE_SUBMISSION_PATH = f"{DATASET_DIR}/gender_submission.csv"
# Destination of the cleaned training table written by the final cell.
PROCESSED_TRAIN_PATH = f"{DATASET_DIR}/processed_train.csv"

读取 dataset

In [3]:
# Read the CSV at TRAIN_PATH into a pandas DataFrame; every later cell works on `train_data`.
train_data = pd.read_csv(TRAIN_PATH)

显示 dataset 信息

In [4]:
# Print column names, non-null counts and dtypes to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


丢弃 PassengerId、Name、Ticket 三列数据

In [5]:
# Remove the PassengerId, Name and Ticket columns, then re-check the schema.
train_data = train_data.drop(
    columns=['PassengerId', 'Name', 'Ticket'],
)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


Age 数据缺失，现在使用均值进行补充

In [6]:
# Mean of the known ages (Series.mean skips NaN by default).
avg_age = train_data['Age'].mean()
# Rebuild the frame with the Age gaps filled by that mean; column order is unchanged.
train_data = train_data.assign(Age=train_data['Age'].fillna(avg_age))
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


丢弃 Cabin 数据

In [7]:
# Remove the Cabin column, then re-check the schema.
train_data = train_data.drop(columns='Cabin')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


查看 Embarked 数据中的众数

In [8]:
# Frequency of each Embarked value, most common first; used to pick the fill value for the gaps.
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

Embarked 数据缺失，现在使用众数进行补充

In [9]:
# Fill the missing Embarked entries with the column's most frequent value.
# The value is computed rather than hard-coded so the cell stays correct if the
# data changes. Series.mode() ignores NaN by default and returns the modes
# sorted, so taking the first element is deterministic even on a tie.
embarked_mode = train_data['Embarked'].mode().iloc[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


查看 Sex 数据的情况

In [10]:
# Peek at the first rows of Sex to see the raw values before encoding them.
train_data['Sex'].head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

将 Sex 数据的值映射成数值

In [11]:
# Encode Sex as integers: male -> 0, female -> 1.
sex_2_dict = {"male": 0, "female": 1}
sex_encoded = train_data['Sex'].map(sex_2_dict)
# Series.map yields NaN for any value that is not a key of the dict. That
# happens for unexpected labels, and also when this cell is re-run on the
# already-encoded column. Validate before assigning so a failure leaves
# `train_data` untouched instead of silently wiping the column.
if sex_encoded.isna().any():
    raise ValueError(
        "Sex contains values outside sex_2_dict "
        "(was this cell already run on encoded data?)"
    )
train_data['Sex'] = sex_encoded
train_data.info()

查看 Embarked 数据的情况

In [12]:
# Peek at the first rows of Embarked to see the raw values before encoding them.
train_data['Embarked'].head()

0    S
1    C
2    S
3    S
4    S
Name: Embarked, dtype: object

将 Embarked 数据的值映射成数值

In [13]:
# Encode Embarked as integers: C -> 0, Q -> 1, S -> 2.
embarked_2_dict = {"C": 0, "Q": 1, "S": 2}
embarked_encoded = train_data['Embarked'].map(embarked_2_dict)
# Series.map yields NaN for any value that is not a key of the dict. That
# happens for unexpected or still-missing labels, and also when this cell is
# re-run on the already-encoded column. Validate before assigning so a failure
# leaves `train_data` untouched instead of silently wiping the column.
if embarked_encoded.isna().any():
    raise ValueError(
        "Embarked contains values outside embarked_2_dict "
        "(missing values, or was this cell already run on encoded data?)"
    )
train_data['Embarked'] = embarked_encoded
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [16]:
# Show the first rows of the fully processed table as a final sanity check.
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,2
1,1,1,1,38.0,1,0,71.2833,0
2,1,3,1,26.0,0,0,7.925,2
3,1,1,1,35.0,1,0,53.1,2
4,0,3,0,35.0,0,0,8.05,2


In [1]:
# Write the processed table to PROCESSED_TRAIN_PATH as CSV, omitting the index column.
# NOTE(review): this cell needs `train_data` and PROCESSED_TRAIN_PATH from the earlier cells.
# The recorded output shows a NameError, presumably from running this cell on its own
# in a fresh kernel. Run the notebook top to bottom.
train_data.to_csv(PROCESSED_TRAIN_PATH, index=False)

NameError: name 'train_data' is not defined