In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

In [24]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
submission = pd.read_csv('../input/gender_submission.csv')

# 数据清洗
def data_pre(train, test):
    '''数据预处理'''
    # 删除PassengerId，Name，Ticket
    train = train.drop(['Name', 'PassengerId', 'Ticket'], axis=1)
    test = test.drop(['Name', 'PassengerId', 'Ticket'], axis=1)
    # 缺失值（Embarked）删除缺失的行
    train = train.dropna(subset='Embarked')
    # 缺失值（Fare）删除缺失的行
    test = test.dropna(subset='Fare')
    return train, test

train, test = data_pre(train, test)


In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Cabin     202 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 69.5+ KB


In [22]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [23]:
test.info()
# 缺失值填充（age）

<class 'pandas.core.frame.DataFrame'>
Index: 417 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    417 non-null    int64  
 1   Sex       417 non-null    object 
 2   Age       331 non-null    float64
 3   SibSp     417 non-null    int64  
 4   Parch     417 non-null    int64  
 5   Fare      417 non-null    float64
 6   Cabin     91 non-null     object 
 7   Embarked  417 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 29.3+ KB


In [None]:
df = train.copy()
# 将数据分为有缺失值和无缺失值两部分
known_age = df[df['Age'].notnull()]
unknown_age = df[df['Age'].isnull()]

# 训练模型
X_train = known_age.drop('Age', axis=1)
y_train = known_age['Age']
model = RandomForestRegressor()
model.fit(X_train, y_train)

# 预测缺失值
predicted_age = model.predict(unknown_age.drop('Age', axis=1))
df.loc[df['Age'].isnull(), 'Age'] = predicted_age

In [33]:
def encoder(df):
    label_encoder = LabelEncoder()
    # 对性别列进行编码
    df['Sex_encoded'] = label_encoder.fit_transform(df['Sex'])
    return df

df = encoder(train)

In [34]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Sex_encoded
0,0,3,male,22.0,1,0,7.2500,,S,1
1,1,1,female,38.0,1,0,71.2833,C85,C,0
2,1,3,female,26.0,0,0,7.9250,,S,0
3,1,1,female,35.0,1,0,53.1000,C123,S,0
4,0,3,male,35.0,0,0,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S,1
887,1,1,female,19.0,0,0,30.0000,B42,S,0
888,0,3,female,,1,2,23.4500,,S,0
889,1,1,male,26.0,0,0,30.0000,C148,C,1
