In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the three input CSVs (training set, test set, sample submission),
# in that order, from the shared ../input directory.
train, test, submission = map(
    pd.read_csv,
    (
        '../input/train.csv',
        '../input/test.csv',
        '../input/gender_submission.csv',
    ),
)

# 数据清洗
def data_pre(train, test):
    """Drop unused columns and rows with missing key values.

    Neither input frame is modified; new frames are returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the columns ``Name``, ``PassengerId``,
        ``Ticket`` and ``Cabin``. ``train`` must also contain ``Embarked``
        and ``test`` must contain ``Fare``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` without the four unused columns and without rows whose
        ``Embarked`` is missing; ``test`` without the four unused columns
        and without rows whose ``Fare`` is missing. Original index labels
        are kept (the index is not reset).
    """
    # Columns removed from both frames: Name, PassengerId, Ticket and Cabin.
    unused_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
    train = train.drop(columns=unused_columns)
    test = test.drop(columns=unused_columns)

    # `subset` is given as a list: a bare string label is only accepted by
    # pandas >= 1.5, while a list works on every version.
    # Missing Embarked -> drop the row from the training data.
    train = train.dropna(subset=['Embarked'])
    # Missing Fare -> drop the row from the test data.
    # NOTE(review): any test row removed here gets no prediction later on;
    # confirm the submission step accounts for the missing passenger(s).
    test = test.dropna(subset=['Fare'])
    return train, test

def encoder1(df):
    """Encode the categorical columns ``Sex`` and ``Embarked``.

    ``Sex`` is replaced by an integer column ``Sex_encoded`` produced by a
    freshly fitted ``LabelEncoder``; ``Embarked`` is replaced by one
    indicator column per distinct value, named ``Embarked_<value>``.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Sex`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns in their original order, followed by
        ``Sex_encoded`` and then the ``Embarked_*`` indicator columns.

    Notes
    -----
    The encoder and the indicator columns are derived from the values
    present in ``df`` alone, so two frames encoded by separate calls only
    share the same codes/columns if they contain the same category values.
    """
    # assign() builds a new frame, so the caller's DataFrame never gains a
    # stray Sex_encoded column as a side effect of calling this function.
    encoded = df.assign(
        Sex_encoded=LabelEncoder().fit_transform(df['Sex'])
    ).drop(columns='Sex')

    # One-hot encode Embarked; get_dummies appends the indicator columns at
    # the end and removes the source column in a single step.
    return pd.get_dummies(encoded, columns=['Embarked'], prefix='Embarked')

def fill_age(df, sig, random_state=42, keep_target=False):
    """Fill missing ``Age`` values with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows whose ``Age`` is known,
    using every other column as a predictor, and its predictions replace the
    missing ``Age`` values. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Age`` column; every other column is passed to the
        regressor as a feature. When ``sig == 'train'`` it must also contain
        ``Survived``, which is never used as a predictor.
    sig : str
        ``'train'`` marks a frame that carries the ``Survived`` label; any
        other value is treated as an unlabeled frame.
    random_state : int or None, default 42
        Seed passed to the regressor so the imputed ages are reproducible.
        Pass ``None`` for unseeded behaviour.
    keep_target : bool, default False
        Only relevant when ``sig == 'train'``. With the default ``False`` the
        returned frame has NO ``Survived`` column (the behaviour existing
        callers rely on). Pass ``True`` to keep the label in the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with no missing ``Age`` values (and, for ``'train'``
        with ``keep_target=False``, without ``Survived``).

    Raises
    ------
    ValueError
        If ``Age`` is missing in every row, so there is nothing to fit on.
    """
    # Work on a copy so the caller's frame is never written to.
    result = df.copy()
    is_train = sig == 'train'

    # The label must not be used as a predictor for Age.
    features = result.drop('Survived', axis=1) if is_train else result
    missing = features['Age'].isnull()

    # Nothing to impute: skip fitting entirely (predicting on zero rows
    # would raise inside scikit-learn).
    if missing.any():
        if missing.all():
            raise ValueError('fill_age: no rows with a known Age to train on')

        # Fit on the rows where Age is known.
        known_age = features[~missing]
        model = RandomForestRegressor(random_state=random_state)
        model.fit(known_age.drop('Age', axis=1), known_age['Age'])

        # Predict Age for the remaining rows and write it back.
        predicted_age = model.predict(features[missing].drop('Age', axis=1))
        result.loc[missing, 'Age'] = predicted_age

    if is_train and not keep_target:
        result = result.drop('Survived', axis=1)
    return result

# Preprocessing pipeline: clean -> encode categoricals -> impute Age.
# Within each stage the training frame is processed before the test frame.
train, test = data_pre(train, test)
train, test = (encoder1(frame) for frame in (train, test))
train, test = (
    fill_age(frame, tag)
    for frame, tag in ((train, 'train'), (test, 'test'))
)

In [12]:
# Show the schema summary and the first rows of each preprocessed frame,
# training data first, then test data.
for _frame in (train, test):
    _frame.info()
    display(_frame.head())

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       889 non-null    int64  
 1   Age          889 non-null    float64
 2   SibSp        889 non-null    int64  
 3   Parch        889 non-null    int64  
 4   Fare         889 non-null    float64
 5   Sex_encoded  889 non-null    int32  
 6   Embarked_C   889 non-null    bool   
 7   Embarked_Q   889 non-null    bool   
 8   Embarked_S   889 non-null    bool   
dtypes: bool(3), float64(2), int32(1), int64(3)
memory usage: 47.7 KB


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_encoded,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,False,False,True
1,1,38.0,1,0,71.2833,0,True,False,False
2,3,26.0,0,0,7.925,0,False,False,True
3,1,35.0,1,0,53.1,0,False,False,True
4,3,35.0,0,0,8.05,1,False,False,True


<class 'pandas.core.frame.DataFrame'>
Index: 417 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       417 non-null    int64  
 1   Age          417 non-null    float64
 2   SibSp        417 non-null    int64  
 3   Parch        417 non-null    int64  
 4   Fare         417 non-null    float64
 5   Sex_encoded  417 non-null    int32  
 6   Embarked_C   417 non-null    bool   
 7   Embarked_Q   417 non-null    bool   
 8   Embarked_S   417 non-null    bool   
dtypes: bool(3), float64(2), int32(1), int64(3)
memory usage: 22.4 KB


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_encoded,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,1,False,True,False
1,3,47.0,1,0,7.0,0,False,False,True
2,2,62.0,0,0,9.6875,1,False,True,False
3,3,27.0,0,0,8.6625,1,False,False,True
4,3,22.0,1,1,12.2875,0,False,False,True


In [8]:
from sklearn.linear_model import LogisticRegression

