In [77]:
import numpy as np
import pandas as pd

In [78]:
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

In [79]:
train.shape,test.shape

((891, 12), (418, 11))

In [80]:
data = train.append(test,ignore_index=True)   #把原始数据和预测数据合并在一起，方便同时对这两个数据集进行清洗

In [81]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [82]:
data.info()   
#我们看到数据总共有1309行。其中数据类型列：年龄（age）、票价（fare）里面有缺失数据。
#字符串列：登船港口（embarked）、船舱号（cabin）里面有缺失数据。

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


各特征的含义：
PassengerId乘客编号；Survived生存情况；

Pclass客舱等级：1=1等舱，2=2等舱，3=3等舱；

SibSp船上兄弟姐妹数或者配偶数（也叫同代直系亲属人数）；Parch船上父母数或子女数（也叫不同代直系亲属人数）；Ticket票号；Fare票价；Cabin客舱号；

Embarked登船港口：出发地点S=英国南安普顿Southampton，途径地点1：C=法国 瑟堡市Cherbourg，途经地点2：Q=爱尔兰 昆士敦Queenstown

# 数据清洗

缺失值处理

In [83]:
#针对Age和Fare（两个连续型变量） ，均值填充
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [84]:
#针对Cabin（客舱号）：缺失值过多，可不使用此特征
#针对Embarked（登船港口）：使用众数填充
data['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [85]:
data['Embarked'] = data['Embarked'].fillna('S')

# 特征工程

数据和特征决定了机器学习的上限，而模型和算法只是逼近这个上限而已

特征工程就是最大限度地从原始数据中提取特征，以供机器学习算法和模型使用。所以特征选取的好坏会直接影响模型的效果。进行特征工程的方法=特征提取+特征选择和特征降维

三种数据类型：
1.数值数据（定量数据）：就是可以用数字来衡量的数据，可以是离散的也可以是连续的数据
2.分类数据（定性数据）：对事物进行描述，无法用数据进行量化（例如1代表男，0代表女）
3.时间序列数据：一段时间内定期收集的数字序列（如一个月内某股票的股价）

常用的特征提取方法是查看数据类型：数值数据---直接使用；分类数据---用数值来代替类别（超过2种类别使用one-hot编码）；时间序列数据---转化成单独的年、月、日。

one-hot编码：如果原始数据中有N种（超过2种）类别，那么我们将这一个特征扩展为N种特征，当原始数据是第i类别时，则这一类别扩展对应的第i个特征为1，其他都扩展成特征为0。通过one-hot编码后得到的新特征我们称之为虚拟变量/哑变量（dummy variables）。

In [86]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,29.881138,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,29.881138,0,0,359309,8.0500,,S


In [87]:
data['Sex'] = data['Sex'].map({'male':1,'female':0})

In [88]:
data['Sex']

0       1
1       0
2       0
3       0
4       1
       ..
1304    1
1305    0
1306    1
1307    1
1308    1
Name: Sex, Length: 1309, dtype: int64

In [89]:
Pclass_df = pd.get_dummies(data['Pclass'],prefix='Pclass')  #prefix前缀
Pclass_df

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1
...,...,...,...
1304,0,0,1
1305,1,0,0
1306,0,0,1
1307,0,0,1


In [90]:
Embarked_df = pd.get_dummies(data['Embarked'],prefix='Embarked')  #prefix前缀
Embarked_df

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
1304,0,0,1
1305,1,0,0
1306,0,0,1
1307,0,0,1


In [91]:
data = pd.concat([data,Pclass_df,Embarked_df],axis=1)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.000000,1,0,A/5 21171,7.2500,,S,0,0,1,0,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.000000,1,0,PC 17599,71.2833,C85,C,1,0,0,1,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0,0,1,0,0,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.000000,1,0,113803,53.1000,C123,S,1,0,0,0,0,1
4,5,0.0,3,"Allen, Mr. William Henry",1,35.000000,0,0,373450,8.0500,,S,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",1,29.881138,0,0,A.5. 3236,8.0500,,S,0,0,1,0,0,1
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",0,39.000000,0,0,PC 17758,108.9000,C105,C,1,0,0,1,0,0
1306,1307,,3,"Saether, Mr. Simon Sivertsen",1,38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1,0,0,1
1307,1308,,3,"Ware, Mr. Frederick",1,29.881138,0,0,359309,8.0500,,S,0,0,1,0,0,1


然后处理无直接类别但可能从里面提取出类别的字符串类型

1）乘客姓名（Name）

In [92]:
#同样的查看姓名这一列的数据你可以看到‘名’之后就是‘头衔’（称谓）
data['Name']

0                                 Braund, Mr. Owen Harris
1       Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                  Heikkinen, Miss. Laina
3            Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                                Allen, Mr. William Henry
                              ...                        
1304                                   Spector, Mr. Woolf
1305                         Oliva y Ocana, Dona. Fermina
1306                         Saether, Mr. Simon Sivertsen
1307                                  Ware, Mr. Frederick
1308                             Peter, Master. Michael J
Name: Name, Length: 1309, dtype: object

In [94]:
#定义函数：从姓名中获取头衔
def getTitle(name):
    str1 = name.split(',')[1]   #Mr. Owen Harris
    str2 = str1.split('.')[0]   #Mr
    str3 = str2.strip()         #strip() 方法用于移除字符串头尾指定的字符（默认为空格或换行符）或字符序列
    return str3

In [108]:
#存放提取后的特征
title_df = pd.DataFrame()
title_df['Title'] = data['Name'].map(getTitle)
title_df

Unnamed: 0,Title
0,Mr
1,Mrs
2,Miss
3,Mrs
4,Mr
...,...
1304,Mr
1305,Dona
1306,Mr
1307,Mr


In [109]:
#查看提取的‘头衔’特征中到底有哪些种头衔
title_df['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Ms                2
Mlle              2
Major             2
Sir               1
Jonkheer          1
Lady              1
Capt              1
Mme               1
Don               1
Dona              1
the Countess      1
Name: Title, dtype: int64

可以从网上搜素这些头衔的含义：

Mr.= mister，先生

Sir : 先生，长官 ,爵士(冠于爵士或准男爵名字之前的尊称)

Mrs.= mistress，太太/夫人

Miss：小姐，女士，年轻未婚女子；复数为misses

Ms ：已婚或未婚女子姓或姓名前的称呼

Madame：简写是Mme.,复数是mesdames(简写是Mme)

Mlle：小姐

Lady：女士，指成年女子，有些人尤其是长者认为这样说比较礼貌

Master : 大师，院长，主人等意思

Dona，是西班牙语对女子的称谓，相当于英语的 Lady Master，佣人对未成年男少主人的称呼,相当于汉语的"少爷"。

Don，n. <西>（置于男士名字前的尊称）先生，阁下；指导教师，大学教师

Jonkheer是贵族（维基百科的解释是：Jonkheer is a Dutch honorific of nobility.）

Rev.= reverend，牧师（对基督教教士的称谓，用于姓名前）

Dr.= doctor，医生/博士

Col.=Colonel，上校

Capt =Captain, 有船长，上尉的意思

major，有陆军少校的意思

The Countess：女伯爵



我们将以上18中头衔，归纳为以下几种类别：

Officer政府官员：capt , col , major, Dr, Rev

Royalty王室（皇室）: jonkheer , Don , Sir , the Countess , Dona, Lady

Mr已婚男士: Mr

Mrs已婚妇女 : Mme , Ms , Mrs ,

Miss年轻未婚女子: Mlle , Miss

Master有技能的人/教师: Master

In [110]:
#姓名中头衔字符串与定义头衔类别的映射关系
title_dict = {
    "Mr":"Mr",
    "Miss":"Miss",
    "Mrs":"Mrs",
    "Master":"Master",
    "Dr":"Officer",
    "Rev":"Officer",
    "Col":"Officer",
    "Ms":"Miss",
    "Mlle":"Miss",
    "Major":"Officer",
    "Sir":"Royalty",
    "Jonkheer":"Royalty",
    "Lady":"Royalty",
    "Capt":"Officer",
    "Mme":"Mrs",
    "Don":"Royalty",
    "Dona":"Royalty",
    "the Countess":"Royalty"
}

title_df['Title'] = title_df['Title'].map(title_dict) 
title_df['Title']

0            Mr
1           Mrs
2          Miss
3           Mrs
4            Mr
         ...   
1304         Mr
1305    Royalty
1306         Mr
1307         Mr
1308     Master
Name: Title, Length: 1309, dtype: object

In [113]:
#至此，从姓名中提取了6种类别，同样用one-hot编码，使姓名的类别用数值来代替
title_df = pd.get_dummies(title_df['Title'],prefix='Title')
title_df

Unnamed: 0,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,0,1,0,0,0
...,...,...,...,...,...,...
1304,0,0,1,0,0,0
1305,0,0,0,0,0,1
1306,0,0,1,0,0,0
1307,0,0,1,0,0,0


接下来看可以直接使用的数值数据。

数值数据有： 乘客编号（PassengerId），年龄（Age），船票价格（Fare），船上兄弟姐妹数/配偶数（也叫同代直系亲属人数）（SibSp），船上父母数/子女数（也叫不同代直系亲属人数）（Parch）

乘客编号：用来统计有多少乘客，没有使用价值。

年龄：不需要进一步处理，直接使用。

船票价格：不需要进一步不处理，直接使用（实际上船票价格和船舱等级（Pclass）可以等价，因为船票的高低对应了船舱等级好坏）



还剩两个即SibSp和Parch，可以看到他们属于一个大类---家庭（Family）

对于家庭，我们可以根据人数多少对家庭类别进行划分：

小家庭Family_Single：家庭人数=1
中等家庭Family_Small : 2<=家庭人数<=4
大家庭Family_Large : 家庭人数>=5
而家庭人数=同代直系亲属数（Parch）+不同带直系亲属数（SibSp）+乘客自己

In [116]:
#存放家庭信息
family_df = pd.DataFrame()
#家庭人数
family_df['FamilySize'] = data['SibSp']+data['Parch']+1
family_df['FamilySize']

0       2
1       2
2       1
3       2
4       1
       ..
1304    1
1305    1
1306    1
1307    1
1308    3
Name: FamilySize, Length: 1309, dtype: int64

In [117]:
family_df['FamilySingle'] = family_df['FamilySize'].map(lambda x:1 if x==1 else 0)
family_df['FamilySmall'] = family_df['FamilySize'].map(lambda x:1 if 2<=x<=4 else 0)
family_df['FamilyLarge'] = family_df['FamilySize'].map(lambda x:1 if x>=5 else 0)
family_df

Unnamed: 0,FamilySize,FamilySingle,FamilySmall,FamilyLarge
0,2,0,1,0
1,2,0,1,0
2,1,1,0,0
3,2,0,1,0
4,1,1,0,0
...,...,...,...,...
1304,1,1,0,0
1305,1,1,0,0
1306,1,1,0,0
1307,1,1,0,0


In [119]:
data = pd.concat([data,title_df,family_df],axis=1)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,FamilySize,FamilySingle,FamilySmall,FamilyLarge
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,...,0,0,1,0,0,0,2,0,1,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,...,0,0,0,1,0,0,2,0,1,0
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,...,0,1,0,0,0,0,1,1,0,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,...,0,0,0,1,0,0,2,0,1,0
4,5,0.0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,...,0,0,1,0,0,0,1,1,0,0


In [121]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty',
       'FamilySize', 'FamilySingle', 'FamilySmall', 'FamilyLarge'],
      dtype='object')

In [122]:
df = data[['Sex', 'Age', 'SibSp','Parch', 'Fare','Pclass_1', 'Pclass_2','Pclass_3', 
           'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master','Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 
           'Title_Royalty','FamilySingle', 'FamilySmall', 'FamilyLarge','Survived']]

# 构建模型

构建模型前，我们需要选择使用哪些特征来训练模型（特征选择的重要性前文已提到，不多赘述）

这里我们使用相关系数来选择特征：

In [123]:
#查看各个特征与生存情况（Survived）的相关系数
corr_df = df.corr()
corr_df 

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,FamilySingle,FamilySmall,FamilyLarge,Survived
Sex,1.0,0.057397,-0.109609,-0.213125,-0.185484,-0.107371,-0.028862,0.116562,-0.066564,-0.088651,...,0.164375,-0.676028,0.870678,-0.567801,0.087288,-0.020408,0.284537,-0.255196,-0.077748,-0.543351
Age,0.057397,1.0,-0.190747,-0.130872,0.171521,0.362587,-0.014193,-0.302093,0.076179,-0.012718,...,-0.363923,-0.253701,0.165476,0.199221,0.162818,0.059466,0.116675,-0.038189,-0.16121,-0.070323
SibSp,-0.109609,-0.190747,1.0,0.373587,0.160224,-0.034256,-0.052419,0.07261,-0.048396,-0.048678,...,0.329171,0.075519,-0.243104,0.063941,-0.013813,-0.010787,-0.591077,0.25359,0.699681,-0.035322
Parch,-0.213125,-0.130872,0.373587,1.0,0.221522,-0.013033,-0.010057,0.019521,-0.008635,-0.100943,...,0.253482,0.064589,-0.30478,0.216271,-0.032631,-0.030197,-0.549022,0.248532,0.624627,0.081629
Fare,-0.185484,0.171521,0.160224,0.221522,1.0,0.599956,-0.121372,-0.419616,0.286241,-0.130054,...,0.011596,0.090101,-0.192192,0.141701,0.028696,0.026214,-0.274826,0.197281,0.170853,0.257307
Pclass_1,-0.107371,0.362587,-0.034256,-0.013033,0.599956,1.0,-0.296526,-0.622172,0.325722,-0.166101,...,-0.084504,-0.013879,-0.099725,0.144126,0.098788,0.118561,-0.126551,0.165965,-0.067523,0.285904
Pclass_2,-0.028862,-0.014193,-0.052419,-0.010057,-0.121372,-0.296526,1.0,-0.56318,-0.134675,-0.121973,...,-0.016933,-0.022679,-0.038595,0.068386,0.07307,-0.035156,-0.035075,0.09727,-0.118495,0.093349
Pclass_3,0.116562,-0.302093,0.07261,0.019521,-0.419616,-0.622172,-0.56318,1.0,-0.17143,0.243706,...,0.086998,0.0306,0.117925,-0.180765,-0.145375,-0.073765,0.13825,-0.223338,0.15556,-0.322308
Embarked_C,-0.066564,0.076179,-0.048396,-0.008635,0.286241,0.325722,-0.134675,-0.17143,1.0,-0.164166,...,-0.014172,-0.016251,-0.065538,0.10096,0.003678,0.077213,-0.107874,0.159594,-0.092825,0.16824
Embarked_Q,-0.088651,-0.012718,-0.048678,-0.100943,-0.130054,-0.166101,-0.121973,0.243706,-0.164166,1.0,...,-0.009091,0.203538,-0.080224,-0.106723,-0.003212,-0.021853,0.127214,-0.122491,-0.018423,0.00365


In [126]:
corr_df['Survived'].map(lambda x:abs(x)).sort_values(ascending=False) 

Survived         1.000000
Title_Mr         0.549199
Sex              0.543351
Title_Mrs        0.341994
Title_Miss       0.335636
Pclass_3         0.322308
Pclass_1         0.285904
FamilySmall      0.279855
Fare             0.257307
FamilySingle     0.203367
Embarked_C       0.168240
Embarked_S       0.149683
FamilyLarge      0.125147
Pclass_2         0.093349
Title_Master     0.085221
Parch            0.081629
Age              0.070323
SibSp            0.035322
Title_Royalty    0.033391
Title_Officer    0.031316
Embarked_Q       0.003650
Name: Survived, dtype: float64

特征选择

In [128]:
df1 = df[['Title_Mr','Title_Mrs','Title_Miss','Pclass_3','Pclass_1','FamilySmall','Fare','FamilySingle','Embarked_C','Embarked_S','FamilyLarge','Survived']]

In [129]:
train = df1[~df1['Survived'].isnull()]
train.head()

Unnamed: 0,Title_Mr,Title_Mrs,Title_Miss,Pclass_3,Pclass_1,FamilySmall,Fare,FamilySingle,Embarked_C,Embarked_S,FamilyLarge,Survived
0,1,0,0,1,0,1,7.25,0,0,1,0,0.0
1,0,1,0,0,1,1,71.2833,0,1,0,0,1.0
2,0,0,1,1,0,0,7.925,1,0,1,0,1.0
3,0,1,0,0,1,1,53.1,0,0,1,0,1.0
4,1,0,0,1,0,0,8.05,1,0,1,0,0.0


In [131]:
test = df1[df1['Survived'].isnull()]
del test['Survived']
test.head()

Unnamed: 0,Title_Mr,Title_Mrs,Title_Miss,Pclass_3,Pclass_1,FamilySmall,Fare,FamilySingle,Embarked_C,Embarked_S,FamilyLarge
891,1,0,0,1,0,0,7.8292,1,0,0,0
892,0,1,0,1,0,1,7.0,0,0,1,0
893,1,0,0,0,0,0,9.6875,1,0,0,0
894,1,0,0,1,0,0,8.6625,1,0,1,0
895,0,1,0,1,0,1,12.2875,0,0,1,0


# 模型训练

划分训练集和测试集

In [132]:
X = train.iloc[:,:-1]
Y = train.iloc[:,-1]

In [136]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [137]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train,y_train)
model.score(x_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8603351955307262

In [138]:
model.predict(test)

array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

随机森林

In [139]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.8268156424581006