In [20]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn import preprocessing

from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB # 高斯朴素贝叶斯 GaussianNB/MultinomialNB/BernoulliNB
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.ensemble import  AdaBoostClassifier # AdaBoost
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  

from tpot import TPOTClassifier

In [21]:
# 数据加载
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [22]:
# 数据探索
# 查看train_data信息
#pd.set_option('display.max_columns', None) #显示所有列

print('查看数据信息：列名、非空个数、类型等')

print(train_data.info())

print('-'*30)

print('查看数据摘要')

print(train_data.describe())

print('-'*30)

print('查看离散数据分布')

print(train_data.describe(include=['O']))

print('-'*30)

print('查看前5条数据')

print(train_data.head())

print('-'*30)

print('查看后5条数据')

print(train_data.tail())


查看数据信息：列名、非空个数、类型等
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
查看数据摘要
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642

In [23]:
# 使用平均年龄来填充年龄中的nan值

train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)

test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)


# 使用票价的均值填充票价中的nan值

train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)

test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)


print(train_data['Embarked'].value_counts())

# 使用登录最多的港口来填充登录港口的nan值

train_data['Embarked'].fillna('S', inplace=True)

test_data['Embarked'].fillna('S',inplace=True)

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [24]:
# 特征选择

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

train_features = train_data[features]

train_labels = train_data['Survived']

test_features = test_data[features]

print('特征值')

print(train_features)

特征值
     Pclass     Sex        Age  SibSp  Parch     Fare Embarked
0         3    male  22.000000      1      0   7.2500        S
1         1  female  38.000000      1      0  71.2833        C
2         3  female  26.000000      0      0   7.9250        S
3         1  female  35.000000      1      0  53.1000        S
4         3    male  35.000000      0      0   8.0500        S
..      ...     ...        ...    ...    ...      ...      ...
886       2    male  27.000000      0      0  13.0000        S
887       1  female  19.000000      0      0  30.0000        S
888       3  female  29.699118      1      2  23.4500        S
889       1    male  26.000000      0      0  30.0000        C
890       3    male  32.000000      0      0   7.7500        Q

[891 rows x 7 columns]


In [25]:
dvec=DictVectorizer(sparse=False)

train_features=dvec.fit_transform(train_features.to_dict(orient='record'))
test_features=dvec.transform(test_features.to_dict(orient='record'))

print(dvec.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


In [26]:
# 特征归一化
scaler = preprocessing.StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.fit_transform(test_features)

In [28]:
# 构造svm

# clf = SVC(kernel='rbf', C=1.0, gamma='auto')
clf = SVC(kernel='linear', max_iter=1000)
# 训练

clf.fit(train_features, train_labels)


# 预测

pred_labels = clf.predict(test_features)



# 得到准确率(基于训练集)

acc_decision_tree = round(clf.score(train_features, train_labels), 6)

print(u'score准确率为 %.4lf' % acc_decision_tree)



# 使用K折交叉验证 统计准确率

print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

score准确率为 0.7890
cross_val_score准确率为 0.7867




In [14]:
# 构造lr

clf = LogisticRegression(solver='liblinear', multi_class='auto') #数据集比较小，使用liblinear，数据集大使用 sag或者saga
clf.fit(train_features, train_labels)

# 预测

pred_labels = clf.predict(test_features)



# 得到准确率(基于训练集)

acc_decision_tree = round(clf.score(train_features, train_labels), 6)

print(u'score准确率为 %.4lf' % acc_decision_tree)



# 使用K折交叉验证 统计准确率

print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

score准确率为 0.8002
cross_val_score准确率为 0.7957


In [15]:
# 构造贝叶斯分类器

clf = GaussianNB()
clf.fit(train_features, train_labels)

# 预测

pred_labels = clf.predict(test_features)



# 得到准确率(基于训练集)

acc_decision_tree = round(clf.score(train_features, train_labels), 6)

print(u'score准确率为 %.4lf' % acc_decision_tree)



# 使用K折交叉验证 统计准确率

print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

score准确率为 0.7912
cross_val_score准确率为 0.7845


In [16]:
# 构造KNN分类器

clf = KNeighborsClassifier()
clf.fit(train_features, train_labels)

# 预测

pred_labels = clf.predict(test_features)



# 得到准确率(基于训练集)

acc_decision_tree = round(clf.score(train_features, train_labels), 6)

print(u'score准确率为 %.4lf' % acc_decision_tree)



# 使用K折交叉验证 统计准确率

print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

score准确率为 0.8575
cross_val_score准确率为 0.8014


In [17]:
# 构造LDA

clf =model = LinearDiscriminantAnalysis(n_components=1)

clf.fit(train_features, train_labels)

# 预测

pred_labels = clf.predict(test_features)



# 得到准确率(基于训练集)

acc_decision_tree = round(clf.score(train_features, train_labels), 6)

print(u'score准确率为 %.4lf' % acc_decision_tree)



# 使用K折交叉验证 统计准确率

print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

score准确率为 0.7991
cross_val_score准确率为 0.7924


In [18]:
# 构造ID3决策树

clf = DecisionTreeClassifier(criterion='entropy')

# 训练

clf.fit(train_features, train_labels)


# 预测

pred_labels = clf.predict(test_features)



# 得到准确率(基于训练集)

acc_decision_tree = round(clf.score(train_features, train_labels), 6)

print(u'score准确率为 %.4lf' % acc_decision_tree)


# 使用K折交叉验证 统计准确率

print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

score准确率为 0.9820
cross_val_score准确率为 0.7767


In [19]:
# 构造TPOT

clf = TPOTClassifier(generations=5, population_size=20, verbosity=2)

# 训练

clf.fit(train_features, train_labels)



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.8260686711443098
Generation 2 - Current best internal CV score: 0.8282970309459543
Generation 3 - Current best internal CV score: 0.8282970309459543
Generation 4 - Current best internal CV score: 0.832803967108154
Generation 5 - Current best internal CV score: 0.838390559286925

Best pipeline: GradientBoostingClassifier(VarianceThreshold(CombineDFs(input_matrix, input_matrix), threshold=0.005), learning_rate=0.1, max_depth=9, max_features=0.25, min_samples_leaf=12, min_samples_split=5, n_estimators=100, subsample=0.25)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=20,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)