# Step1：数据准备

In [44]:
# 导入依赖库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# Load the raw penguin table and keep only the four body measurements
# together with the species label.
data = pd.read_csv('penguins_raw.csv')[[
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Flipper Length (mm)',
    'Body Mass (g)',
    'Species',
]]
data

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Species
0,39.1,18.7,181.0,3750.0,Adelie Penguin (Pygoscelis adeliae)
1,39.5,17.4,186.0,3800.0,Adelie Penguin (Pygoscelis adeliae)
2,40.3,18.0,195.0,3250.0,Adelie Penguin (Pygoscelis adeliae)
3,,,,,Adelie Penguin (Pygoscelis adeliae)
4,36.7,19.3,193.0,3450.0,Adelie Penguin (Pygoscelis adeliae)
...,...,...,...,...,...
339,55.8,19.8,207.0,4000.0,Chinstrap penguin (Pygoscelis antarctica)
340,43.5,18.1,202.0,3400.0,Chinstrap penguin (Pygoscelis antarctica)
341,49.6,18.2,193.0,3775.0,Chinstrap penguin (Pygoscelis antarctica)
342,50.8,19.0,210.0,4100.0,Chinstrap penguin (Pygoscelis antarctica)


# Step2：数据探索

In [10]:
# Overview of the dataset: per-column dtype and non-null counts,
# followed by the first five rows.
data.info()
data.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 5 columns):
Culmen Length (mm)     342 non-null float64
Culmen Depth (mm)      342 non-null float64
Flipper Length (mm)    342 non-null float64
Body Mass (g)          342 non-null float64
Species                344 non-null object
dtypes: float64(4), object(1)
memory usage: 13.6+ KB


Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Species
0,39.1,18.7,181.0,3750.0,Adelie Penguin (Pygoscelis adeliae)
1,39.5,17.4,186.0,3800.0,Adelie Penguin (Pygoscelis adeliae)
2,40.3,18.0,195.0,3250.0,Adelie Penguin (Pygoscelis adeliae)
3,,,,,Adelie Penguin (Pygoscelis adeliae)
4,36.7,19.3,193.0,3450.0,Adelie Penguin (Pygoscelis adeliae)


In [12]:
# Share of missing values per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
miss_stat = data.isna().mean()
miss_stat

Culmen Length (mm)     0.005814
Culmen Depth (mm)      0.005814
Flipper Length (mm)    0.005814
Body Mass (g)          0.005814
Species                0.000000
dtype: float64

In [13]:
# Class balance: number of samples per species.
data['Species'].value_counts()

Adelie Penguin (Pygoscelis adeliae)          152
Gentoo penguin (Pygoscelis papua)            124
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

# Step3：数据预处理

In [15]:
# Replace every missing value with the sentinel -1 so that downstream
# steps receive a table without NaNs.
data = data.fillna(value=-1)

In [20]:
# Categorical feature encoding.
# Split the columns by dtype: float64 columns are treated as numerical,
# everything else as categorical and therefore integer-encoded below.
# `np.float` (a plain alias of the builtin float) was removed from NumPy,
# so compare against the concrete float64 dtype instead.
numerical_features = [x for x in data.columns if data[x].dtype == np.float64]
category_features = [x for x in data.columns if data[x].dtype != np.float64]
def category_encode(x):
    """Build an element-wise encoder for a categorical Series.

    Each distinct value of ``x`` is assigned an integer code in order of
    first appearance (0, 1, 2, ...).

    Parameters
    ----------
    x : pandas.Series
        Column whose distinct values define the code table.

    Returns
    -------
    callable
        Function mapping a single value to its integer code, or to -1
        when the value did not occur in ``x``.
    """
    distinct_values = x.unique().tolist()
    code_of = {value: code for code, value in enumerate(distinct_values)}

    def map_function(y):
        # Values absent from the code table fall back to the sentinel -1.
        return code_of[y] if y in code_of else -1

    return map_function

# Replace every categorical column with its integer codes.
for x in category_features:
    # Build the code table from the column itself, then map each value.
    encode_value = category_encode(data[x])
    data[x] = data[x].apply(encode_value)

data

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Species
0,39.1,18.7,181.0,3750.0,0
1,39.5,17.4,186.0,3800.0,0
2,40.3,18.0,195.0,3250.0,0
3,-1.0,-1.0,-1.0,-1.0,0
4,36.7,19.3,193.0,3450.0,0
...,...,...,...,...,...
339,55.8,19.8,207.0,4000.0,2
340,43.5,18.1,202.0,3400.0,2
341,49.6,18.2,193.0,3775.0,2
342,50.8,19.0,210.0,4100.0,2


# Step4：数据建模

In [40]:
# Train/test split: the four body measurements are the features and the
# encoded species is the target; 20% of the rows are held out for testing.
data_feature = data[['Culmen Length (mm)','Culmen Depth (mm)','Flipper Length (mm)','Body Mass (g)']]
data_target = data['Species']
x_train, x_test, y_train, y_test = train_test_split(
    data_feature, data_target, test_size=0.2, random_state=2020)

# Model training.
# The classifier is seeded as well: scikit-learn shuffles candidate
# features at every split, so without a fixed random_state the fitted
# tree (and the evaluation metrics) can differ from run to run.
clf = DecisionTreeClassifier(criterion='gini', random_state=2020)
clf = clf.fit(x_train, y_train)

# Step5：模型评估

In [41]:
# Predict on the held-out split and print per-class precision/recall/F1.
y_predict = clf.predict(x_test)
report = classification_report(y_test, y_predict)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        32
           1       1.00      0.96      0.98        24
           2       0.87      1.00      0.93        13

    accuracy                           0.96        69
   macro avg       0.94      0.97      0.95        69
weighted avg       0.96      0.96      0.96        69



# Step6：模型优化

In [None]:
# 模型优化-参数调整
