In [1]:
import pandas as pd

# Load the penguins dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="penguins.csv")

In [2]:
# Show the (rows, columns) dimensions of the freshly loaded frame.
df.shape

(344, 8)

In [28]:
# Preview the last rows of the frame (tail() shows five rows by default).
df.tail()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009
343,Chinstrap,Dream,50.2,18.7,198.0,3775.0,female,2009


In [4]:
# Drop every record (row) that contains a NaN value.
# The result is rebound to `df` rather than using inplace=True: the effect on
# later cells is the same, but the statement is a plain expression that is
# safe to re-run and avoids the in-place mutation pattern pandas discourages.

df = df.dropna()

In [5]:
# Re-check the dimensions now that rows with missing values are gone.
df.shape

(333, 8)

In [6]:
# Extract the class labels: the 'species' column is the prediction target.

labels = df.loc[:, 'species']

# Quick look at the first few labels.
labels.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [7]:
# Encode the string labels as integer codes.
# `targets` holds one integer code per row; `uniques` holds the distinct label
# values, so uniques[code] maps a code back to its label name.

targets, uniques = pd.factorize(values=labels)

In [8]:
# Peek at the first three encoded targets.
targets[:3]

array([0, 0, 0])

In [9]:
uniques  # the distinct label values

Index(['Adelie', 'Gentoo', 'Chinstrap'], dtype='object')

In [10]:
# Build the feature frame: every column except `species` (the target) and `year`.
# The excluded columns are dropped by name rather than sliced by position, so
# the selection does not silently change if the CSV's column order changes.

features = df.drop(columns=["species", "year"])

features.shape

(333, 6)

In [11]:
# Preview the first rows of the selected feature columns.
features.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,male
1,Torgersen,39.5,17.4,186.0,3800.0,female
2,Torgersen,40.3,18.0,195.0,3250.0,female
4,Torgersen,36.7,19.3,193.0,3450.0,female
5,Torgersen,39.3,20.6,190.0,3650.0,male


In [12]:
# One-hot encode the features: each categorical column is expanded into one
# indicator column per category, while numeric columns pass through unchanged.

features_vector = pd.get_dummies(data=features)

In [13]:
# Dimensions after one-hot encoding (the column count grows with the indicator columns).
features_vector.shape

(333, 9)

In [14]:
# Preview the first rows of the encoded feature matrix.
features_vector.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,40.3,18.0,195.0,3250.0,0,0,1,1,0
4,36.7,19.3,193.0,3450.0,0,0,1,1,0
5,39.3,20.6,190.0,3650.0,0,0,1,0,1


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Split the dataset: 80% training set and 20% test set.
# random_state pins the shuffle so the split, and therefore the reported
# accuracy, is reproducible on Restart & Run All. 666 is the same seed that
# is passed to the RandomForestClassifier in this notebook.

x_train, x_test, y_train, y_test = train_test_split(
    features_vector, targets, test_size=0.2, random_state=666
)

In [16]:
# Dimensions of the training feature matrix.
x_train.shape

(266, 9)

In [17]:
# Dimensions of the held-out test feature matrix.
x_test.shape

(67, 9)

In [18]:
# Model: random forest classifier, seeded so that training is deterministic.

model = RandomForestClassifier(random_state=666)

# Train the model on the training split.

model.fit(X=x_train, y=y_train)

In [19]:
# Predict class codes for the held-out test features.

y_pred = model.predict(X=x_test)

In [20]:
# Compute accuracy on the test set.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go
# first. Accuracy itself is symmetric, but keeping the documented order stops
# the call from silently giving wrong results if it is swapped for an
# asymmetric metric such as precision or recall.

score = accuracy_score(y_test, y_pred)

In [21]:
# Display the accuracy value.
score

0.9850746268656716

In [24]:
# Save the trained model to disk as a pickle file.

import pickle

with open("rfc_model.pickle", mode="wb") as model_file:
    pickle.dump(model, model_file)

In [23]:
# Save the label names (the `uniques` index from factorize) so that predicted
# integer codes can be mapped back to species names later.

with open("label_names.pickle", mode="wb") as labels_file:
    pickle.dump(uniques, labels_file)