# 企鹅数据分析
1. 导入必要的库
2. 导入数据
3. 数据清洗
4. 数据特征的构造
5. 数据分析

In [3]:
import pandas as pd
import numpy as np

In [16]:
# Load the penguin dataset and print a summary of its columns, dtypes and non-null counts.
csv_path = '../data/penguins.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [17]:
# 3. Data cleaning
# Count the missing values in each column and print the result.
missing_counts = df.isna().sum()
print(missing_counts)
# Drop every row that contains at least one missing value (modifies df in place).
df.dropna(inplace=True)
df

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [18]:
# Re-count missing values per column to confirm none remain.
remaining_missing = df.isna().sum()
remaining_missing

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### 4. 数据特征的构造
- 将性别列（sex）的数据类型转换为 category
- 构造一个新的数据特征 bill_ratio：喙的长度与深度之比（bill_length_mm / bill_depth_mm）

In [20]:
# Store the sex column with the pandas 'category' dtype.
sex_as_category = df['sex'].astype('category')
df['sex'] = sex_as_category
# New feature: bill_length_mm divided by bill_depth_mm.
bill_length = df['bill_length_mm']
bill_depth = df['bill_depth_mm']
df['bill_ratio'] = bill_length / bill_depth
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,bill_ratio
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,2.090909
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,2.270115
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,2.238889
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1.901554
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,1.907767
...,...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female,3.445255
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,3.272727
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,3.210191
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,3.054054


### 5. 数据分析
- 数据分箱：按体重（body_mass_g）的取值范围等宽分为低、中、高三个等级
- 按照性别（sex）和岛屿（island）对数据进行分组，统计体重的均值、数量、最小值和最大值

In [26]:
# Bin body mass into three equal-width intervals and label them low / medium / high.
labels = ['低', '中', '高']
df['mass_level'] = pd.cut(df['body_mass_g'], bins=3, labels=labels)
# Show how many penguins fall into each mass level.
print(df['mass_level'].value_counts())

# Summarise body mass for every (sex, island) group.
# 'sex' is a categorical column, and grouping on a categorical without an
# explicit `observed` argument raises a FutureWarning in recent pandas because
# the default is changing. observed=False keeps the current behaviour.
df.groupby(['sex', 'island'], observed=False).agg({
    'body_mass_g': ['mean', 'count', 'min', 'max']
})


mass_level
低    150
中    128
高     55
Name: count, dtype: int64


  df.groupby(['sex', 'island']).agg({


Unnamed: 0_level_0,Unnamed: 1_level_0,body_mass_g,body_mass_g,body_mass_g,body_mass_g
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,min,max
sex,island,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,Biscoe,4319.375,80,2850.0,5200.0
Female,Dream,3446.311475,61,2700.0,4150.0
Female,Torgersen,3395.833333,24,2900.0,3800.0
Male,Biscoe,5104.518072,83,3550.0,6300.0
Male,Dream,3987.096774,62,3250.0,4800.0
Male,Torgersen,4034.782609,23,3325.0,4700.0
