# 清除示例 II
我们先练习癌症数据缺失值和重复值的处理。

In [1]:
import pandas as pd

# Load the cancer dataset from 'cancer_data_edited_i_saved.csv'
df = pd.read_csv('cancer_data_edited_i_saved.csv')

# Use info() to check which columns have missing values.
# info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 11 columns):
id                     569 non-null int64
diagnosis              569 non-null object
radius_mean            569 non-null float64
texture_mean           548 non-null float64
perimeter_mean         569 non-null float64
area_mean              569 non-null float64
smoothness_mean        521 non-null float64
compactness_mean       569 non-null float64
concavity_mean         569 non-null float64
concave_points_mean    569 non-null float64
symmetry_mean          504 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 49.0+ KB
None


In [2]:
# Fill missing values with the mean of each affected column.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: the chained in-place form operates on an
# intermediate Series, is deprecated in pandas 2.x, and does not update df
# when Copy-on-Write is enabled.
texture_mean = df["texture_mean"].mean()
df["texture_mean"] = df["texture_mean"].fillna(texture_mean)

smoothness_mean = df["smoothness_mean"].mean()
df["smoothness_mean"] = df["smoothness_mean"].fillna(smoothness_mean)

symmetry_mean = df["symmetry_mean"].mean()
df["symmetry_mean"] = df["symmetry_mean"].fillna(symmetry_mean)

# Confirm the change with info(). It prints its own report and returns None,
# so it is called directly rather than wrapped in print().
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 11 columns):
id                     569 non-null int64
diagnosis              569 non-null object
radius_mean            569 non-null float64
texture_mean           569 non-null float64
perimeter_mean         569 non-null float64
area_mean              569 non-null float64
smoothness_mean        569 non-null float64
compactness_mean       569 non-null float64
concavity_mean         569 non-null float64
concave_points_mean    569 non-null float64
symmetry_mean          569 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 49.0+ KB
None


In [3]:
# Count the duplicated rows: duplicated() flags each repeated row with True,
# and summing the boolean flags gives the number of duplicates.
print(df.duplicated().sum())


5


In [4]:
# Drop the duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()


In [5]:
# Count duplicated rows again to confirm that none remain
print(df.duplicated().sum())

0


## 重命名列
由于之前已将数据集修改为仅包含肿瘤特征的均值，每个特征名称末尾的 "_mean" 就显得多余了；而且在后续分析中，每次输入这些列名都要多花时间。下面我们为各列生成更简短的新标签。

In [6]:
# Remove the trailing "_mean" from the column names.
# endswith() is used rather than a substring test so that only a genuine
# suffix is stripped; the slice removes exactly len(suffix) (5) characters.
suffix = '_mean'
new_labels = [
    col[:-len(suffix)] if col.endswith(suffix) else col
    for col in df.columns
]

# New labels for the columns
new_labels

['id',
 'diagnosis',
 'radius',
 'texture',
 'perimeter',
 'area',
 'smoothness',
 'compactness',
 'concavity',
 'concave_points',
 'symmetry']

In [7]:
# Replace the DataFrame's column labels with the new ones
df.columns = new_labels

# Show the first five rows to confirm the renamed columns
df.head(n=5)

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry
0,842302,M,17.99,19.293431,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069
3,84348301,M,11.42,20.38,77.58,386.1,0.096087,0.2839,0.2414,0.1052,0.2597
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809


In [8]:
# Save the cleaned data for later use; the row index is not written out
df.to_csv(path_or_buf='cancer_data_edited.csv', index=False)