In [23]:
import numpy as np
import pandas as pd

# 1 读取文件

- `pd.read_csv()` 读取 csv 文件
- `pd.read_excel()` 读取 excel 文件
- `pd.read_sql()` 读取 SQL 查询结果或数据库表
- `pd.read_json()` 读取 json 文件


In [24]:
# Load the Iris dataset from a local CSV file and preview its first two rows.
# NOTE(review): this is an absolute, machine-specific path — consider reading
# it from a configurable data directory so the notebook runs elsewhere.
iris_csv_path = "/Users/zhaohaonan/北大资料/Coding/MachineLearning/Datasets/Iris.csv"
iris = pd.read_csv(iris_csv_path)
print(iris.head(2))

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa


# 2 缺失值处理

In [36]:
# Three Series whose indexes only partially overlap. Combining them into one
# DataFrame aligns on the union of the labels (A-D), which leaves NaN wherever
# a Series has no entry for a label.
age = pd.Series({"B": 20, "C": 30}, dtype=np.int64)
salary = pd.Series({"A": 1000, "B": 2000, "C": 3000})
gender = pd.Series({"A": "Male", "B": "Female", "D": "Male"})

# Column name -> Series mapping used to assemble the frame.
data = dict(Age=age, Gender=gender, Salary=salary)
df = pd.DataFrame(data)
print(df)

    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN


## 2.1 查看缺失值
- `df.isnull()` 查看缺失值
- `df.isnull().any(axis=0/1)` 查看每列/行是否有缺失值
- `df.isnull().sum()` 查看每列缺失值数量
- `df.isnull().sum().sum()` 查看所有缺失值数量
- `df.info()` 查看每列非缺失值数量（Non-Null Count）和数据类型

In [41]:
# Build the boolean missing-value mask once and reuse it for every summary.
missing_mask = df.isnull()

# Number of missing values in each column.
print(missing_mask.sum())
print("===========")
# Whether each column contains at least one missing value.
print(missing_mask.any())
print("===========")
# Whether each row contains at least one missing value.
print(missing_mask.any(axis=1))
print("===========")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

Age       2
Gender    1
Salary    1
dtype: int64
Age       True
Gender    True
Salary    True
dtype: bool
A     True
B    False
C     True
D     True
dtype: bool
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     2 non-null      float64
 1   Gender  3 non-null      object 
 2   Salary  3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 128.0+ bytes
None


## 2.2 删除缺失值
`df.dropna(axis=0/1, how='any/all', thresh=None, subset=None, inplace=False)`
- axis: 0-行操作（删除行），1-列操作（删除列）
- how: any-只要有缺失值出现就删除，all-全部缺失值才删除
- thresh: int-保留该行/列所需的最少非 NaN 数据个数（不能与 how 同时指定）
- subset: list-只在指定的标签中检查缺失值（axis=0 时为列名，axis=1 时为行索引）
- inplace: bool-是否直接修改原数据（默认 False，返回新的 DataFrame）


In [57]:
def _report(title, frame):
    """Print a caption, the resulting frame, and a divider line."""
    print(title)
    print(frame)
    print("======================")


# Show the untouched frame first as the baseline for comparison.
print(df)
print("======================")

# Drop every row that contains at least one missing value.
df1 = df.dropna(axis=0, how="any")
_report("删除至少一个缺失值的行", df1)

# Drop every column that contains at least one missing value.
df2 = df.dropna(axis=1, how="any")
_report("删除至少一个缺失值的列", df2)

# Drop only the rows in which every value is missing.
df3 = df.dropna(axis=0, how="all")
_report("删除所有值都缺失的行", df3)

# Drop only the columns in which every value is missing.
df4 = df.dropna(axis=1, how="all")
_report("删除所有值都缺失的列", df4)

# Drop rows that have a missing value in any of the listed columns.
df5 = df.dropna(axis=0, how="any", subset=["Age", "Salary"])
_report("删除指定列中缺失值的行", df5)

# Keep only the rows that have at least `thresh` non-missing values.
df7 = df.dropna(axis=0, thresh=2)
_report("指定最少非缺失值个数", df7)

    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN
删除至少一个缺失值的行
    Age  Gender  Salary
B  20.0  Female  2000.0
删除至少一个缺失值的列
Empty DataFrame
Columns: []
Index: [A, B, C, D]
删除所有值都缺失的行
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN
删除所有值都缺失的列
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN
删除指定列中缺失值的行
    Age  Gender  Salary
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
指定最少非缺失值个数
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
