In [23]:
import numpy as np
import pandas as pd

# 1 读取文件

- `pd.read_csv("path/to/your/csv/file", header = 0, names = [], index_col = None, encoding = "utf-8")` 
    - 读取 csv 文件，默认第一行为列名，如果没有列名，可以通过 `names` 参数指定列名
    - 如果同时指定了 `header` 和 `names` ，则表中内容为 `header` 之后的所有行，但是表头的列名不再是 `header` ，而是由 `names` 覆盖
    - `index_col` 指定某一列为索引列
- `pd.read_excel()` 读取 excel 文件
- `pd.read_sql()` 执行 SQL 查询或读取数据库中的表（需要提供数据库连接），而不是读取 sql 文件
- `pd.read_json()` 读取 json 文件


In [72]:
# One shared location for the dataset so every read below targets the same file.
iris_csv_path = "/Users/zhaohaonan/北大资料/Coding/MachineLearning/Datasets/Iris.csv"
section_rule = "========================================="

# Default arguments: the first line of the file supplies the column names.
iris = pd.read_csv(iris_csv_path)
print(iris.head(2))
print(section_rule)

# header=1: the second line of the file is treated as the header row.
iris_1 = pd.read_csv(iris_csv_path, header=1)
print(iris_1.head(2))
print(section_rule)

# names only (no header argument): the supplied names become the column labels.
iris_2 = pd.read_csv(iris_csv_path, names=["a", "b", "c", "d", "e"])
print(iris_2.head(2))
print(section_rule)

# index_col: use the "Species" column as the row index.
iris_3 = pd.read_csv(iris_csv_path, index_col="Species")
print(iris_3.head(2))
print(section_rule)

# header and names together: the supplied names replace the labels from the header row.
iris_4 = pd.read_csv(iris_csv_path, header=1, names=["a", "b", "c", "d", "e"])
print(iris_4.head(2))


   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
   1  5.1  3.5  1.4  0.2  Iris-setosa
0  2  4.9  3.0  1.4  0.2  Iris-setosa
1  3  4.7  3.2  1.3  0.2  Iris-setosa
                a             b              c             d            e
Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
1             5.1           3.5            1.4           0.2  Iris-setosa
             Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
Species                                                                  
Iris-setosa   1            5.1           3.5            1.4           0.2
Iris-setosa   2            4.9           3.0            1.4           0.2
     a    b    c    d            e
2  4.9  3.0  1.4  0.2  Iris-setosa
3  4.7  3.2  1.3  0.2  Iris-setosa


## 读取时间序列相关文件

In [89]:
# Load the table, keeping the raw "Birthday" strings as the row index.
df = pd.read_csv("/Users/zhaohaonan/北大资料/Coding/MachineLearning/Datasets/birthday.csv",index_col="Birthday")
print(df)
print("=========================================")

# Convert the string index into a pandas DatetimeIndex.
# The format is given explicitly: values such as "07/09/2011" are ambiguous
# (month-first vs. day-first), and without a format pandas has to guess.
# "%m/%d/%Y" matches the month-first interpretation this notebook relies on,
# and makes inconsistent rows fail loudly instead of being parsed differently.
print("转换日期格式")
df.index = pd.to_datetime(df.index, format="%m/%d/%Y")
print(df)
print("=========================================")

# With a DatetimeIndex the rows can be sorted chronologically.
# Reassign instead of inplace=True so the step reads as an explicit transformation.
print("日期升序排序")
df = df.sort_index(axis=0)
print(df)
print("=========================================")

# Partial-string indexing: select every row that falls in September 2001.
print("日期索引")
print(df.loc['2001-09'])

           Name  Class
Birthday              
07/09/2011    A      2
09/12/2001    B      3
09/11/1998    C      3
09/06/2001    D      4
04/08/1999    E      1
转换日期格式
           Name  Class
Birthday              
2011-07-09    A      2
2001-09-12    B      3
1998-09-11    C      3
2001-09-06    D      4
1999-04-08    E      1
日期升序排序
           Name  Class
Birthday              
1998-09-11    C      3
1999-04-08    E      1
2001-09-06    D      4
2001-09-12    B      3
2011-07-09    A      2
日期索引
           Name  Class
Birthday              
2001-09-06    D      4
2001-09-12    B      3


# 2 缺失值处理

In [36]:
# Three Series whose index labels only partly overlap; combining them into one
# DataFrame aligns on the union of the labels and leaves NaN where a Series
# has no entry for a label.
age = pd.Series({"B": 20, "C": 30}, dtype=np.int64)
salary = pd.Series({"A": 1000, "B": 2000, "C": 3000})
gender = pd.Series({"A": "Male", "B": "Female", "D": "Male"})

# Column name -> Series mapping used to assemble the demo frame.
data = {
    "Age": age,
    "Gender": gender,
    "Salary": salary,
}
df = pd.DataFrame(data)
print(df)

    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN


## 2.1 查看缺失值
- `df.isnull()` 查看缺失值
- `df.isnull().any(axis=0/1)` 查看每列/行是否有缺失值
- `df.isnull().sum()` 查看每列缺失值数量
- `df.isnull().sum().sum()` 查看所有缺失值数量
- `df.info()` 查看每列的非空值数量（Non-Null Count）和数据类型，与总行数对比即可得出缺失值数量

In [41]:
# Number of missing values in each column.
print(df.isnull().sum())
print("===========")
# For each column: does it contain at least one missing value?
print(df.isnull().any())
print("===========")
# For each row: does it contain at least one missing value?
print(df.isnull().any(axis=1))
print("===========")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the report.
df.info()

Age       2
Gender    1
Salary    1
dtype: int64
Age       True
Gender    True
Salary    True
dtype: bool
A     True
B    False
C     True
D     True
dtype: bool
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     2 non-null      float64
 1   Gender  3 non-null      object 
 2   Salary  3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 128.0+ bytes
None


## 2.2 删除缺失值
`df.dropna(axis=0/1, how='any/all', thresh=None, subset=None, inplace=False)`
- axis: 0-行操作（删除行），1-列操作（删除列）
- how: any-只要有缺失值出现就删除，all-全部缺失值才删除
- thresh: int-至少需要非 NaN 的数据个数
- subset: list-指定列
- inplace: bool-是否替换原数据


In [57]:
dropna_rule = "======================"

print(df)
print(dropna_rule)

# Drop every row that contains at least one missing value.
print("删除至少一个缺失值的行")
df1 = df.dropna(axis="index", how="any")
print(df1)
print(dropna_rule)

# Drop every column that contains at least one missing value.
print("删除至少一个缺失值的列")
df2 = df.dropna(axis="columns", how="any")
print(df2)
print(dropna_rule)

# Drop only the rows in which every value is missing.
print("删除所有值都缺失的行")
df3 = df.dropna(axis="index", how="all")
print(df3)
print(dropna_rule)

# Drop only the columns in which every value is missing.
print("删除所有值都缺失的列")
df4 = df.dropna(axis="columns", how="all")
print(df4)
print(dropna_rule)

# Drop rows that have a missing value in any of the listed columns.
print("删除指定列中缺失值的行")
df5 = df.dropna(subset=["Age", "Salary"], how="any", axis="index")
print(df5)
print(dropna_rule)

# Keep only the rows that have at least `thresh` non-missing values.
print("指定最少非缺失值个数")
df7 = df.dropna(thresh=2, axis="index")
print(df7)
print(dropna_rule)

    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN
删除至少一个缺失值的行
    Age  Gender  Salary
B  20.0  Female  2000.0
删除至少一个缺失值的列
Empty DataFrame
Columns: []
Index: [A, B, C, D]
删除所有值都缺失的行
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN
删除所有值都缺失的列
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN
删除指定列中缺失值的行
    Age  Gender  Salary
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
指定最少非缺失值个数
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0


## 2.3 填充缺失值
`df.fillna(value=None, axis=None, inplace=False, limit=None, downcast=None)`
- value: 填充值
- axis: 填充方向，0-沿行索引方向（在每一列内纵向填充），1-沿列方向（在每一行内横向填充）
- inplace: bool-是否替换原数据
- limit: int-填充个数限制
- downcast: dict-指定各列在填充后尝试向下转换的数据类型（并非填充方式；较新版本的 pandas 已弃用该参数）


向前向后填充
- `df.ffill(axis=0/1)`
- `df.bfill(axis=0/1)`

In [67]:
fill_rule = "======================"

print(df)
print(fill_rule)

# Forward fill down each column: a gap takes the value from the row above it.
print("向前填充")
df8 = df.ffill(axis="index")
print(df8)
print(fill_rule)

# Backward fill across each row: a gap takes the value from the column to its right.
print("向后填充")
df9 = df.bfill(axis="columns")
print(df9)
print(fill_rule)

# Fill the gaps in "Age" with the column mean.
print("Age 列的均值填充")
age_mean = df["Age"].mean()
df10 = df.fillna(value={"Age": age_mean})
print(df10)
print(fill_rule)

# Fill the gaps in "Salary" with the column median.
print("Salary 列的中位数填充")
salary_median = df["Salary"].median()
df11 = df.fillna(value={"Salary": salary_median})
print(df11)
print(fill_rule)

# Fill the gaps in "Gender" with the most frequent value (first entry of mode()).
print("Gender 列的众数填充")
gender_mode = df["Gender"].mode()[0]
df12 = df.fillna(value={"Gender": gender_mode})
print(df12)
print(fill_rule)

    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male     NaN
向前填充
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0  Female  3000.0
D  30.0    Male  3000.0
向后填充
    Age  Gender  Salary
A  Male    Male  1000.0
B  20.0  Female  2000.0
C  30.0  3000.0  3000.0
D  Male    Male     NaN
Age 列的均值填充
    Age  Gender  Salary
A  25.0    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D  25.0    Male     NaN
Salary 列的中位数填充
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0     NaN  3000.0
D   NaN    Male  2000.0
Gender 列的众数填充
    Age  Gender  Salary
A   NaN    Male  1000.0
B  20.0  Female  2000.0
C  30.0    Male  3000.0
D   NaN    Male     NaN
