# tips:除显式赋值（如 df.iloc[...] = x、df['col'] = x）会原地修改外，本文中的其余操作（排序、筛选、dropna、fillna、concat、merge 等）都返回新对象，不会改变原dataframe本身的结构和值

In [1]:
import pandas as pd
import numpy as np

# 处理CSV文件

In [7]:
# Load the training metadata table from train.csv (path relative to the working directory).
train_Data = pd.read_csv("train.csv")
train_Data.head(10)          # preview the first 10 rows

Unnamed: 0,image_id,label,variety,age
0,100330.jpg,bacterial_leaf_blight,ADT45,45
1,100365.jpg,bacterial_leaf_blight,ADT45,45
2,100382.jpg,bacterial_leaf_blight,ADT45,45
3,100632.jpg,bacterial_leaf_blight,ADT45,45
4,101918.jpg,bacterial_leaf_blight,ADT45,45
5,102353.jpg,bacterial_leaf_blight,ADT45,45
6,102848.jpg,bacterial_leaf_blight,ADT45,45
7,103051.jpg,bacterial_leaf_blight,ADT45,45
8,103702.jpg,bacterial_leaf_blight,ADT45,45
9,103920.jpg,bacterial_leaf_blight,ADT45,45


In [9]:
train_Data.info()     # 10407 training rows; every column is fully non-null (no missing values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10407 entries, 0 to 10406
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   image_id  10407 non-null  object
 1   label     10407 non-null  object
 2   variety   10407 non-null  object
 3   age       10407 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 325.3+ KB


In [55]:
import random   # training data is normally shuffled; of the many ways to do it, Python's built-in random module is used here

TRAIN_FRACTION = 0.7     # train : validation = 7 : 3

random.seed(100)         # fixed seed (pseudo-random) so the shuffled order is identical on every run
# Shuffle integer *positions*, not index labels: .iloc below is purely
# positional, so labels would only select the right rows by coincidence when
# the frame has the default 0..n-1 RangeIndex.
index = list(range(len(train_Data)))
random.shuffle(index)
print(index[:10])
length = len(index)
train_len = int(length * TRAIN_FRACTION)

train_df = train_Data.iloc[index[:train_len]]
valid_df = train_Data.iloc[index[train_len:]]
# Sanity check: the two parts together account for every row.
assert len(train_df) + len(valid_df) == length
print(len(train_df))
print(len(valid_df))     # the printed sizes confirm the split

[5708, 1336, 412, 5830, 7723, 6266, 8293, 1041, 7375, 10077]
7284
3123


# pandas dataframe基础

In [2]:
# Build a small demo DataFrame in which several columns are given an explicit
# dtype; scalar entries ('A', 'B', 'F') are broadcast to the length of the
# array-like columns.
df = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20220105'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))

In [3]:
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2022-01-05,1.0,3,test,foo
1,1.0,2022-01-05,1.0,3,train,foo
2,1.0,2022-01-05,1.0,3,test,foo
3,1.0,2022-01-05,1.0,3,train,foo


In [4]:
# Select a subset of columns by concatenating two lists of column names;
# .copy() makes df1 an independent copy of the selection.
list1 = ['A']
list2 = ['B']
df1 = df[list1+list2].copy()
df1

Unnamed: 0,A,B
0,1.0,2022-01-05
1,1.0,2022-01-05
2,1.0,2022-01-05
3,1.0,2022-01-05


In [5]:
# A DataFrame exposes many attributes that can be inspected directly
df.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [6]:
df.index   # the row labels (index) of the frame, not the columns

Int64Index([0, 1, 2, 3], dtype='int64')

In [7]:
df.columns   # the column labels

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [8]:
df.values   # all cell values as a 2-D NumPy array

array([[1.0, Timestamp('2022-01-05 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2022-01-05 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2022-01-05 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2022-01-05 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [9]:
df.describe()   # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [10]:
df.T     # transpose, like a matrix transpose: rows and columns are swapped

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2022-01-05 00:00:00,2022-01-05 00:00:00,2022-01-05 00:00:00,2022-01-05 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [11]:
df.sort_index(axis=0, ascending=False)   # sort by index; axis=0 orders the rows by row label, ascending=False is descending, True is ascending
# axis=1 orders the columns by column name instead

Unnamed: 0,A,B,C,D,E,F
3,1.0,2022-01-05,1.0,3,train,foo
2,1.0,2022-01-05,1.0,3,test,foo
1,1.0,2022-01-05,1.0,3,train,foo
0,1.0,2022-01-05,1.0,3,test,foo


In [12]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2022-01-05,1.0
1,foo,train,3,1.0,2022-01-05,1.0
2,foo,test,3,1.0,2022-01-05,1.0
3,foo,train,3,1.0,2022-01-05,1.0


In [13]:
# Sorting by value works in a similar way
df.sort_values(by='E')   # sort rows by the values in column 'E'; sort_values accepts many more options

Unnamed: 0,A,B,C,D,E,F
0,1.0,2022-01-05,1.0,3,test,foo
2,1.0,2022-01-05,1.0,3,test,foo
1,1.0,2022-01-05,1.0,3,train,foo
3,1.0,2022-01-05,1.0,3,train,foo


# 数据选取

In [14]:
# Six consecutive daily timestamps starting at 2022-01-05 serve as the row index.
dates = pd.date_range('20220105', periods=6)
# The values 0..23 are laid out row by row in a 6x4 grid with columns A-D.
df2 = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)

In [15]:
print(df2['A'], df2.A)   # two equivalent ways to access a column; both print the same Series

2022-01-05     0
2022-01-06     4
2022-01-07     8
2022-01-08    12
2022-01-09    16
2022-01-10    20
Freq: D, Name: A, dtype: int64 2022-01-05     0
2022-01-06     4
2022-01-07     8
2022-01-08    12
2022-01-09    16
2022-01-10    20
Freq: D, Name: A, dtype: int64


In [16]:
print(df2[0:3], df2['20220105':'20220107'])   # two ways to slice rows: by position, or by index label (the label slice includes its end point)

            A  B   C   D
2022-01-05  0  1   2   3
2022-01-06  4  5   6   7
2022-01-07  8  9  10  11             A  B   C   D
2022-01-05  0  1   2   3
2022-01-06  4  5   6   7
2022-01-07  8  9  10  11


In [17]:
# Select by label
print(df2.loc['20220105'])  # all column values of the row with this label

A    0
B    1
C    2
D    3
Name: 2022-01-05 00:00:00, dtype: int64


In [18]:
print(df2.loc[:,['A','B']])   # keep every row and print only the listed columns; a single row label can be given instead of ':'

             A   B
2022-01-05   0   1
2022-01-06   4   5
2022-01-07   8   9
2022-01-08  12  13
2022-01-09  16  17
2022-01-10  20  21


In [19]:
print(df2.loc['20220105',['A','B']])

A    0
B    1
Name: 2022-01-05 00:00:00, dtype: int64


In [50]:
# Select by integer position with iloc (loc above selects by label)
print(df2.iloc[2].values)   # third row, values only (the index labels are dropped)

[ 8  9 10 11]


In [21]:
print(df2.iloc[2,0:2])    # positional slices work as well

A    8
B    9
Name: 2022-01-07 00:00:00, dtype: int64


In [22]:
print(df2.iloc[[0,2,4],0:2])   # several individual rows can be picked out by position

             A   B
2022-01-05   0   1
2022-01-07   8   9
2022-01-09  16  17


In [23]:
# Label- and position-based selection could formerly be mixed in one call via .ix
#print(df2.ix[:3, ['A','C']])   # .ix has been removed from current pandas; an equivalent today is df2.iloc[:3][['A','C']]

In [24]:
print(df2[df2.A>8])   # boolean filtering: the rows where column A > 8, returned as a new DataFrame

             A   B   C   D
2022-01-08  12  13  14  15
2022-01-09  16  17  18  19
2022-01-10  20  21  22  23


# 如何设置自己指定的值

In [25]:
df2.iloc[2,2] = 111                 # set a single cell by position (row 2, column 2)
df2.loc['20220106','B'] = 3         # set a single cell by label
# Conditional assignment must be a single .loc call. The chained forms
# df2.loc[:, 'A'][df2.A > 4] = 0 and df2.A[df2.A > 4] = 0 write through an
# intermediate object: they trigger SettingWithCopyWarning and leave df2
# unchanged when pandas Copy-on-Write is active.
df2.loc[df2.A > 4, 'A'] = 0         # zero out column A wherever A > 4
df2['F'] = np.nan   # add a column that is entirely NaN
# A Series added as a column is aligned on the index; labels that do not match
# end up as NaN instead of raising, so the index should match the frame's.
df2['E'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20220105', periods=6))
print(df2)

            A   B    C   D   F  E
2022-01-05  0   1    2   3 NaN  1
2022-01-06  4   3    6   7 NaN  2
2022-01-07  0   9  111  11 NaN  3
2022-01-08  0  13   14  15 NaN  4
2022-01-09  0  17   18  19 NaN  5
2022-01-10  0  21   22  23 NaN  6


# 处理丢失数据（缺失值）

In [26]:
# Column 'C' (position 2) holds integers, which cannot represent NaN. Convert
# it to float explicitly first: relying on the implicit int -> float upcast
# during assignment is deprecated in recent pandas releases.
df2['C'] = df2['C'].astype('float64')
df2.iloc[0,2] = np.nan   # inject a missing value at row 0, column 'C'

In [27]:
print(df2)

            A   B      C   D   F  E
2022-01-05  0   1    NaN   3 NaN  1
2022-01-06  4   3    6.0   7 NaN  2
2022-01-07  0   9  111.0  11 NaN  3
2022-01-08  0  13   14.0  15 NaN  4
2022-01-09  0  17   18.0  19 NaN  5
2022-01-10  0  21   22.0  23 NaN  6


In [28]:
# Option 1: drop the rows or columns that contain missing data
print(df2.dropna(axis=1, how='any'))  # axis=1 drops every column that contains at least one NaN; how='all' would drop only columns that are entirely NaN

            A   B   D  E
2022-01-05  0   1   3  1
2022-01-06  4   3   7  2
2022-01-07  0   9  11  3
2022-01-08  0  13  15  4
2022-01-09  0  17  19  5
2022-01-10  0  21  23  6


In [29]:
# Option 2: fill the missing values with a replacement
print(df2.fillna(value=0))

            A   B      C   D    F  E
2022-01-05  0   1    0.0   3  0.0  1
2022-01-06  4   3    6.0   7  0.0  2
2022-01-07  0   9  111.0  11  0.0  3
2022-01-08  0  13   14.0  15  0.0  4
2022-01-09  0  17   18.0  19  0.0  5
2022-01-10  0  21   22.0  23  0.0  6


In [30]:
# Check whether any values are missing
print(df2.isnull().astype('int32'))   # 1 marks a missing cell, 0 a present one
# Reduce the boolean mask over all cells; the result is already a boolean, so
# no comparison against True is needed.
print(df2.isnull().values.any())  # True as soon as at least one value is missing

            A  B  C  D  F  E
2022-01-05  0  0  1  0  1  0
2022-01-06  0  0  0  0  1  0
2022-01-07  0  0  0  0  1  0
2022-01-08  0  0  0  0  1  0
2022-01-09  0  0  0  0  1  0
2022-01-10  0  0  0  0  1  0
True


# pandas合并concat

In [31]:
# Three 3x4 float frames with identical columns, filled with 0, 1 and 2 respectively.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))

# Stack the frames top to bottom; ignore_index=True renumbers the rows 0..8.
res = pd.concat(
    [df1, df2, df3],
    axis=0,
    ignore_index=True,
)
print(res)

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0


In [32]:
# df1 and df2 are redefined so that they only share columns b, c, d and have
# different row labels; df3 is reused from the previous cell.
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
res1 = pd.concat([df1,df2,df3], join='inner', ignore_index=True)   # 'inner' keeps only the columns common to all frames
res2 = pd.concat([df1,df2,df3], join='outer', ignore_index=True)   # 'outer' keeps the union of the columns and fills the gaps with NaN
print(res1)
print(res2)

     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0
6  2.0  2.0  2.0
7  2.0  2.0  2.0
8  2.0  2.0  2.0
     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
5  NaN  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0  NaN
7  2.0  2.0  2.0  2.0  NaN
8  2.0  2.0  2.0  2.0  NaN


In [33]:
# res3 = pd.concat([df1,df2], axis=1, join_axes=[df1.index])  # join_axes (which chose whose index to align on) has been removed from pd.concat
# Equivalent today: res3 = pd.concat([df1, df2], axis=1).reindex(df1.index)
# print(res3)

In [34]:
# Append rows with pd.concat (DataFrame.append was deprecated and then removed in pandas 2.0)
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
# A Series becomes a single row by turning it into a one-column frame and transposing it.
res3 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
res4 = pd.concat([df1, df2], ignore_index=True)  # any number of DataFrames can be listed here
print(res3)
print(res4)

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0
     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
5  NaN  1.0  1.0  1.0  1.0


# pandas合并merge

In [35]:
# Two small frames sharing the key columns key1/key2, written row by row so the
# source mirrors the printed tables.
left = pd.DataFrame(
    [
        ('K0', 'K0', 'A0', 'B0'),
        ('K0', 'K1', 'A1', 'B1'),
        ('K1', 'K0', 'A2', 'B2'),
        ('K2', 'K1', 'A3', 'B3'),
    ],
    columns=['key1', 'key2', 'A', 'B'],
)

right = pd.DataFrame(
    [
        ('K0', 'K0', 'C0', 'D0'),
        ('K1', 'K0', 'C1', 'D1'),
        ('K1', 'K0', 'C2', 'D2'),
        ('K2', 'K0', 'C3', 'D3'),
    ],
    columns=['key1', 'key2', 'C', 'D'],
)
print(left)
print(right)

  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3


In [36]:
res = pd.merge(left, right, on='key1')  # merge on a single key column; the default how is 'inner'
print(res)

  key1 key2_x   A   B key2_y   C   D
0   K0     K0  A0  B0     K0  C0  D0
1   K0     K1  A1  B1     K0  C0  D0
2   K1     K0  A2  B2     K0  C1  D1
3   K1     K0  A2  B2     K0  C2  D2
4   K2     K1  A3  B3     K0  C3  D3


In [37]:
# Merge on the combination of both key columns
res1 = pd.merge(left, right, on=['key1', 'key2'])
print(res1)

  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2


In [38]:
# how can be 'inner', 'outer', 'left' or 'right'; 'outer' keeps the key pairs found on either side and fills the gaps with NaN
res2 = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res2)

  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3


In [39]:
# Two frames that share the join column col1 but otherwise have different columns.
col_left = pd.DataFrame(dict(col1=[0, 1], col_left=['a', 'b']))
col_right = pd.DataFrame(dict(col1=[1, 2, 2], col_right=[2] * 3))
print(col_left)
print(col_right)

   col1 col_left
0     0        a
1     1        b
   col1  col_right
0     1          2
1     2          2
2     2          2


In [40]:
res3 = pd.merge(col_left, col_right, on='col1', how='outer', indicator=True)   # indicator=True adds a _merge column naming each row's origin (left_only, right_only or both)
print(res3)

   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only


In [41]:
# Merge on the index instead of a column
res4 = pd.merge(col_left, col_right, left_index=True, right_index=True, how='outer')
print(res4)

   col1_x col_left  col1_y  col_right
0     0.0        a       1          2
1     1.0        b       2          2
2     NaN      NaN       2          2


## pandas处理独热编码one-hot

In [42]:
s1 = ['a', 'b', np.nan]

In [43]:
pd.get_dummies(s1)   # missing values are ignored by default: NaN gets no column of its own

Unnamed: 0,a,b
0,1,0
1,0,1
2,0,0


In [44]:
pd.get_dummies(s1, dummy_na=True)    # dummy_na=True one-hot encodes the missing value as its own column too

Unnamed: 0,a,b,NaN
0,1,0,0
1,0,1,0
2,0,0,1


# 使用另一dataframe作为当前dataframe的筛选条件

In [45]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [4, 5, 6, 7]})
a = pd.Series([1, 2, 3, 1])

# Filter on the Series: count how many of its elements equal 1
a.eq(1).sum()

2

In [46]:
# Filter the DataFrame: the boolean mask built from Series `a` selects rows of df, then each column is summed
df[a==1].sum(0)

A     5
B    11
dtype: int64

In [4]:
# JSON files can be read as well (CSV is not repeated here as it is too basic); the data is fetched from the URL below
URL = 'https://static.runoob.com/download/sites.json'
df = pd.read_json(URL)
print(df)

     id    name             url  likes
0  A001    菜鸟教程  www.runoob.com     61
1  A002  Google  www.google.com    124
2  A003      淘宝  www.taobao.com     45
