### 参考网址： https://www.runoob.com/pandas/pandas-series.html

In [399]:
import pandas as pd

# Build a Series holding 0..9, labelled 'a' through 'j'.
x = list(range(10))
s = pd.Series(x, index=list("abcdefghij"))
print(s)
print("---" * 8)
# Integer slice: positional and end-exclusive (positions 1 through 4).
print(s.iloc[1:5])
print("---" * 8)
# Label slice: both endpoints are included ('b' through 'f').
print(s.loc['b':'f'])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
h    7
i    8
j    9
dtype: int64
------------------------
b    1
c    2
d    3
e    4
dtype: int64
------------------------
b    1
c    2
d    3
e    4
f    5
dtype: int64


In [400]:
# Edit the Series in place: overwrite 'a', remove 'b', then append a new label.
s.loc['a'] = 6666
del s['b']
s.loc['what'] = 8888  # assigning to a label that does not exist adds a new entry
print(s)

a       6666
c          2
d          3
e          4
f          5
g          6
h          7
i          8
j          9
what    8888
dtype: int64


In [401]:
# drop() returns a new Series without the label 'what'; s keeps all of its entries.
s_dropped = s.drop(labels=['what'])
print(s_dropped)
print("---" * 8)
print(s)  # still contains 'what'

a    6666
c       2
d       3
e       4
f       5
g       6
h       7
i       8
j       9
dtype: int64
------------------------
a       6666
c          2
d          3
e          4
f          5
g          6
h          7
i          8
j          9
what    8888
dtype: int64


In [402]:
# Boolean-mask selection: keep only the entries whose value is even.
filtered_series = s.loc[s.mod(2).eq(0)]
filtered_series

a       6666
c          2
e          4
g          6
i          8
what    8888
dtype: int64

In [403]:
# Inspect the Series: index labels, raw values, element count and summary statistics.
for summary in (s.index, s.values, s.size, s.describe()):
    print(summary)

Index(['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'what'], dtype='object')
[6666    2    3    4    5    6    7    8    9 8888]
10
count      10.000000
mean     1559.800000
std      3318.343355
min         2.000000
25%         4.250000
50%         6.500000
75%         8.750000
max      8888.000000
dtype: float64


In [404]:
# An element-wise comparison yields a boolean Series that shares s's index.
d = s.mod(2).eq(0)
print(s, "---" * 8, d, sep="\n")

a       6666
c          2
d          3
e          4
f          5
g          6
h          7
i          8
j          9
what    8888
dtype: int64
------------------------
a        True
c        True
d       False
e        True
f       False
g        True
h       False
i        True
j       False
what     True
dtype: bool


## ------------------------------------------------------------------------

### dataframe可视为一系列 series列 组成的集合，因此df['c1']取出的是一列数据

In [405]:
import pandas as pd

# Build a DataFrame from a list of rows, with explicit column names and row labels.
data = [['a', 11], ['b', 22], ['c', 33]]
df = pd.DataFrame(
    data,
    columns=['column_1', 'column_2'],
    index=['index_1', 'index_2', 'index_3'],
)
print(df)
print("-" * 24)
# astype() returns a converted copy, so the result is assigned back to df.
df = df.astype({'column_2': float})
print(df)

        column_1  column_2
index_1        a        11
index_2        b        22
index_3        c        33
------------------------
        column_1  column_2
index_1        a      11.0
index_2        b      22.0
index_3        c      33.0


### 使用字典+值列表 逐列创建：各列表的长度必须一致，否则会抛出 ValueError

In [406]:
import pandas as pd
# Column-wise construction: each dict key becomes a column, each list its values.
data_2 = dict(column_1=['a', 'b', 'c'], column_2=[11, 22, 33])
df = pd.DataFrame(data=data_2)
print(df)

  column_1  column_2
0        a        11
1        b        22
2        c        33


### 使用列表+字典 逐行创建：各字典的键可以不一致，缺失的值会自动填充为 NaN

In [407]:
import pandas as pd
# Row-wise construction: one dict per row; the columns are the union of all keys.
data_3 = [dict(column_1='a'), dict(column_1='b', column_2=22)]
df_3 = pd.DataFrame(data=data_3)
print(df_3)

  column_1  column_2
0        a       NaN
1        b      22.0


### loc 属性按行标签返回指定行的数据：未设置索引时默认行标签为 0, 1, 2, …；设置了自定义索引后，loc 只能使用这些标签，不能再用 0 取第一行（按位置访问请使用 iloc）

In [408]:
# Rebuild the labelled DataFrame from the row list `data`.
df = pd.DataFrame(
    data,
    columns=['column_1', 'column_2'],
    index=['index_1', 'index_2', 'index_3'],
)
print(df)
print("-"*24)
single_row = df.loc['index_1']  # one label -> the row comes back as a Series
print(type(single_row))
print("-"*24)
print(single_row)
print("-*"*12)
two_rows = df.loc[['index_2', 'index_3']]  # list of labels -> a DataFrame
print(type(two_rows))
two_rows

        column_1  column_2
index_1        a        11
index_2        b        22
index_3        c        33
------------------------
<class 'pandas.core.series.Series'>
------------------------
column_1     a
column_2    11
Name: index_1, dtype: object
-*-*-*-*-*-*-*-*-*-*-*-*
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,column_1,column_2
index_2,b,22
index_3,c,33


In [409]:
# .loc takes a row selector and a column selector: here a boolean mask on
# column_2 combined with a label slice of columns.
print(df.loc[df['column_2'].gt(12), 'column_1':'column_2'])
print("-*"*12)
# Same pattern, with the columns given as an explicit list.
print(df.loc[df['column_2'].gt(24), ['column_1', 'column_2']])

        column_1  column_2
index_2        b        22
index_3        c        33
-*-*-*-*-*-*-*-*-*-*-*-*
        column_1  column_2
index_3        c        33


In [410]:
# Quick overview of df_3: leading rows, shape, column info and summary statistics.
print(df_3.head())    # leading rows; head() shows 5 by default
print("-*"*12)
print(df_3.shape)     # (rows, columns)
print("-*"*12)
print(df_3.info())    # info() prints its report itself and returns None, so print() also emits "None"
print("-*"*12)
print(df_3.describe())# descriptive statistics

  column_1  column_2
0        a       NaN
1        b      22.0
-*-*-*-*-*-*-*-*-*-*-*-*
(2, 2)
-*-*-*-*-*-*-*-*-*-*-*-*
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   column_1  2 non-null      object 
 1   column_2  1 non-null      float64
dtypes: float64(1), object(1)
memory usage: 160.0+ bytes
None
-*-*-*-*-*-*-*-*-*-*-*-*
       column_2
count       1.0
mean       22.0
std         NaN
min        22.0
25%        22.0
50%        22.0
75%        22.0
max        22.0


### 使用 列名 作为属性或通过 .loc[]、.iloc[] 访问，也可以使用标签或位置索引

#### DataFrame.loc[行标签,列标签]
#### DataFrame.iloc[行位置,列位置] (index location)

In [411]:
print(df)
print("--"*12)
print(df['column_1'])  # select a column by name
print("--"*12)
print(df.column_1)  # select the same column via attribute access
print("--"*12)
print(df.loc[:, 'column_1'])  # .loc: all rows, the column labelled 'column_1'
print("-*"*12)
print(df.iloc[:, 1])  # .iloc: all rows, the column at position 1
print("--"*12)
# Single elements. Positional access goes through .iloc: a plain integer key
# such as df['column_1'][0] on a string-labelled Series relies on a positional
# fallback that pandas has deprecated and newer releases reject.
print(df['column_1'].iloc[0])
print(df['column_1']['index_1'])  # by row label
print(df.loc['index_1', 'column_1'])  # row label and column label in one call

        column_1  column_2
index_1        a        11
index_2        b        22
index_3        c        33
------------------------
index_1    a
index_2    b
index_3    c
Name: column_1, dtype: object
------------------------
index_1    a
index_2    b
index_3    c
Name: column_1, dtype: object
------------------------
index_1    a
index_2    b
index_3    c
Name: column_1, dtype: object
-*-*-*-*-*-*-*-*-*-*-*-*
index_1    11
index_2    22
index_3    33
Name: column_2, dtype: int64
------------------------
a
a
a


### 数据添加和修改

In [412]:
# Show the frame as it stands before the column and row are added below.
print(df)

        column_1  column_2
index_1        a        11
index_2        b        22
index_3        c        33


In [413]:
# Add a column: assign() returns a frame with 'column_3' appended, rebound to df.
df = df.assign(column_3=[44, 55, 66])
print(df)

        column_1  column_2  column_3
index_1        a        11        44
index_2        b        22        55
index_3        c        33        66


In [414]:
# Add a row: assigning through .loc to a label that does not exist yet appends it.
df.loc['index_4'] = ['d', 77, 88] # one value per column, in column order
print(df)

        column_1  column_2  column_3
index_1        a        11        44
index_2        b        22        55
index_3        c        33        66
index_4        d        77        88


### 合并dataframe
#### concat: 纵向合并
#### merge: 横向合并，可选择内连接, 左连接, 右连接, 外连接

In [415]:
# Two extra frames with the same columns as df, built column by column.
df2 = pd.DataFrame(
    {'column_1': ['e', 'f'], 'column_2': [99, 111], 'column_3': [100, 122]},
    index=['index_5', 'index_6'],
)
print(df2)
print("*-"*12)
df3 = pd.DataFrame(
    {'column_1': ['g'], 'column_2': [222], 'column_3': [133]},
    index=['index_7'],
)
# Stack the frames vertically (axis=0); the new rows are appended after df's rows.
df = pd.concat([df, df2, df3], axis=0)
print(df)

        column_1  column_2  column_3
index_5        e        99       100
index_6        f       111       122
*-*-*-*-*-*-*-*-*-*-*-*-
        column_1  column_2  column_3
index_1        a        11        44
index_2        b        22        55
index_3        c        33        66
index_4        d        77        88
index_5        e        99       100
index_6        f       111       122
index_7        g       222       133


In [416]:
# Lookup table that shares 'column_3' with df and contributes a new 'column_4'.
df44 = pd.DataFrame(
    {'column_3': [44, 66, 88], 'column_4': [1212, 3434, 5656]},
    index=['index_1', 'index_2', 'index_3'],
)
print(df)
print(df44)
print("-*"*12)
# Inner join on the shared column; only rows whose column_3 value appears in
# both frames survive, and the result gets a fresh 0..n-1 index.
df55 = df.merge(df44, how='inner', on='column_3')
print(df55)

        column_1  column_2  column_3
index_1        a        11        44
index_2        b        22        55
index_3        c        33        66
index_4        d        77        88
index_5        e        99       100
index_6        f       111       122
index_7        g       222       133
         column_3  column_4
index_1        44      1212
index_2        66      3434
index_3        88      5656
-*-*-*-*-*-*-*-*-*-*-*-*
  column_1  column_2  column_3  column_4
0        a        11        44      1212
1        c        33        66      3434
2        d        77        88      5656


In [417]:
# Pairwise correlation between the numeric columns (Pearson by default).
# numeric_only=True skips the string column column_1; without it, pandas >= 2.0
# tries to convert that column to float and raises.
df55.corr(numeric_only=True)

Unnamed: 0,column_2,column_3,column_4
column_2,1.0,0.981981,0.981981
column_3,0.981981,1.0,1.0
column_4,0.981981,1.0,1.0


In [418]:
# Spearman rank correlation between the numeric columns. numeric_only=True
# skips the string column column_1, which pandas >= 2.0 would otherwise fail on.
df55.corr(method='spearman', numeric_only=True)

Unnamed: 0,column_2,column_3,column_4
column_2,1.0,1.0,1.0
column_3,1.0,1.0,1.0
column_4,1.0,1.0,1.0


### 删除 列或行
#### axis:删除时所参考的轴，0为行，1为列
#### inplace:是否在原数据上操作,默认是复制一份，不在原数据上操作

In [419]:
# Drop a column; without inplace=True, drop() returns a new frame.
df_dropped = df.drop(columns='column_3')
print(df_dropped)

        column_1  column_2
index_1        a        11
index_2        b        22
index_3        c        33
index_4        d        77
index_5        e        99
index_6        f       111
index_7        g       222


In [420]:
# Drop the row labelled 'index_7' (rows are the default axis for drop()).
df_dropped = df_dropped.drop(index='index_7')
print(df_dropped)

        column_1  column_2
index_1        a        11
index_2        b        22
index_3        c        33
index_4        d        77
index_5        e        99
index_6        f       111
