## 生成数据表 

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame(
    {
        "id": [1001, 1002, 1003, 1004, 1005, 1006],
        "date":
        pd.date_range('20130102', periods=6),
        "city":
        ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        "age": [23, 44, 54, 32, 34, 32],
        "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        "price": [1200, np.nan, 2133, 5433, np.nan, 4432]
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price'])

## 信息表查看 

###  维度查看

In [3]:
df.shape

(6, 6)

### 数据表基本信息（维度、列名称、数据格式、所占空间等）

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        6 non-null      int64         
 1   date      6 non-null      datetime64[ns]
 2   city      6 non-null      object        
 3   category  6 non-null      object        
 4   age       6 non-null      int64         
 5   price     4 non-null      float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 416.0+ bytes


### 数据表统计信息（平均值，最大值，最小值等）

In [93]:
df[['age','price']].describe()

Unnamed: 0,age,price
count,6.0,4.0
mean,36.5,3299.5
std,10.87658,1966.638503
min,23.0,1200.0
25%,32.0,1899.75
50%,33.0,3282.5
75%,41.5,4682.25
max,54.0,5433.0


### 每一列数据的格式

In [5]:
df.dtypes

id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object

### 某一列格式

In [6]:
df['age'].dtype

dtype('int64')

### 空值

In [7]:
df.isnull()

Unnamed: 0,id,date,city,category,age,price
0,False,False,False,False,False,False
1,False,False,False,False,False,True
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False


### 查看某一列的空值

In [8]:
df['age'].isnull()

0    False
1    False
2    False
3    False
4    False
5    False
Name: age, dtype: bool

### 查看某一列的唯一值

In [9]:
df['age'].unique()

array([23, 44, 54, 32, 34])

### 查看数据表的值

In [10]:
df.values

array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing ', '100-A', 23,
        1200.0],
       [1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],
       [1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A',
        54, 2133.0],
       [1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,
        5433.0],
       [1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,
        nan],
       [1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,
        4432.0]], dtype=object)

### 查看列的名称

In [11]:
df.columns

Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')

### 查看前五行

In [12]:
df.head()

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,


### 后五行数据

In [13]:
df.tail()

Unnamed: 0,id,date,city,category,age,price
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


## 数据表清洗

### 用0填充空值

In [14]:
df.fillna(value=0)

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,0.0
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,0.0
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


### 用price的均值对price的NA进行填充

In [15]:
df['price'].fillna(value=(df['price'].mean()))

0    1200.0
1    3299.5
2    2133.0
3    5433.0
4    3299.5
5    4432.0
Name: price, dtype: float64

### 清除city字段的字符空格

In [16]:
df['city']=df['city'].map(str.strip)

### 大小写转换

In [17]:
df['city'] = df['city'].str.lower()

### 更改数据格式

In [18]:
df['price'].astype('int64')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

### 更改列名称

In [None]:
df.rename(columns={'category':'category-size'})

### 删除后出现的重复你值

In [None]:
df['city'].drop_duplicates()

### 删除先出现的重复值

In [19]:
df['city'].drop_duplicates(keep='last')

1           sh
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object

### 数据替换

In [20]:
df['city'].replace('sh','shanghai')

0      beijing
1     shanghai
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object

## 数据预处理

In [21]:
df1=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008], 
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y',],
"m-point":[10,12,20,40,40,40,30,20]})


### 数据表合并

#### merge

In [23]:
df_inner = pd.merge(df,df1,how='inner')
df_inner

Unnamed: 0,id,date,city,category,age,price,gender,pay,m-point
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10
1,1002,2013-01-03,sh,100-B,44,,female,N,12
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40


In [24]:
df_left = pd.merge(df,df1,how='left')
df_left

Unnamed: 0,id,date,city,category,age,price,gender,pay,m-point
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10
1,1002,2013-01-03,sh,100-B,44,,female,N,12
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40


In [26]:
# 笛卡尔积
df_cross = pd.merge(df,df1,how='cross')
df_cross

Unnamed: 0,id_x,date,city,category,age,price,id_y,gender,pay,m-point
0,1001,2013-01-02,beijing,100-A,23,1200.0,1001,male,Y,10
1,1001,2013-01-02,beijing,100-A,23,1200.0,1002,female,N,12
2,1001,2013-01-02,beijing,100-A,23,1200.0,1003,male,Y,20
3,1001,2013-01-02,beijing,100-A,23,1200.0,1004,female,Y,40
4,1001,2013-01-02,beijing,100-A,23,1200.0,1005,male,N,40
5,1001,2013-01-02,beijing,100-A,23,1200.0,1006,female,Y,40
6,1001,2013-01-02,beijing,100-A,23,1200.0,1007,male,N,30
7,1001,2013-01-02,beijing,100-A,23,1200.0,1008,female,Y,20
8,1002,2013-01-03,sh,100-B,44,,1001,male,Y,10
9,1002,2013-01-03,sh,100-B,44,,1002,female,N,12


#### append

In [28]:
result = df.append(df1)
result

  result = df.append(df1)


Unnamed: 0,id,date,city,category,age,price,gender,pay,m-point
0,1001,2013-01-02,beijing,100-A,23.0,1200.0,,,
1,1002,2013-01-03,sh,100-B,44.0,,,,
2,1003,2013-01-04,guangzhou,110-A,54.0,2133.0,,,
3,1004,2013-01-05,shenzhen,110-C,32.0,5433.0,,,
4,1005,2013-01-06,shanghai,210-A,34.0,,,,
5,1006,2013-01-07,beijing,130-F,32.0,4432.0,,,
0,1001,NaT,,,,,male,Y,10.0
1,1002,NaT,,,,,female,N,12.0
2,1003,NaT,,,,,male,Y,20.0
3,1004,NaT,,,,,female,Y,40.0


#### join

In [41]:
df2 = df1[['gender','pay']]
result = df.join(df2)
result

Unnamed: 0,id,date,city,category,age,price,gender,pay
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y
1,1002,2013-01-03,sh,100-B,44,,female,N
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y
4,1005,2013-01-06,shanghai,210-A,34,,male,N
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y


#### concat

In [44]:
df,df1,df2

(     id       date       city category  age   price
 0  1001 2013-01-02    beijing    100-A   23  1200.0
 1  1002 2013-01-03         sh    100-B   44     NaN
 2  1003 2013-01-04  guangzhou    110-A   54  2133.0
 3  1004 2013-01-05   shenzhen    110-C   32  5433.0
 4  1005 2013-01-06   shanghai    210-A   34     NaN
 5  1006 2013-01-07    beijing    130-F   32  4432.0,
      id  gender pay  m-point
 0  1001    male   Y       10
 1  1002  female   N       12
 2  1003    male   Y       20
 3  1004  female   Y       40
 4  1005    male   N       40
 5  1006  female   Y       40
 6  1007    male   N       30
 7  1008  female   Y       20,
    gender pay
 0    male   Y
 1  female   N
 2    male   Y
 3  female   Y
 4    male   N
 5  female   Y
 6    male   N
 7  female   Y)

In [52]:
frames = [df1,df2]
pd.concat(frames,join='inner',axis=1)

Unnamed: 0,id,gender,pay,m-point,gender.1,pay.1
0,1001,male,Y,10,male,Y
1,1002,female,N,12,female,N
2,1003,male,Y,20,male,Y
3,1004,female,Y,40,female,Y
4,1005,male,N,40,male,N
5,1006,female,Y,40,female,Y
6,1007,male,N,30,male,N
7,1008,female,Y,20,female,Y


### 设置索引列

In [53]:
df_inner.set_index('id')

Unnamed: 0_level_0,date,city,category,age,price,gender,pay,m-point
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10
1002,2013-01-03,sh,100-B,44,,female,N,12
1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20
1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40
1005,2013-01-06,shanghai,210-A,34,,male,N,40
1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40


### 按照索引列排序

In [54]:
df_inner.sort_index()

Unnamed: 0,id,date,city,category,age,price,gender,pay,m-point
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10
1,1002,2013-01-03,sh,100-B,44,,female,N,12
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40


### 按照特定列的值排序

In [56]:
df_inner.sort_values(by=['age'])

Unnamed: 0,id,date,city,category,age,price,gender,pay,m-point
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40
1,1002,2013-01-03,sh,100-B,44,,female,N,12
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20


### 如果price列的值>3000，group显示high，否则显示low

In [58]:
df_inner['group'] = np.where(df_inner['price']>3000,'high','low')
df_inner

Unnamed: 0,id,date,city,category,age,price,gender,pay,m-point,group
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high


### 对符合多个条件的数据进行分组标记

0    False
1    False
2    False
3     True
4    False
5     True
Name: price, dtype: bool

In [60]:
df_inner.loc[(df_inner['city']=='beijing') & (df_inner['price']>=4000),'sign'] = 1
df_inner

Unnamed: 0,id,date,city,category,age,price,gender,pay,m-point,group,sign
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high,1.0


### 对category字段的值依次进行分列，并创建数据表，索引值为df_inner的索引列，列名称为category和size

In [69]:
df_inner['category']

0    100-A
1    100-B
2    110-A
3    110-C
4    210-A
5    130-F
Name: category, dtype: object

In [73]:
split = pd.DataFrame((x.split('-') for x in df_inner['category']),index=df_inner.index,columns=['category','size'])
split

Unnamed: 0,category,size
0,100,A
1,100,B
2,110,A
3,110,C
4,210,A
5,130,F


### 将完成分裂后的数据表和原df_inner数据表进行匹配

In [74]:
df_inner = pd.merge(df_inner,split,right_index=True,left_index=True)
df_inner

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low,,100,B
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high,1.0,130,F


## 数据提取

### & “与”

In [80]:
df_inner

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low,,100,B
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high,1.0,130,F


In [79]:
df_inner.loc[((df_inner['age']>25) & (df_inner['city']=='beijing')),['id','city','age','gender']]

Unnamed: 0,id,city,age,gender
5,1006,beijing,32,female


### ｜ “或”

In [81]:
df_inner.loc[((df_inner['age']>25) | (df_inner['city']=='beijing')),['id','city','age','gender']]

Unnamed: 0,id,city,age,gender
0,1001,beijing,23,male
1,1002,sh,44,female
2,1003,guangzhou,54,male
3,1004,shenzhen,32,female
4,1005,shanghai,34,male
5,1006,beijing,32,female


### ！ “非”

In [82]:
df_inner.loc[(df_inner['city']!='beijing'),['id','city','age','gender']]

Unnamed: 0,id,city,age,gender
1,1002,sh,44,female
2,1003,guangzhou,54,male
3,1004,shenzhen,32,female
4,1005,shanghai,34,male


### 对筛选后的数据按city进行计数

In [85]:
df_inner.loc[(df_inner['city']!='beijing'),['id','city','age','gender']].sort_values(['id']).city.count()

4

### 使用query函数进行筛选

In [86]:
df_inner.query('city == ["beijing","shanghai"]')

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high,1.0,130,F


### 对筛选后的结果按price进行求和

In [87]:
df_inner.query('city == ["beijing","shanghai"]').price.sum()

5632.0

## 数据汇总

### 对所有的列进行计数汇总

In [90]:
df_inner

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low,,100,B
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high,1.0,130,F


In [88]:
df_inner.groupby('city').count()

Unnamed: 0_level_0,id,date,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
beijing,2,2,2,2,2,2,2,2,2,1,2,2
guangzhou,1,1,1,1,1,1,1,1,1,0,1,1
sh,1,1,1,1,0,1,1,1,1,0,1,1
shanghai,1,1,1,1,0,1,1,1,1,0,1,1
shenzhen,1,1,1,1,1,1,1,1,1,0,1,1


### 按city对id字段进行计数

In [89]:
df_inner.groupby('city')['id'].count()

city
beijing      2
guangzhou    1
sh           1
shanghai     1
shenzhen     1
Name: id, dtype: int64

### 对两个字段进行汇总计数

In [91]:
df_inner.groupby(['city','size'])['id'].count()

city       size
beijing    A       1
           F       1
guangzhou  A       1
sh         B       1
shanghai   A       1
shenzhen   C       1
Name: id, dtype: int64

### 对city字段进行汇总，并分别计算price对合计和均值

In [92]:
df_inner.groupby('city')['price'].agg([len,np.sum,np.mean])

Unnamed: 0_level_0,len,sum,mean
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
beijing,2,5632.0,2816.0
guangzhou,1,2133.0,2133.0
sh,1,0.0,
shanghai,1,0.0,
shenzhen,1,5433.0,5433.0


## 数据统计

### 简单的数据采样

In [94]:
df_inner.sample(n=3)

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low,,100,B
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A


In [96]:
df_inner

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low,,100,B
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high,1.0,130,F


### 手动设置采样权重

In [100]:
# weights每条数据采样到的概率
weights = [0,0,0,0.2,0.4,0.4]
df_inner.sample(n=2,weights=weights)

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A


### 采样后不放回

In [112]:
# 不允许重复取样
df_inner.sample(n=6,replace=False)

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,Y,40,high,1.0,130,F
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
1,1002,2013-01-03,sh,100-B,44,,female,N,12,low,,100,B
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A


### 采样后放回

In [115]:
# 允许重复取样
df_inner.sample(n=6,replace=True)

Unnamed: 0,id,date,city,category_x,age,price,gender,pay,m-point,group,sign,category_y,size
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,Y,40,high,,110,C
4,1005,2013-01-06,shanghai,210-A,34,,male,N,40,low,,210,A
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,Y,20,low,,110,A
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,Y,10,low,,100,A


### 数据表描述性统计

In [116]:
# round函数设置显示小数位，T表示转置
df_inner.describe().round(2).T 

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,6.0,1003.5,1.87,1001.0,1002.25,1003.5,1004.75,1006.0
age,6.0,36.5,10.88,23.0,32.0,33.0,41.5,54.0
price,4.0,3299.5,1966.64,1200.0,1899.75,3282.5,4682.25,5433.0
m-point,6.0,27.0,14.63,10.0,14.0,30.0,40.0,40.0
sign,1.0,1.0,,1.0,1.0,1.0,1.0,1.0


### 计算列的标准差

### 计算两个字段间的协方差

### 数据表中所有字段间的协方差

### 两个字段的相关性分析

### 数据表的相关性分析

## 数据输出

### 写入Excel

### 写入CSV