# Pandas数据分析运用

## Pandas基础操作:   使用pandas获取数据和数据筛选，包括读取CSV，EXCEL和TXT等数据

### 1. 常用数据结构

* series类

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 创建Series对象
# 构造序列
series1 = pd.Series([2.8,3.01,8.99,8.59,5.18])

In [None]:
series1

In [None]:
series2 = pd.Series([2.8,3.01,8.99,8.59,5.18],index = ['a','b','c','d','e'],name ='这是一个series')

In [None]:
series2

In [None]:
series3 = pd.Series(np.array((2.8,3.10,8.99,8.59,5.18)),index = ['a','b','c','d','e'])

In [None]:
series3

In [None]:
series4 = pd.Series({'北京':2.8,'上海':3.01,'广东':8.99,'江苏':8.59,'浙江':5.18})

In [None]:
series4

In [None]:
type(series4)

In [None]:
# series属性
series2.values

In [None]:
series2.index

In [None]:
series2.dtype

In [None]:
series2.shape

In [None]:
series2.ndim

In [None]:
series2.size

In [None]:
# 访问series
series4[0]

In [None]:
series4[0:4]# 通过位置访问

In [None]:
series4['北京':'广东'] #通过标签或者索引名称

In [None]:
# 数据的更新，插入和删除
series4['北京'] = 2.81

In [None]:
series4

In [None]:
series5 = pd.Series({'四川':3.80,'重庆': 2.01})

In [None]:
series5

In [None]:
series6 = series4.append(series5)

In [None]:
series6

In [None]:
series6.drop('四川',inplace = True) #记得加inplace=True,否则不会作用在原数据上

* dataFrame类

In [None]:
# 构造数据框
# 数据框其实就是一个二维表结构，是数据分析中，最常用的数据结构
list1 = [['张三',23,'男'],['李四',27,'女'],['王二',26,'女']]#使用嵌套列表
df1 = pd.DataFrame(list1,columns=['姓名','年龄','性别'])

In [None]:
type(df1)

In [None]:
df2 = pd.DataFrame({'姓名':['张三','李四','王二'],'年龄':[23,27,26],'性别':['男','女','女']}) #使用字典,字典的键被当成列名

In [None]:
df2

In [None]:
array1 = np.array([['张三',23,'男'],['李四',27,'女'],['王二', 26,'女']])# 使用numpy
df3 = pd.DataFrame(array1,columns=['姓名','年龄','性别'],index = ['a','b','c'] )

In [None]:
#dataframe属性
df2.values

In [None]:
df2.index

In [None]:
df2.columns

In [None]:
df2.dtypes

In [None]:
df2.ndim

In [None]:
df2.size

### 2. 数据获取和保存

* CSV文件读取
* read_csv(file,sep, header,names ,index_col,dtype,nrows,encoding)
* 可以先用os改变数据存放路劲

In [None]:
import os
os.chdir(r'E:\云开明培训机构\云开见明培训课件\data summary\第三章')#数据存放路径(改为)

In [None]:
df = pd.read_csv('meal_order_info.csv',encoding = 'gbk')# gbk是中文编码，utf-8是默认编码

In [None]:
df.dtypes

In [None]:
df = pd.read_csv('meal_order_info.csv',encoding = 'gbk', dtype = {'info_id':str,'emp_id':str,'phone':str})

In [None]:
df = pd.read_csv('meal_order_info.csv',encoding = 'gbk', dtype = {'info_id':str,'emp_id':str,'phone':str},nrows =10)

In [None]:
df

* 读取excel文件
* excel相比CSV，参数都差不多，读取EXCEL需要考虑工作表名称

In [None]:
df1 = pd.read_excel('meal_order_detail.xlsx',encoding = 'gbk',sheet_name = 'meal_order_detail1',dtype = {'detail_id':str,'order_id':str,'dishes_id':str})# gbk是中文编码，utf-8是默认编码

In [None]:
df

In [None]:
sheet_names = ['meal_order_detail' + str(i) for i in range(1,4)]

In [None]:
sheet_names

In [None]:
#批量读取Excel中的所有工作表
data_all = pd.DataFrame()
for i in sheet_names:
    data  = pd.read_excel('meal_order_detail.xlsx',header=0,sheet_name=i,encoding='gbk')
    data_all = pd.concat([data_all,data],axis=0,ignore_index=True)#合并

In [None]:
data_all.shape

* 数据保存

In [None]:
df.to_csv('a1.csv', index=False,encoding='gbk')  # 不写出索引列

In [None]:
df.to_excel('a1.xlsx', sheet_name='a1', index=False)

 * 数据筛选

In [None]:
order = pd.read_excel('meal_order_detail.xlsx',encoding = 'gbk',sheet_name = 'meal_order_detail1',dtype = {'detail_id':str,'order_id':str,'dishes_id':str})# gbk是中文编码，utf-8是默认编码

In [None]:
order.head(5) #查看前5行

In [None]:
order.tail(5) #查看后5行

In [None]:
order.columns #查看变量名称

In [None]:
order.dtypes #查看变量类型

In [None]:
print('订单数据的元素个数为:', order.size)
print('订单数据的维度个数为:', order.ndim)
print('订单数据的形状为:', order.shape)

In [None]:
order[:5] #前5行

In [None]:
order['dishes_name'][:5]  #选择该变量的前5行

In [None]:
order['dishes_name'] = order['dishes_name'].apply(lambda  x: str(x).rstrip()) #消除右边空格

In [None]:
order[['order_id','dishes_name']][:5]   

In [None]:
# 选择列
order['dishes_name']
#order.dishes_name

In [None]:
order[['order_id','dishes_name']]#选择多个变量

In [None]:
# 比较loc和iloc的用法

In [None]:
order.loc[:,'dishes_name'] #选择某一列

In [None]:
order.loc[:,['order_id','dishes_name']] #选择多列

In [None]:
order.loc[0:2,['order_id','dishes_name']] #选择对应的行名称和多列

In [None]:
order.loc[order['order_id'] ==458,['order_id','dishes_name']] #按照条件选择

In [None]:
order.loc[3:5,['order_id','dishes_name']] #选择对应的行名称和多列

In [None]:
order.loc[3:5] 

In [None]:
order.iloc[:,1:4] #按照位置来选择第二列到第四列

In [None]:
order.iloc[:,[0,2]] # 按照位置来选择第1列和第3列

In [None]:
order.iloc[3,[1,2]] #选择第4行，第2列和第3列数据

In [None]:
order.iloc[2:7,[1,2]] #选择第3行到第7行，第2列和第3列数据

In [None]:
order.loc[0:6] #选择数据前7行,loc选择针对的是索引名称

In [None]:
order.iloc[0:6] #选择数据前6行,iloc选择针对的是位置

In [None]:
# 条件查询
order[order.order_id==458] #order_id=458的所有数据

In [224]:
order[['dishes_id', 'dishes_name']][order.order_id==458]

Unnamed: 0,Mid,detail_id,order_id,dishes_id,logicprn_name,parent_class_name,dishes_name,itemis_add,counts,amounts,cost,place_order_time,discount_amt,discount_reason,kick_back,add_inprice,add_info,bar_code,picture_file


In [None]:
order[['dishes_id', 'dishes_name']][(order.order_id==458) & (order['amounts'] >3)] #    &代表and

In [None]:
order[['dishes_id', 'dishes_name']][(order.order_id==458) | (order['amounts'] >3)] #    |代表或

In [None]:
order[['dishes_id', 'dishes_name']][~(order.order_id==458)] # ~代表非

In [None]:
#使用between 
order[['dishes_id', 'dishes_name','amounts']][order['amounts'].between(10,30,inclusive=True)]

In [None]:
#使用pd.isin()
order[['dishes_id', 'dishes_name']][order['dishes_name'].isin(['蒙古烤羊腿','大蒜苋菜'])] # dishes_name里面带有['蒙古烤羊腿','大蒜苋菜']

In [None]:
order[['dishes_id', 'dishes_name']][order['dishes_name'].str.contains('烤')] #带烤字的条件

*  增删改查
* drop(labels,axis,inplace =True)
* labels表示删除行或者列的标签,axis表示行或列，inplace=True表示是否对原数据生效)
* drop(axis = 0) 指按行操作
* drop(axis = 1) 指按列操作


In [None]:
order['payment'] = order['counts']* order['amounts'] #增加一列

In [None]:
order['pay_way'] ='现金支付' #增加一列

In [None]:
order.drop('pay_way',axis=1,inplace=True) #删除这一列 ,加inplace代表是否在原数据上操作

In [None]:
#或者使用
del order['pay_way']

* 假设我们希望emp_id位于第一列，怎么办
* df.insert(位置,变量名称，取值)

In [None]:
mid = order['emp_id'] 
order.drop(labels=['emp_id'], axis=1,inplace = True)
#先将这一列取出来，命名为mid,然后在数据中删除这一列，再将其添加进去

In [None]:
order.insert(0, 'Mid', mid)

In [None]:
#查看
order.head(50) 

In [None]:
# 同时删除多个变量

In [None]:
order.drop(['pay_way','payment'],axis=1,inplace=True) #删除这两列,加inplace代表是否在原数据上操作

In [None]:
order.columns #查看现有的字段名

In [None]:
# 按行删除

In [None]:
order.drop(labels = [3,4],inplace = True) # 删除索引为2对应的行

In [None]:
order.drop(labels= range(1,11),axis=0,inplace=True) #删除1到10行

In [221]:
# 修改数据

In [None]:
order.loc[order['order_id']==458,'order_id'] = 45800 #修改数据（按照条件)

In [None]:
order[order['order_id'] ==45800].head(10)

In [None]:
#修改数据
order.rename(columns = {'amounts':'payment'},inplace = True)

In [None]:
order.describe().loc['count'] == 0 #判断计数是否为0