# Task Special 综合练习

In [1]:
import pandas as pd
import numpy as np

## 任务一 企业收入的多样性

【题目描述】  
一个企业的产业收入多样性可以仿照信息熵的概念来定义收入熵指标：$$I=-\sum_{i}p(x_i)\log(p(x_i))$$
其中$p(x_i)$是企业该年某产业收入额占该年所有产业总收入的比重。在`company.csv`中存有需要计算的企业和年份，在`company_data.csv`中存有企业、各类收入额和收入年份的信息。现请利用后一张表中的数据，在前一张表中增加一列表示该公司该年份的收入熵指标$I$。

【数据下载】  
链接：https://pan.baidu.com/s/1leZZctxMUSW55kZY5WwgIw   
密码：u6fd

**解答：**

In [2]:
# Load the list of (company, year) pairs to score; the file's own header row
# is replaced with the short labels used throughout the rest of this task.
company_path = '../data/task_special/task01/Company.csv'
company = pd.read_csv(company_path, names=['Code', 'Date'], header=0)
company.head()

Unnamed: 0,Code,Date
0,#000007,2014
1,#000403,2015
2,#000408,2016
3,#000408,2017
4,#000426,2015


In [3]:
# Load the per-category revenue records, relabel the columns and convert the
# Date column to datetime values.
company_data_path = '../data/task_special/task01/Company_data.csv'
company_data = (
    pd.read_csv(company_data_path, names=['Code', 'Date', 'Type', 'Amount'], header=0)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
company_data.head()

Unnamed: 0,Code,Date,Type,Amount
0,1,2008-12-31,1,10842180000.0
1,1,2008-12-31,2,12597890000.0
2,1,2008-12-31,3,14513120000.0
3,1,2008-12-31,4,1063843000.0
4,1,2008-12-31,5,851388000.0


In [4]:
# Data cleaning, written as one chain so that every step works on a fresh
# frame (assigning a column to the result of dropna() is a chained assignment
# and raises SettingWithCopyWarning).
company_data = (
    company_data
    # 1. Replace negative amounts with their absolute value.
    .assign(Amount=lambda frame: frame['Amount'].abs())
    # 2. Drop the rows whose amount is missing.
    .dropna(how='any', subset=['Amount'])
    # 3. Keep only the year of the (datetime) Date column.
    .assign(Date=lambda frame: frame['Date'].dt.year)
)

In [5]:
# Preview the cleaned table: Amount is now non-negative with missing rows
# removed, and Date holds only the year.
company_data.head()

Unnamed: 0,Code,Date,Type,Amount
0,1,2008,1,10842180000.0
1,1,2008,2,12597890000.0
2,1,2008,3,14513120000.0
3,1,2008,4,1063843000.0
4,1,2008,5,851388000.0


In [6]:
# Share of each revenue category in the company's total revenue for that year.
# transform('sum') returns the group total aligned to the original row index,
# so the division is row-wise; groupby(...).apply with a row-preserving lambda
# returns a result indexed by the group keys on pandas >= 2.0 and can no
# longer be assigned back as a column.
yearly_total = company_data.groupby(['Code', 'Date'])['Amount'].transform('sum')
company_data = company_data.assign(Px=company_data['Amount'] / yearly_total)

In [7]:
# Compute the income entropy I = -sum(p * log(p)) for every (Code, Date) group.
def _income_entropy(shares):
    """Return the natural-log entropy of one group's revenue shares.

    Parameters
    ----------
    shares : pandas.Series
        The ``Px`` values of a single (Code, Date) group.

    Returns
    -------
    float
        ``-sum(p * log(p))`` over the strictly positive shares. A share of
        exactly 0 contributes 0 (the limit of ``p * log(p)``) instead of the
        NaN produced by ``0 * log(0)``. NaN is returned when any share is
        itself NaN (e.g. the group's total amount is 0).
    """
    values = shares.to_numpy(dtype=float)
    if np.isnan(values).any():
        return np.nan
    positive = values[values > 0]
    # "0.0 - x" rather than "-x" so a zero entropy is reported as 0.0, not -0.0.
    return 0.0 - float((positive * np.log(positive)).sum())


company_data_tmp = (
    company_data.groupby(['Code', 'Date'])['Px'].agg(_income_entropy).to_frame()
)

  
  


In [8]:
# Move the group keys back into ordinary columns, then render each code as
# '#' followed by the value left-padded with zeros to six characters.
company_data_tmp = company_data_tmp.reset_index().assign(
    Code=lambda frame: frame['Code'].map('#{0:0>6}'.format)
)
company_data_tmp.head()

Unnamed: 0,Code,Date,Px
0,#000001,2008,2.125238
1,#000001,2009,1.671752
2,#000001,2010,2.108355
3,#000001,2011,3.155371
4,#000001,2012,2.738493


In [9]:
# Left-join the entropy values onto the requested (company, year) pairs and
# relabel the columns for the final report.
report_labels = {'Code':'证券代码', 'Date':'日期', 'Px':'收入熵指标'}
res = pd.merge(company, company_data_tmp, how='left', on=['Code', 'Date'])
res = res.rename(columns=report_labels)
res.head()

Unnamed: 0,证券代码,日期,收入熵指标
0,#000007,2014,
1,#000403,2015,2.790585
2,#000408,2016,
3,#000408,2017,
4,#000426,2015,


In [10]:
# Write the result table to disk without the row index.
result_path = '../data/task_special/task01/task01_result.csv'
res.to_csv(result_path, index=False)

## 任务二 组队学习信息表的变换

【题目描述】  
请把组队学习的队伍信息表变换为如下形态，其中“是否队长”一列取1表示队长，否则为0

<img src="../source/_static/ch_special.png" width="40%">

【数据下载】   
链接：https://pan.baidu.com/s/1ses24cTwUCbMx3rvYXaz-Q  
密码：iz57

**解答：**

In [11]:
# Load the team roster workbook and print its column summary.
team_path = '../data/task_special/task02/组队信息汇总表（Pandas）.xls'
team_data = pd.read_excel(team_path)
team_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 24 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   所在群       21 non-null     object 
 1   队伍名称      21 non-null     object 
 2   队长编号      21 non-null     int64  
 3   队长_群昵称    21 non-null     object 
 4   队员1 编号    21 non-null     int64  
 5   队员_群昵称    21 non-null     object 
 6   队员2 编号    20 non-null     float64
 7   队员_群昵称.1  20 non-null     object 
 8   队员3 编号    18 non-null     float64
 9   队员_群昵称.2  18 non-null     object 
 10  队员4 编号    16 non-null     float64
 11  队员_群昵称.3  16 non-null     object 
 12  队员5 编号    14 non-null     float64
 13  队员_群昵称.4  14 non-null     object 
 14  队员6 编号    13 non-null     float64
 15  队员_群昵称.5  13 non-null     object 
 16  队员7 编号    10 non-null     float64
 17  队员_群昵称.6  10 non-null     object 
 18  队员8 编号    8 non-null      float64
 19  队员_群昵称.7  8 non-null      object 
 20  队员9 编号    4 non-null      float64


## 任务三 美国大选投票情况

【题目描述】  
两张数据表中分别给出了美国各县（county）的人口数以及大选的投票情况，请解决以下问题：
- 有多少县满足总投票数超过县人口数的一半
- 把州（state）作为行索引，把投票候选人作为列名，列名的顺序按照候选人在全美的总票数由高到低排序，行列对应的元素为该候选人在该州获得的总票数

    \# 此处是一个样例，实际的州或人名用原表的英语代替

||拜登|川普|
|---|---|---|
|威斯康星州|2|1|
|德克萨斯州|3|4|

- 每一个州下设若干县，定义拜登在该县的得票率减去川普在该县的得票率为该县的BT指标，若某个州所有县BT指标的中位数大于0，则称该州为Biden State，请找出所有的Biden State

【数据下载】  
链接：https://pan.baidu.com/s/182rr3CpstVux2CFdFd_Pcg  
提取码：q674

**解答：**

**第1问：有多少县满足总投票数超过县人口数的一半？**

In [12]:
# Read county_population.csv.
county_population = pd.read_csv('../data/task_special/task03/county_population.csv')
# Split 'US County' once, at the first comma-plus-one-character separator.
# expand=True returns the two parts as columns; tuple-unpacking the `.str`
# accessor relied on iterating it, which was deprecated and then removed in
# pandas 2.0, and `n` must be passed by keyword there.
name_parts = county_population['US County'].str.split(',.', n=1, expand=True)
# Drop the first character of the county part, as the original cell did.
county_population['county'] = name_parts[0].str[1:]
county_population['state'] = name_parts[1]
# Remove the now-redundant combined column.
county_population = county_population.drop(columns='US County')
county_population.head()

  after removing the cwd from sys.path.


Unnamed: 0,Population,county,state
0,55869,Autauga County,Alabama
1,223234,Baldwin County,Alabama
2,24686,Barbour County,Alabama
3,22394,Bibb County,Alabama
4,57826,Blount County,Alabama


In [13]:
# Load the per-county, per-candidate vote counts.
candidate_path = '../data/task_special/task03/president_county_candidate.csv'
president_county_candidate = pd.read_csv(candidate_path)
president_county_candidate.head()

Unnamed: 0,state,county,candidate,party,total_votes,won
0,Delaware,Kent County,Joe Biden,DEM,44552,True
1,Delaware,Kent County,Donald Trump,REP,41009,False
2,Delaware,Kent County,Jo Jorgensen,LIB,1044,False
3,Delaware,Kent County,Howie Hawkins,GRN,420,False
4,Delaware,New Castle County,Joe Biden,DEM,195034,True


In [14]:
# Total votes cast in each (state, county), kept as a one-column frame.
votes_by_county = president_county_candidate.groupby(['state', 'county'])
president_county_total_votes = votes_by_county[['total_votes']].sum()

In [15]:
# Attach each county's vote total to its population row; counties with no
# vote records keep a missing total_votes.
county = pd.merge(
    county_population,
    president_county_total_votes,
    how='left',
    on=['state', 'county'],
)
county.head()

Unnamed: 0,Population,county,state,total_votes
0,55869,Autauga County,Alabama,27770.0
1,223234,Baldwin County,Alabama,109679.0
2,24686,Barbour County,Alabama,10518.0
3,22394,Bibb County,Alabama,9595.0
4,57826,Blount County,Alabama,27588.0


In [16]:
# Count the counties whose total votes exceed half of their population.
turnout_over_half = county['total_votes'] * 2 > county['Population']
len(county[turnout_over_half])

1434

有1434个县满足总投票数超过县人口数的一半

**第2问：计算候选人在各州的总票数**

In [17]:
# Sum each candidate's votes per state; margins=True adds the 'All' row and
# column holding the totals.
candidate_votes = pd.pivot_table(
    president_county_candidate,
    values='total_votes',
    index='state',
    columns='candidate',
    aggfunc='sum',
    margins=True,
)

In [18]:
# Order the candidate columns by their 'All' total, highest first: transpose
# so candidates become rows, sort on 'All', then transpose back.
votes_by_candidate = candidate_votes.transpose()
votes_by_candidate = votes_by_candidate.sort_values(by=['All'], ascending=False)
candidate_votes = votes_by_candidate.transpose()

In [19]:
# Remove the 'All' margin row and column now that the ordering is fixed.
candidate_votes = candidate_votes.drop(index='All', columns='All')

In [20]:
# State/candidate pairs with no votes are reported as 0 instead of NaN.
candidate_votes = candidate_votes.fillna(0)

In [21]:
# Blank out the row-axis and column-axis names so they do not show in the table.
for axis_labels in (candidate_votes.index, candidate_votes.columns):
    axis_labels.name = ""

In [22]:
# Display the state-by-candidate vote table built in the preceding cells.
candidate_votes

Unnamed: 0,Joe Biden,Donald Trump,Jo Jorgensen,Howie Hawkins,Write-ins,Rocky De La Fuente,Gloria La Riva,Kanye West,Don Blankenship,Brock Pierce,...,Tom Hoefling,Ricki Sue King,Princess Jacob-Fambro,Blake Huber,Richard Duncan,Joseph Kishore,Jordan Scott,Gary Swing,Keith McCormic,Zachary Scalf
,,,,,,,,,,,,,,,,,,,,,
Alabama,849648.0,1441168.0,25176.0,0.0,7312.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alaska,153405.0,189892.0,8896.0,0.0,34210.0,318.0,0.0,0.0,1127.0,825.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Arizona,1672143.0,1661686.0,51465.0,0.0,2032.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Arkansas,423932.0,760647.0,13133.0,2980.0,0.0,1321.0,1336.0,4099.0,2108.0,2141.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
California,11109764.0,6005961.0,187885.0,81025.0,80.0,60155.0,51036.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Colorado,1804352.0,1364607.0,52460.0,8986.0,0.0,636.0,1035.0,8089.0,5061.0,572.0,...,0.0,0.0,495.0,355.0,0.0,196.0,175.0,0.0,0.0,0.0
Connecticut,1080680.0,715291.0,20227.0,7538.0,544.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Delaware,296268.0,200603.0,5000.0,2139.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
District of Columbia,317323.0,18586.0,2036.0,1726.0,3137.0,0.0,855.0,0.0,0.0,693.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**第3问：找出所有的Biden State**