In [1]:
import pandas as pd
import numpy as py

### 【任务一】企业收入的多样性

目标：计算company.csv中每个证券的收入熵指标

实现步骤：
* 1、按年份+证券代码计算收入熵
* 2、两表以证券代码拼接在一起，值得注意的是company.csv中证券代码是 字符串的”#000007“，表company_data.csv中证券代码是int类型 ”7“，拼接前需要处理一下。

In [129]:
df1 = pd.read_csv('../data/company.csv')
df1.head()

Unnamed: 0,证券代码,日期
0,#000007,2014
1,#000403,2015
2,#000408,2016
3,#000408,2017
4,#000426,2015


In [130]:
# 处理数据1

df1['证券代码'] = df1['证券代码'].apply(lambda x:int(x[1:]))
df1.head()

Unnamed: 0,证券代码,日期
0,7,2014
1,403,2015
2,408,2016
3,408,2017
4,426,2015


In [131]:
df2 = pd.read_csv('../data/company_data.csv')
df2.head()

Unnamed: 0,证券代码,日期,收入类型,收入额
0,1,2008/12/31,1,10842180000.0
1,1,2008/12/31,2,12597890000.0
2,1,2008/12/31,3,14513120000.0
3,1,2008/12/31,4,1063843000.0
4,1,2008/12/31,5,851388000.0


In [132]:
# 处理数据1


df2['日期'] = df2['日期'].apply(lambda x:int(x[:4]))
df2 = df2[df2['收入额']>0]
df2.head()

Unnamed: 0,证券代码,日期,收入类型,收入额
0,1,2008,1,10842180000.0
1,1,2008,2,12597890000.0
2,1,2008,3,14513120000.0
3,1,2008,4,1063843000.0
4,1,2008,5,851388000.0


* 计算收入熵

In [133]:
# 按证券代码+日期年份计算收入熵

def calcu_entropy(x):
    x_sum = sum(x['收入额'])
    px = x['收入额']/x_sum
    px_sum = sum(-1*px*np.log(px))
    
    return px_sum

df2_groups = df2.groupby(['证券代码','日期'])['收入类型','收入额']


df2_entropy= df2_groups.apply(calcu_entropy)
df2_entropy.head()

  # Remove the CWD from sys.path while we load stuff.


证券代码  日期  
1     2008    2.068035
      2009    1.671752
      2010    2.108355
      2011    3.137535
      2012    2.698959
dtype: float64

In [134]:
df2_entropy = df2_entropy.reset_index()
df2_entropy.head()

Unnamed: 0,证券代码,日期,0
0,1,2008,2.068035
1,1,2009,1.671752
2,1,2010,2.108355
3,1,2011,3.137535
4,1,2012,2.698959


In [135]:
df2_entropy.columns = ['证券代码', '日期', "收入熵"]
df2_entropy.head()

Unnamed: 0,证券代码,日期,收入熵
0,1,2008,2.068035
1,1,2009,1.671752
2,1,2010,2.108355
3,1,2011,3.137535
4,1,2012,2.698959


In [136]:
df1 = df1.merge(df2_entropy,
        on=['证券代码','日期'],
        how='inner')

Unnamed: 0,证券代码,日期,收入熵
0,7,2014,3.070462
1,403,2015,2.790585
2,408,2016,2.818541
3,426,2015,3.084266
4,426,2016,2.988900
...,...,...,...
950,600408,2013,3.813744
951,600978,2011,3.319059
952,600978,2014,2.788100
953,600978,2015,3.012628


## 任务2 组队学习信息表的变换

* 请把组队学习的队伍信息表变换为如下形态，其中“是否队长”一列取1表示队长，否则为0

In [2]:
# 读取数据

df = pd.read_excel('../data/组队信息汇总表（Pandas）.xlsx')
df.head()

Unnamed: 0,所在群,队伍名称,队长编号,队长_群昵称,队员1 编号,队员_群昵称,队员2 编号,队员_群昵称.1,队员3 编号,队员_群昵称.2,...,队员6 编号,队员_群昵称.5,队员7 编号,队员_群昵称.6,队员8 编号,队员_群昵称.7,队员9 编号,队员_群昵称.8,队员10编号,队员_群昵称.9
0,Pandas数据分析,你说的都对队,5,山枫叶纷飞,6,蔡,7.0,安慕希,8.0,信仰,...,,,,,,,,,,
1,Pandas数据分析,熊猫人,175,鱼呲呲,44,Heaven,37.0,吕青,50.0,余柳成荫,...,25.0,Never say never,55.0,K,120.0,Y.,28.0,X.Y.Q,151.0,swrong
2,Pandas数据分析,中国移不动,107,Y's,124,🥕,75.0,Vito,146.0,张小五,...,,,,,,,,,,
3,Pandas数据分析,panda,11,太下真君,35,柚子,108.0,My,42.0,星星点灯,...,157.0,Zys,158.0,不器,102.0,嘉平佑染,,,,
4,Pandas数据分析,一路向北,13,黄元帅,15,化,16.0,未期,18.0,太陽光下,...,23.0,🚀,169.0,听风,189.0,Cappuccino,,,,


In [34]:
# 处理数据

df.rename(lambda x :  '队员_群昵称.0' if x=='队员_群昵称' else x,
               axis=1,
          inplace=True
              )
df.head()

Unnamed: 0,所在群,队伍名称,队长编号,队长_群昵称,队员1 编号,队员_群昵称.0,队员2 编号,队员_群昵称.1,队员3 编号,队员_群昵称.2,...,队员6 编号,队员_群昵称.5,队员7 编号,队员_群昵称.6,队员8 编号,队员_群昵称.7,队员9 编号,队员_群昵称.8,队员10编号,队员_群昵称.9
0,Pandas数据分析,你说的都对队,5,山枫叶纷飞,6,蔡,7.0,安慕希,8.0,信仰,...,,,,,,,,,,
1,Pandas数据分析,熊猫人,175,鱼呲呲,44,Heaven,37.0,吕青,50.0,余柳成荫,...,25.0,Never say never,55.0,K,120.0,Y.,28.0,X.Y.Q,151.0,swrong
2,Pandas数据分析,中国移不动,107,Y's,124,🥕,75.0,Vito,146.0,张小五,...,,,,,,,,,,
3,Pandas数据分析,panda,11,太下真君,35,柚子,108.0,My,42.0,星星点灯,...,157.0,Zys,158.0,不器,102.0,嘉平佑染,,,,
4,Pandas数据分析,一路向北,13,黄元帅,15,化,16.0,未期,18.0,太陽光下,...,23.0,🚀,169.0,听风,189.0,Cappuccino,,,,


**解题思路**

* 分批转化得到组队信息
* 拼接数据
* 处理数据空值

In [38]:
# 取队长数据
df_cap = df.melt(id_vars=['队伍名称','队长编号'],
                 value_vars=['队长_群昵称'],
                 value_name='昵称',
                 var_name = '是否队长'
                )

df_cap['是否队长']=1
df_cap.rename({'队长编号':'编号'},
             axis = 1,
             inplace = True)
df_cap.head()

Unnamed: 0,队伍名称,编号,是否队长,昵称
0,你说的都对队,5,1,山枫叶纷飞
1,熊猫人,175,1,鱼呲呲
2,中国移不动,107,1,Y's
3,panda,11,1,太下真君
4,一路向北,13,1,黄元帅


In [57]:
# 取队员数据 并合并
df_result = df_cap.copy()
team = '队员%d 编号'
team_name = '队员_群昵称.%d'

for idx in range(9):
    
    df_c = df.melt(id_vars=['队伍名称',team%(idx+1)],
                 value_vars=[team_name%(idx)],
                 value_name='昵称',
                 var_name = '是否队长'
                )

    df_c['是否队长']=0
    df_c.rename({team%(idx+1):'编号'},
               axis =1,
               inplace = True)

    df_result = pd.concat([df_result,df_c])

# 
df_result

Unnamed: 0,队伍名称,编号,是否队长,昵称
0,你说的都对队,5.0,1,山枫叶纷飞
1,熊猫人,175.0,1,鱼呲呲
2,中国移不动,107.0,1,Y's
3,panda,11.0,1,太下真君
4,一路向北,13.0,1,黄元帅
...,...,...,...,...
16,pandas从入门到精通,,0,
17,Attention！keep干饭,,0,
18,Null,,0,
19,七星联盟,,0,


In [59]:
## 处理最终的数据

# 
df_result = df_result.dropna()
df_result['编号'] = df_result['编号'].astype(np.int)

# 
df_result.sort_values(by='队伍名称',
                     inplace =True)


df_result

Unnamed: 0,队伍名称,编号,是否队长,昵称
17,Attention！keep干饭,104,0,梦想家
17,Attention！keep干饭,152,0,Alex
17,Attention！keep干饭,21,1,阿芒Aris
17,Attention！keep干饭,95,0,Jie
10,Kung Fu Pandas,167,0,swordsman
...,...,...,...,...
5,西部战车,51,0,Robin or Michael
5,西部战车,160,0,哦豁
13,鲲鲲玩Python,187,0,冻草莓
13,鲲鲲玩Python,26,1,木南居士


## 任务三 美国大选投票情况



In [62]:
df1 = pd.read_csv('../data/president_county_candidate.csv')

df1.head()


Unnamed: 0,state,county,candidate,party,total_votes,won
0,Delaware,Kent County,Joe Biden,DEM,44552,True
1,Delaware,Kent County,Donald Trump,REP,41009,False
2,Delaware,Kent County,Jo Jorgensen,LIB,1044,False
3,Delaware,Kent County,Howie Hawkins,GRN,420,False
4,Delaware,New Castle County,Joe Biden,DEM,195034,True


In [127]:
df2 = pd.read_csv('../data/county_population.csv')
df2.head()

Unnamed: 0,US County,Population
0,".Autauga County, Alabama",55869
1,".Baldwin County, Alabama",223234
2,".Barbour County, Alabama",24686
3,".Bibb County, Alabama",22394
4,".Blount County, Alabama",57826


* **【第一问】** 有多少县满足总投票数超过县人口数的一半

 解题思路：

    * 将county_population.csv 中“US County”写为 ”state“和”county“
    * 将president_county_candidate.csv 中按照 ”state“和”county“ 求总投票人数
    * 两个表合并，返回满足投票数超过县人口数一半的县

In [128]:
# 

def split_state(x):
    state_county = x.split(',')
    return state_county[1].lstrip()


def split_county(x):
    state_county = x.split(',')
    return state_county[0][1:]  ## 这里如果同时返回两个参数要怎么处理？

df2['state'] = df2['US County'].apply(split_state)
df2['county'] = df2['US County'].apply(split_county)


df2 = df2.loc[:,['state','county','Population']]
df2.head()

Unnamed: 0,state,county,Population
0,Alabama,Autauga County,55869
1,Alabama,Baldwin County,223234
2,Alabama,Barbour County,24686
3,Alabama,Bibb County,22394
4,Alabama,Blount County,57826


In [115]:
# 

df1_sum = df1.groupby(by=['state','county'])['total_votes'].sum()
df1_sum = df1_sum.reset_index()
df1_sum.head()

Unnamed: 0,state,county,total_votes
0,Alabama,Autauga County,27770
1,Alabama,Baldwin County,109679
2,Alabama,Barbour County,10518
3,Alabama,Bibb County,9595
4,Alabama,Blount County,27588


In [134]:
# 

df_result1 = df1_sum.merge(df2,
              on=['state','county'],
              how='inner')

df_result1 = df_result1[df_result1['total_votes']>0.5*df_result1['Population']]

df_result1.head()

Unnamed: 0,state,county,total_votes,Population
11,Alabama,Choctaw County,7464,12589
12,Alabama,Clarke County,13135,23622
13,Alabama,Clay County,6930,13235
16,Alabama,Colbert County,27886,55241
17,Alabama,Conecuh County,6441,12067


* **【第二问】** 把州（state）作为行索引，把投票候选人作为列名，列名的顺序按照候选人在全美的总票数由高到低排序，行列对应的元素为该候选人在该州获得的总票数

 解题思路：

    * 按 ”州“ + ”投票候选人“ 统计总票数
    * 将州作为行索引，候选人作为列名，得票数作为值，重构表结构
    * 排序

In [135]:
#
df1 = pd.read_csv('../data/president_county_candidate.csv')
df1.head()

Unnamed: 0,state,county,candidate,party,total_votes,won
0,Delaware,Kent County,Joe Biden,DEM,44552,True
1,Delaware,Kent County,Donald Trump,REP,41009,False
2,Delaware,Kent County,Jo Jorgensen,LIB,1044,False
3,Delaware,Kent County,Howie Hawkins,GRN,420,False
4,Delaware,New Castle County,Joe Biden,DEM,195034,True


In [186]:
# 
df1_votes = df1.groupby(by = ['state','candidate'])['total_votes'].sum()

df1_votes= df1_votes.reset_index()
df1_votes.head()



Unnamed: 0,state,candidate,total_votes
0,Alabama,Write-ins,7312
1,Alabama,Donald Trump,1441168
2,Alabama,Jo Jorgensen,25176
3,Alabama,Joe Biden,849648
4,Alaska,Write-ins,34210


In [187]:
# 

df1_votes = df1_votes.pivot_table(index = 'state',
                            columns='candidate',
                            values='total_votes',
                            aggfunc = sum,
                            margins=True)



In [188]:
df1_votes.sort_values(by='All',
                      axis =1 ,
                      ascending=False,
                      inplace=True)
df1_votes = df1_votes.drop(columns='All')

In [191]:
df1_votes = df1_votes.fillna(0)
df1_votes

candidate,Joe Biden,Donald Trump,Jo Jorgensen,Howie Hawkins,Write-ins,Rocky De La Fuente,Gloria La Riva,Kanye West,Don Blankenship,Brock Pierce,...,Tom Hoefling,Ricki Sue King,Princess Jacob-Fambro,Blake Huber,Richard Duncan,Joseph Kishore,Jordan Scott,Gary Swing,Keith McCormic,Zachary Scalf
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,849648.0,1441168.0,25176.0,0.0,7312.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alaska,153405.0,189892.0,8896.0,0.0,34210.0,318.0,0.0,0.0,1127.0,825.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Arizona,1672143.0,1661686.0,51465.0,0.0,2032.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Arkansas,423932.0,760647.0,13133.0,2980.0,0.0,1321.0,1336.0,4099.0,2108.0,2141.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
California,11109764.0,6005961.0,187885.0,81025.0,80.0,60155.0,51036.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Colorado,1804352.0,1364607.0,52460.0,8986.0,0.0,636.0,1035.0,8089.0,5061.0,572.0,...,0.0,0.0,495.0,355.0,0.0,196.0,175.0,0.0,0.0,0.0
Connecticut,1080680.0,715291.0,20227.0,7538.0,544.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Delaware,296268.0,200603.0,5000.0,2139.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
District of Columbia,317323.0,18586.0,2036.0,1726.0,3137.0,0.0,855.0,0.0,0.0,693.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Florida,5297045.0,5668731.0,70324.0,14721.0,1055.0,5966.0,5712.0,0.0,3902.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* **【第三问】** 每一个州下设若干县，定义拜登在该县的得票率减去川普在该县的得票率为该县的BT指标，若某个州所有县BT指标的中位数大于0，则称该州为Biden State，请找出所有的Biden State
 解题思路：

    * 计算所有县的BT指标
    * 计算州的中位数，大于0为Biden State

In [257]:
#
df1 = pd.read_csv('../data/president_county_candidate.csv')
df1.head()

Unnamed: 0,state,county,candidate,party,total_votes,won
0,Delaware,Kent County,Joe Biden,DEM,44552,True
1,Delaware,Kent County,Donald Trump,REP,41009,False
2,Delaware,Kent County,Jo Jorgensen,LIB,1044,False
3,Delaware,Kent County,Howie Hawkins,GRN,420,False
4,Delaware,New Castle County,Joe Biden,DEM,195034,True


In [258]:
df1_biden = df1.loc[df1['candidate'].isin(['Joe Biden','Donald Trump'])]

df1_biden = df1_biden.pivot_table(index = ['state','county'],
                                columns = 'candidate',
                                  values='total_votes',
                                  aggfunc =sum,
                                  margins=True)

df1_biden = df1_biden.drop(index = 'All')
df1_biden = df1_biden.reset_index()

df1_biden.head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


candidate,state,county,Donald Trump,Joe Biden,All
0,Alabama,Autauga County,19838,7503,27341
1,Alabama,Baldwin County,83544,24578,108122
2,Alabama,Barbour County,5622,4816,10438
3,Alabama,Bibb County,7525,1986,9511
4,Alabama,Blount County,24711,2640,27351


In [259]:
# 

df1_biden['Donald Trump'] = df1_biden['Donald Trump']/df1_biden['All']
df1_biden['Joe Biden'] = df1_biden['Joe Biden']/df1_biden['All']

df1_biden['All'] = df1_biden['Joe Biden'] - df1_biden['Donald Trump'] 

df1_biden.columns = ['state','county','Donald Trump','Joe Biden','BT']

df1_biden.head()

Unnamed: 0,state,county,Donald Trump,Joe Biden,BT
0,Alabama,Autauga County,0.725577,0.274423,-0.451154
1,Alabama,Baldwin County,0.772683,0.227317,-0.545365
2,Alabama,Barbour County,0.538609,0.461391,-0.077218
3,Alabama,Bibb County,0.791189,0.208811,-0.582378
4,Alabama,Blount County,0.903477,0.096523,-0.806954


In [260]:
df1_biden = df1_biden.groupby(by='state')['BT'].median()
df1_biden = df1_biden.reset_index()

df1_biden.loc[df1_biden['BT']>0,]

Unnamed: 0,state,BT
4,California,0.086698
6,Connecticut,0.084159
7,Delaware,0.041409
8,District of Columbia,0.914602
11,Hawaii,0.328289
21,Massachusetts,0.257463
30,New Jersey,0.067778
39,Rhode Island,0.139221
45,Vermont,0.250821
