# numpy(advanced)
* 行列計算に特化したモジュール
* pythonの標準ライブラリではないのでimportが必要

## numpy array

### 基本

In [37]:
import numpy as np

In [38]:
a = np.array(range(0, 6))
a

array([0, 1, 2, 3, 4, 5])

In [39]:
type(a)

numpy.ndarray

In [40]:
a.shape

(6,)

**.reshape()で好きな形に変形できる**

In [41]:
# 3行×2列に変換
b = a.reshape(3, 2)
b

array([[0, 1],
       [2, 3],
       [4, 5]])

**.flatten()で1行に変換できる**

In [42]:
b.flatten()

array([0, 1, 2, 3, 4, 5])

### 行列計算

In [43]:
# Two 3x3 matrices of random integers drawn from [0, 10).
# np.random.randint already returns an ndarray, so wrapping it in np.array()
# would only make a redundant copy.
a = np.random.randint(0, 10, (3, 3))
b = np.random.randint(0, 10, (3, 3))

In [44]:
a

array([[2, 0, 5],
       [9, 6, 4],
       [7, 3, 7]])

In [45]:
b

array([[7, 4, 6],
       [8, 8, 0],
       [0, 8, 3]])

In [46]:
a + b

array([[ 9,  4, 11],
       [17, 14,  4],
       [ 7, 11, 10]])

In [47]:
a * b

array([[14,  0, 30],
       [72, 48,  0],
       [ 0, 24, 21]])

In [48]:
# N-dimensional arrays can also be created, though they probably come up rarely in practice.
# NOTE(review): the shape (3, 3, 3, 3) has four axes, so this array is 4-D
# despite the variable name `ndarray_3D`.
ndarray_3D = np.zeros((3, 3, 3, 3))
ndarray_3D

array([[[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]],


       [[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]],


       [[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]]])

### 要素の抽出

**要素の番号で指定**

In [52]:
a = a.flatten()
a

array([2, 0, 5, 9, 6, 4, 7, 3, 7])

In [53]:
a[2]

5

**[行番号, 列番号]で指定**

In [71]:
c = np.arange(0, 48, 1).reshape(6, 8)
c

array([[ 0,  1,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29, 30, 31],
       [32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47]])

In [31]:
# 2行3列目
c[2, 3]

19

In [27]:
# 2行目(2個目の要素とも言える)
c[2]

array([16, 17, 18, 19, 20, 21, 22, 23])

In [29]:
# 2行3列目(2個目の要素の中の３個目の要素とも言える)
c[2][3]

19

**スライスで指定**

In [54]:
# 1次元の場合
a

array([2, 0, 5, 9, 6, 4, 7, 3, 7])

In [56]:
# 3から6個目まで取り出す(※6は含まれない)
a[3:6]

array([9, 6, 4])

In [65]:
# 3から全部
a[3:]

array([9, 6, 4, 7, 3, 7])

In [66]:
# 3まで全部
a[:3]

array([2, 0, 5])

In [67]:
# 全部
a[:]

array([2, 0, 5, 9, 6, 4, 7, 3, 7])

In [70]:
# 最後尾
a[-1]

7

In [78]:
# 2次元の場合
d = np.linspace(1, 100, 50).reshape(10, 5)
d

array([[  1.        ,   3.02040816,   5.04081633,   7.06122449,
          9.08163265],
       [ 11.10204082,  13.12244898,  15.14285714,  17.16326531,
         19.18367347],
       [ 21.20408163,  23.2244898 ,  25.24489796,  27.26530612,
         29.28571429],
       [ 31.30612245,  33.32653061,  35.34693878,  37.36734694,
         39.3877551 ],
       [ 41.40816327,  43.42857143,  45.44897959,  47.46938776,
         49.48979592],
       [ 51.51020408,  53.53061224,  55.55102041,  57.57142857,
         59.59183673],
       [ 61.6122449 ,  63.63265306,  65.65306122,  67.67346939,
         69.69387755],
       [ 71.71428571,  73.73469388,  75.75510204,  77.7755102 ,
         79.79591837],
       [ 81.81632653,  83.83673469,  85.85714286,  87.87755102,
         89.89795918],
       [ 91.91836735,  93.93877551,  95.95918367,  97.97959184,
        100.        ]])

In [79]:
d[2:4]

array([[21.20408163, 23.2244898 , 25.24489796, 27.26530612, 29.28571429],
       [31.30612245, 33.32653061, 35.34693878, 37.36734694, 39.3877551 ]])

In [80]:
# 注意
d[2:4][0:3]

array([[21.20408163, 23.2244898 , 25.24489796, 27.26530612, 29.28571429],
       [31.30612245, 33.32653061, 35.34693878, 37.36734694, 39.3877551 ]])

In [81]:
d[2:4, 0:3]

array([[21.20408163, 23.2244898 , 25.24489796],
       [31.30612245, 33.32653061, 35.34693878]])

### 参照渡しと値渡し

* listやnumpyを関数に渡して処理させると元のオブジェクトにも変更が加えられてしまうことがありバグの原因になりうる
* 頭の片隅に入れておくとよい
* listやnumpyを関数に食わせるときはcopy()しておくのが安全

https://datawokagaku.com/numpy_generate/

In [109]:
def change_hundred(array):
    """Set the first element of ``array`` to 100 and return the same object.

    The argument is modified in place, so the caller's object changes too.
    This function exists to demonstrate that pitfall.
    """
    array[0] = 100
    return array

def change_hundred_copy(array):
    """Return a copy of ``array`` whose first element is replaced by 100.

    The work is done on ``array.copy()``, so the caller's object is left
    untouched.
    """
    result = array.copy()
    result[0] = 100
    return result

In [110]:
# array1とarray2は同じ値
array_1 = np.arange(0, 4)
array_2 = np.arange(0, 4)

In [111]:
# 関数に入れると
output_array = change_hundred(array_1)
output_array_copy = change_hundred_copy(array_2)

In [112]:
# 別の値になってしまった
print(array_1)
print(array_2)

[100   1   2   3]
[0 1 2 3]


### 便利関数まとめ

|                                          |                                                                          | 
| ---------------------------------------- | ------------------------------------------------------------------------ | 
| .flatten()                               | 一列にする                                                               | 
| np.arange([start,] stop[, step])         | start以上stop未満の値をstepずつ増加させた値のarrayを生成                 | 
| np.linspace(start, stop, num=50)         | start以上stop以下の範囲を等間隔に区切ったnum個の値のarrayを生成(stopを含む) | 
| .copy()                                  | NumPy arrayをコピー                                                      | 
| np.zeros(shape)                          | 要素が全て０のndarrayを生成                                              | 
| np.ones(shape)                           | 要素が全て１のndarrayを生成                                              | 
| np.eye(N)                                | N x Nの単位行列を生成                                                    | 
| np.random.rand()                         | 0 ~ 1からランダムな数字で行列を生成                                      | 
| np.random.randn()                        | 標準正規分布から値をとって行列を生成                                     | 
| np.random.randint(low[, high][, size])   | low以上high未満のintegerからランダムに，指定したsizeのndarrayを生成      | 
| .reshape(shape)                          | ndarrayのshapeを任意のshapeに変換                                        | 
|                                          |                                                                          | 
|                                          |                                                                          | 

# Pandas
* 表計算用のライブラリ
* 内部ではnumpyを使っているのでnumpyと扱い方は結構似てる部分もあるが、さらに便利な機能がたくさん搭載されている
* importが必要

In [114]:
import pandas as pd

## データのインポート

`pd.read_csv()` : csvファイルの読み込み

| パラメータ      | 値  | 説明                   | 
| --------------- | --- | ---------------------- | 
| file_path(必須) | str | 読み込むファイルのパス | 
| encoding        | str | エンコード             | 
| skiprows        | int | 先頭から読み飛ばす行数 | 

`pd.read_excel()` : excelファイルの読み込み

| パラメータ      | 値  | 説明                   | 
| --------------- | --- | ---------------------- | 
| file_path(必須) | str | 読み込むファイルのパス | 
| encoding        | str | エンコード             |
| skiprows        | int | 先頭から読み飛ばす行数 | 
|sheet_name       | str | シート指定             |

In [125]:
# American Community Survey (ACS): data published by the U.S. Census Bureau
df = pd.read_csv('./US_ACS_2017_10pct_sample.csv', encoding='sjis')

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,year,datanum,serial,cbserial,numprec,subsamp,hhwt,hhtype,cluster,adjust,...,migcounty1,migmet131,vetdisab,diffrem,diffphys,diffmob,diffcare,diffsens,diffeye,diffhear
0,2017,1,177686,2.017001e+12,9,64,55,"female householder, no husband present",2.017002e+12,1.011189,...,0,not in identifiable area,,,,,,no vision or hearing difficulty,no,no
1,2017,1,1200045,2.017001e+12,6,79,25,"male householder, no wife present",2.017012e+12,1.011189,...,0,not in identifiable area,,no cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
2,2017,1,70831,2.017000e+12,1 person record,36,57,"male householder, living alone",2.017001e+12,1.011189,...,0,not in identifiable area,,has cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
3,2017,1,557128,2.017001e+12,2,10,98,married-couple family household,2.017006e+12,1.011189,...,0,not in identifiable area,,no cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
4,2017,1,614890,2.017001e+12,4,96,54,married-couple family household,2.017006e+12,1.011189,...,0,not in identifiable area,,,,,,no vision or hearing difficulty,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318999,2017,1,734396,2.017001e+12,4,78,100,married-couple family household,2.017007e+12,1.011189,...,0,not in identifiable area,,no cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
319000,2017,1,586263,2.017001e+12,4,57,77,married-couple family household,2.017006e+12,1.011189,...,0,not in identifiable area,,,,,,no vision or hearing difficulty,no,no
319001,2017,1,510444,2.017001e+12,2,43,152,"female householder, no husband present",2.017005e+12,1.011189,...,0,not in identifiable area,,has cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
319002,2017,1,1220474,2.017001e+12,4,16,148,married-couple family household,2.017012e+12,1.011189,...,0,not in identifiable area,,no cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no


## データ分析
**ACSデータを使って賃金と学歴/人種の関係を分析**

### utils
**df操作** <br>
`df.columns` : カラムのリスト <br>
`df.index ` : インデックスのリスト <br>
`df.head()` : dfの先頭n行を抽出 <br>
`df.tail()` : dfの最後尾n行を抽出 <br>
`df.sample()` : ランダムにn行抽出 <br>
`df['column_name']` : column_name列抽出 <br>
`df.loc[:, 'column_name']` : column_name列抽出(`df.loc['label']`はindexラベルによる行の抽出になるので注意) <br>
`df.loc[mask]` : mask条件文がTrueなものを抽出 <br>
`df.iloc[index_num, column_num]` : 行列番号で抽出 <br>
`df.rename(columns={'previous_name':'new_name'})` : カラム名の変更 <br>
`df.set_index('column_name')` : column_name列をindexにする(時刻など) <br>
`df.reset_index()` : indexを0から振りなおす <br> <br>
**統計量** <br>
`df.describe()` : 統計量まとめ <br>
`series.value_counts()` : 同じ値をカウント <br>
`df.groupby('column_name')` : column_name列の値でグルーピング <br> <br>
**欠損値** <br>
`df.isna()` : 各要素がNaNかどうかを判定(True/Falseのmaskを返す) <br>
`df.dropna()` : NaNをドロップ <br>
`df.fillna()` : NaNに値を代入 <br>
`df.replace()` : ある値を別の値に置き換える(np.nan) <br> <br>
**その他** <br>
`pd.concat([df1, df2])` : df1とdf2の結合 <br>
`df1.merge(df2)` : df1とdf2の結合　キーや結合方法を指定可能

### 1. DataFrameの全体像確認

In [138]:
df.head()

Unnamed: 0,year,datanum,serial,cbserial,numprec,subsamp,hhwt,hhtype,cluster,adjust,...,migcounty1,migmet131,vetdisab,diffrem,diffphys,diffmob,diffcare,diffsens,diffeye,diffhear
0,2017,1,177686,2017001000000.0,9,64,55,"female householder, no husband present",2017002000000.0,1.011189,...,0,not in identifiable area,,,,,,no vision or hearing difficulty,no,no
1,2017,1,1200045,2017001000000.0,6,79,25,"male householder, no wife present",2017012000000.0,1.011189,...,0,not in identifiable area,,no cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
2,2017,1,70831,2017000000000.0,1 person record,36,57,"male householder, living alone",2017001000000.0,1.011189,...,0,not in identifiable area,,has cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
3,2017,1,557128,2017001000000.0,2,10,98,married-couple family household,2017006000000.0,1.011189,...,0,not in identifiable area,,no cognitive difficulty,no ambulatory difficulty,no independent living difficulty,no,no vision or hearing difficulty,no,no
4,2017,1,614890,2017001000000.0,4,96,54,married-couple family household,2017006000000.0,1.011189,...,0,not in identifiable area,,,,,,no vision or hearing difficulty,no,no


### 2. columnsの確認

In [126]:
# データが多い場合は全部は表示してくれない
df.columns

Index(['year', 'datanum', 'serial', 'cbserial', 'numprec', 'subsamp', 'hhwt',
       'hhtype', 'cluster', 'adjust',
       ...
       'migcounty1', 'migmet131', 'vetdisab', 'diffrem', 'diffphys', 'diffmob',
       'diffcare', 'diffsens', 'diffeye', 'diffhear'],
      dtype='object', length=104)

In [127]:
# for文でprint()すれば全部のカラムを確認できる
for c in df.columns:
    print(c)

year
datanum
serial
cbserial
numprec
subsamp
hhwt
hhtype
cluster
adjust
cpi99
region
stateicp
statefip
countyicp
countyfip
metro
city
citypop
strata
gq
farm
ownershp
ownershpd
mortgage
mortgag2
mortamt1
mortamt2
respmode
pernum
cbpernum
perwt
slwt
famunit
sex
age
marst
birthyr
race
raced
hispan
hispand
bpl
bpld
citizen
yrnatur
yrimmig
language
languaged
speakeng
hcovany
hcovpriv
hinsemp
hinspur
hinstri
hcovpub
hinscaid
hinscare
hinsva
hinsihs
school
educ
educd
gradeatt
gradeattd
schltype
degfield
degfieldd
degfield2
degfield2d
empstat
empstatd
labforce
occ
ind
classwkr
classwkrd
looking
availble
inctot
ftotinc
incwage
incbus00
incss
incwelfr
incinvst
incretir
incsupp
incother
incearn
poverty
migrate1
migrate1d
migplac1
migcounty1
migmet131
vetdisab
diffrem
diffphys
diffmob
diffcare
diffsens
diffeye
diffhear


### 3. データ整形
**平均所得(`columns = 'inctot'`)**

In [129]:
df['inctot'].describe()

count    3.190040e+05
mean     1.723646e+06
std      3.732326e+06
min     -9.000000e+03
25%      1.050000e+04
50%      3.370000e+04
75%      9.140000e+04
max      9.999999e+06
Name: inctot, dtype: float64

In [131]:
# それぞれの統計量を個別に抽出することも可
# 以下のような書き方もあるが非推奨(紹介のみ)
df.inctot.mean()

1723646.2703978634

In [132]:
df['inctot'].describe()['mean']

1723646.2703978634

**平均所得が高すぎる→max値がどうも怪しい**

In [133]:
df['inctot'].value_counts()

9999999    53901
0          33679
30000       4778
50000       4414
40000       4413
           ...  
70520          1
76680          1
57760          1
200310         1
505400         1
Name: inctot, Length: 8471, dtype: int64

**9999999が多すぎる** <br>
**→ このデータでは欠損値を9999999で表しているらしい** <br>
**→ ドメイン知識によって怪しいデータを排除**

In [134]:
df['inctot'] = df['inctot'].replace(9999999, np.nan)

In [135]:
df['inctot'].value_counts()

0.0         33679
30000.0      4778
50000.0      4414
40000.0      4413
20000.0      4067
            ...  
246600.0        1
90810.0         1
341380.0        1
15790.0         1
505400.0        1
Name: inctot, Length: 8470, dtype: int64

In [136]:
df['inctot'].mean()

40890.177564946454

**それではいつ9999999が使われていたのかを特定する**

In [142]:
# dfの中でinctotがNaNの行を抽出しさらにそのなかのage列を抽出しその列に対してvalue_counts
# なれない場合は分解して実行するとよい
df.loc[df['inctot'].isnull()]['age'].value_counts()

10                      3997
9                       3977
14                      3847
12                      3845
13                      3800
11                      3791
8                       3648
7                       3527
6                       3524
5                       3512
2                       3405
1                       3340
4                       3318
3                       3220
less than 1 year old    3150
Name: age, dtype: int64

In [144]:
# ~でmaskのTrue/Falseを反転できる
df.loc[~df['inctot'].isnull()]['age'].value_counts()

60    4950
54    4821
59    4776
56    4776
58    4734
      ... 
93     476
95     471
92     355
91     227
96      10
Name: age, Length: 82, dtype: int64

**どうやら14歳以下のinctotを9999999にしていたぽい**

***さらに収入0もかなりの数あるため現在働いている人を対象として抽出***

In [145]:
df['empstat'].value_counts()

employed              148758
not in labor force    104676
unemployed              7727
Name: empstat, dtype: int64

In [146]:
df.loc[df['empstat']=='employed']['inctot'].mean()

57854.723914007984

### 4. 分析

**収入と人種(`columns='race'`)**

In [147]:
df['race'].value_counts()

white                               243751
black/african american/negro         31691
other asian or pacific islander      12508
other race, nec                      12304
two major races                       8826
chinese                               4313
american indian or alaska native      3595
three or more major races             1207
japanese                               809
Name: race, dtype: int64

***ここでは白人と日本人で比較***

In [149]:
# 複数条件のmaskは()で囲み, &(and), |(or)で記述
white = df.loc[(df['empstat']=='employed') & (df['race']=='white')]['inctot'].mean()
white

60473.15372747098

In [150]:
Japanese = df.loc[(df['empstat']=='employed') & (df['race']=='japanese')]['inctot'].mean()
Japanese

78906.74418604652

***groupbyで一度に算出***

In [152]:
# non_employed含む
df.groupby('race')['inctot'].mean()

race
american indian or alaska native    23992.854989
black/african american/negro        27427.134738
chinese                             47246.115591
japanese                            54477.486842
other asian or pacific islander     45444.672559
other race, nec                     23703.220545
three or more major races           34824.905660
two major races                     32277.964461
white                               43488.430019
Name: inctot, dtype: float64

In [153]:
# employedのみ
df.loc[df['empstat']=='employed'].groupby('race')['inctot'].mean()

race
american indian or alaska native    37996.522481
black/african american/negro        41747.949905
chinese                             72804.918567
japanese                            78906.744186
other asian or pacific islander     66647.736613
other race, nec                     34989.400521
three or more major races           49787.183099
two major races                     49021.151515
white                               60473.153727
Name: inctot, dtype: float64

* 演習 <br>
白人の中にはヒスパニック系アメリカ人も含まれるため、一般に想定される白人アメリカ人の給与が過小評価されてしまっている<br>
`hispan`列を用いて白人アメリカ人の平均給与を算出せよ

***groupbyの結果を用いて新たなDataFrameを生成***

In [156]:
df1 = df.groupby('race')['inctot'].mean()
df2 = df.loc[df['empstat']=='employed'].groupby('race')['inctot'].mean()

In [158]:
# axisで行方向か列方向か指定
df_concat = pd.concat([df1, df2], axis=1)
df_concat

Unnamed: 0_level_0,inctot,inctot
race,Unnamed: 1_level_1,Unnamed: 2_level_1
american indian or alaska native,23992.854989,37996.522481
black/african american/negro,27427.134738,41747.949905
chinese,47246.115591,72804.918567
japanese,54477.486842,78906.744186
other asian or pacific islander,45444.672559,66647.736613
"other race, nec",23703.220545,34989.400521
three or more major races,34824.90566,49787.183099
two major races,32277.964461,49021.151515
white,43488.430019,60473.153727


In [161]:
# columnの変更(rename()でもよいが今回はどちらも同じ名前だったので使えなさそう)
df_concat.columns = ['all', 'only_employed']
df_concat

Unnamed: 0_level_0,all,only_employed
race,Unnamed: 1_level_1,Unnamed: 2_level_1
american indian or alaska native,23992.854989,37996.522481
black/african american/negro,27427.134738,41747.949905
chinese,47246.115591,72804.918567
japanese,54477.486842,78906.744186
other asian or pacific islander,45444.672559,66647.736613
"other race, nec",23703.220545,34989.400521
three or more major races,34824.90566,49787.183099
two major races,32277.964461,49021.151515
white,43488.430019,60473.153727


### 5. その他

**pd.merge()の使い方**

In [198]:
# ダミーデータ生成
# Build a dummy lookup table: one zero-padded 6-digit id per race label.
import random  # NOTE(review): not used in this cell; kept in case other cells rely on it

# Race labels from the summary table plus three extra labels
# ('indian', 'korean', 'arab') appended on top of them.
races = list(df_concat.index) + ['indian', 'korean', 'arab']

# The ids are built without a variable named `id`, which would shadow the
# builtin for the rest of the session.
ID = pd.DataFrame({
    'id': [str(i).zfill(6) for i in range(len(races))],
    'race': races,
})

ID

Unnamed: 0,id,race
0,0,american indian or alaska native
1,1,black/african american/negro
2,2,chinese
3,3,japanese
4,4,other asian or pacific islander
5,5,"other race, nec"
6,6,three or more major races
7,7,two major races
8,8,white
9,9,indian


**ACSのデータに対して、人種ごとにidを振り分けたいとする <br>
具体的には、ACSとIDデータを見比べて同じraceならばACSのrace_id列にidを入力するという作業 <br>
→raceをキーにしてマージすればよい**

`pd.merge()` : データフレームの結合

| パラメータ      | 値  | 説明                   | 
| --------------- | --- | ---------------------- | 
| df1(必須)       | DataFrame | 結合したいDataFrame | 
| df2(必須)       | DataFrame | 結合したいDataFrame | 
| on              | str       | key             | 
| how             | str: 'inner'/'outer'/'left'/'right' | 結合方法(デフォルトは'inner') |
| indicator       | bool | マージ情報列(`_merge`)を追加するかどうか |

In [201]:
# ACSデータは列が多すぎてわかりにくいので削減したものを使う
df = df[['year', 'serial', 'inctot', 'age', 'race', 'empstat']]
df

Unnamed: 0,year,serial,inctot,age,race,empstat
0,2017,177686,,4,white,
1,2017,1200045,6000.0,17,white,employed
2,2017,70831,6150.0,63,white,employed
3,2017,557128,14000.0,66,white,not in labor force
4,2017,614890,,1,white,
...,...,...,...,...,...,...
318999,2017,734396,22130.0,33,white,employed
319000,2017,586263,,4,white,
319001,2017,510444,5000.0,20,two major races,employed
319002,2017,1220474,240000.0,47,other asian or pacific islander,employed


In [203]:
merged_data = pd.merge(df, ID, on='race', how='outer', indicator=True)
merged_data

Unnamed: 0,year,serial,inctot,age,race,empstat,id,_merge
0,2017.0,177686.0,,4,white,,000008,both
1,2017.0,1200045.0,6000.0,17,white,employed,000008,both
2,2017.0,70831.0,6150.0,63,white,employed,000008,both
3,2017.0,557128.0,14000.0,66,white,not in labor force,000008,both
4,2017.0,614890.0,,1,white,,000008,both
...,...,...,...,...,...,...,...,...
319002,2017.0,400992.0,82000.0,56,three or more major races,employed,000006,both
319003,2017.0,630504.0,0.0,29,three or more major races,not in labor force,000006,both
319004,,,,,indian,,000009,right_only
319005,,,,,korean,,000010,right_only


**新しく`_merge`列が追加されている** <br>
* both => keyが両方のデータフレームにあった<br>
* right/left_only => keyがright/leftにしかなかった

In [204]:
merged_data.loc[merged_data['_merge'] != 'both']

Unnamed: 0,year,serial,inctot,age,race,empstat,id,_merge
319004,,,,,indian,,9,right_only
319005,,,,,korean,,10,right_only
319006,,,,,arab,,11,right_only


**indian, korean, arabはACSデータに無いためこの結果となる** <br>
→howの与え方によって結合の方法を変更可能

In [207]:
merged_data = pd.merge(df, ID, on='race', how='inner', indicator=True)
merged_data.loc[merged_data['_merge'] != 'both']

Unnamed: 0,year,serial,inctot,age,race,empstat,id,_merge


**How to merge** <br>
![how to merge](https://www.practicaldatascience.org/html/_images/join-or-merge-in-python-pandas.png)