<a id=top></a>

# **目次**

<b>
    <details>
        <summary>
            <a href="#modules" style="font-size: xx-large">1. モジュールインポート</a>
            <ul>※サードパーティライブラリ>>>自作モジュール>>>（ここまで本ipynb外）>>>自作関数（本ipynb内）</ul>
        </summary>
    </details>
    <details>
        <summary>
            <a href="#data" style="font-size: xx-large">2. オリジナルデータインポート</a>
        </summary>
    </details>
    <details>
        <summary>
            <a href="#patentcount" style="font-size: xx-large">3. 特許数</a>
        </summary>
        <table></table>
    </details>
    <details>
        <summary>
            <a href="#calculateindicator" style="font-size: xx-large">4. 各指標</a>
        </summary>
    </details>
    <details>
        <summary>
            <a href="#output" style="font-size: xx-large">5. ファイルに出力</a>
        </summary>
    </details>
</b>


---


<a id=modules></a>

## **1. モジュールインポート**


In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../../src')
from ecomplexity import ecomplexity

# Display floats with 3 decimal places.
pd.options.display.float_format = '{:.3f}'.format


In [2]:
import initial_condition


In [24]:
# Input/output directories, relative to this notebook.
# Every value keeps its trailing slash because the cells that read and write
# files build their paths by plain f-string concatenation.
# (The former module-level `global` statement was a no-op and has been removed.)
data_dir = '../../data/interim/internal/filtered_after_agg/'
output_dir = '../../data/processed/internal/'
ex_dir = '../../data/processed/external/schmoch/'


In [4]:
# 初期条件
ar = initial_condition.AR
year_style = initial_condition.YEAR_STYLE

year_start = initial_condition.YEAR_START
year_end = initial_condition.YEAR_END
year_range = initial_condition.YEAR_RANGE

classification = initial_condition.CLASSIFICATION
class_weight = initial_condition.CLASS_WEIGHT
applicant_weight = initial_condition.APPLICANT_WEIGHT

extract_population = initial_condition.EXTRACT_POPULATION
top_p_or_num = initial_condition.TOP_P_OR_NUM


In [5]:
# Show the value read from initial_condition.TOP_P_OR_NUM.
top_p_or_num

('num', 100)

In [6]:
def _append_mcp_average(df, group_col, value_col, degree_col, out_col):
    """Return ``df`` plus ``out_col`` = (sum of ``value_col`` over the group's mcp==1 rows) / ``degree_col``.

    The sum is taken per ``group_col`` over rows where ``mcp == 1`` only and is
    then left-merged back onto every row, so groups without any ``mcp == 1``
    row receive NaN. ``df`` itself is not modified.
    """
    group_sum = (
        df.loc[df['mcp'] == 1]
        .groupby([group_col])[[value_col]]
        .sum()
        .reset_index(drop=False)
        .rename(columns={value_col: out_col})
    )
    merged = pd.merge(df, group_sum, on=[group_col], how='left')
    merged[out_col] = merged[out_col] / merged[degree_col]
    return merged


def kh_ki(c_df, classification, n=19, entity_col='right_person_name'):
    """Append the iterated averages ``kh_1..kh_{n+1}`` and ``ki_1..ki_{n+1}`` to ``c_df``.

    For each order N >= 1 (order 0 being the ``diversity`` / ``ubiquity`` columns):

    * ``kh_N`` = sum of ``ki_{N-1}`` over the entity's ``mcp == 1`` rows, divided by ``diversity``
    * ``ki_N`` = sum of ``kh_{N-1}`` over the class's ``mcp == 1`` rows, divided by ``ubiquity``

    NOTE(review): this has the shape of the Method of Reflections recursion,
    assuming ``diversity`` / ``ubiquity`` are the per-entity / per-class counts
    of ``mcp == 1`` rows -- confirm against the producer of ``c_df``.

    Parameters
    ----------
    c_df : pandas.DataFrame
        One row per (entity, class) pair with at least the columns
        ``entity_col``, ``classification``, ``mcp``, ``diversity`` and
        ``ubiquity``. The caller in this notebook passes a single period at a
        time; the function itself does not group by period.
    classification : str
        Name of the class column.
    n : int, default 19
        Number of additional iterations after order 1, so the highest order
        produced is ``n + 1`` (20 by default).
    entity_col : str, default 'right_person_name'
        Name of the entity column.

    Returns
    -------
    pandas.DataFrame
        A new frame (``c_df`` is left untouched) with the input columns followed
        by ``kh_1, ki_1, kh_2, ki_2, ...``. Rows keep the input order; the index
        is the fresh integer index produced by ``pandas.merge``.

    Raises
    ------
    KeyError
        If a required column is missing from ``c_df``.
    """
    required = [entity_col, classification, 'mcp', 'diversity', 'ubiquity']
    missing = [col for col in required if col not in c_df.columns]
    if missing:
        raise KeyError(f'kh_ki: c_df is missing required column(s): {missing}')

    # Order 1 is seeded from the order-0 quantities (ubiquity / diversity).
    kh_ki_df = _append_mcp_average(c_df, entity_col, 'ubiquity', 'diversity', 'kh_1')
    kh_ki_df = _append_mcp_average(kh_ki_df, classification, 'diversity', 'ubiquity', 'ki_1')

    # Each further order is built from the other side's previous order.
    for order in range(1, n + 1):
        kh_ki_df = _append_mcp_average(
            kh_ki_df, entity_col, f'ki_{order}', 'diversity', f'kh_{order + 1}')
        kh_ki_df = _append_mcp_average(
            kh_ki_df, classification, f'kh_{order}', 'ubiquity', f'ki_{order + 1}')
    return kh_ki_df


---


<a id=data></a>

## **2. オリジナルデータインポート**


In [7]:
# Load the input table; the file name is assembled from the initial-condition
# values (ar, year_style, extract_population, top_p_or_num).
reg_num_top_df = pd.read_csv(
    f'{data_dir}{ar}_{year_style}_{extract_population}_{top_p_or_num[0]}_{top_p_or_num[1]}.csv',
    encoding='utf-8',
    sep=',',
)
reg_num_top_df

Unnamed: 0,app_year_period,right_person_name,schmoch35,reg_num
0,1981-2010,キヤノン株式会社,9,23609.000
1,1981-2010,キヤノン株式会社,28,9938.833
2,1981-2010,キヤノン株式会社,2,9424.667
3,1981-2010,キヤノン株式会社,6,7389.333
4,1981-2010,キヤノン株式会社,3,6325.500
...,...,...,...,...
11723,2001-2010,株式会社ブリヂストン,16,0.333
11724,2001-2010,三菱自動車工業株式会社,28,0.250
11725,2001-2010,日本電信電話株式会社,28,0.250
11726,2001-2010,株式会社ＮＴＴドコモ,29,0.250


In [8]:
# Count the distinct right_person_name values in the loaded table.
reg_num_top_df['right_person_name'].nunique()

100

<a href=#top>先頭に戻る</a>

---


<a id=calculateindicator></a>

## **4. 各指標**


In [9]:
# Column-role mapping passed to ecomplexity() as cols_input.
trade_cols = dict(
    time=f'{ar}_{year_style}_period',
    loc='right_person_name',
    prod=classification,
    val='reg_num',
)

# ecomplexity output column names -> names used in this notebook.
rename_col_dict = dict(eci='kci', pci='tci')

# Columns kept after the rename, in this order.
col_order_list = [
    f'{ar}_{year_style}_period',
    'right_person_name',
    classification,
    'reg_num',
    'rca',
    'mcp',
    'diversity',
    'ubiquity',
    'kci',
    'tci',
]


In [10]:
# Step 1: run ecomplexity on the loaded table.
c_df = ecomplexity(reg_num_top_df, cols_input=trade_cols, rca_mcp_threshold=1)

# Step 2: keep rows with a positive reg_num, rename eci/pci to kci/tci and
# restrict to the configured column order.
c_df = c_df.loc[c_df['reg_num'] > 0].rename(columns=rename_col_dict)[col_order_list]

# Step 3: append the kh_*/ki_* columns one period at a time, then stack the
# per-period results back into a single frame with a fresh index.
c_df = pd.concat(
    [
        kh_ki(c_df[c_df[f'{ar}_{year_style}_period'] == period], classification)
        for period in c_df[f'{ar}_{year_style}_period'].unique()
    ],
    axis='index',
    ignore_index=True,
)

# Disabled per-period inspection, kept for reference:
# for segment in c_df[f'{ar}_{year_style}_period'].unique():
#     display(c_df[c_df[f'{ar}_{year_style}_period'] == segment].head())
#     display(c_df[c_df[f'{ar}_{year_style}_period'] == segment].describe())
#     print(c_df[c_df[f'{ar}_{year_style}_period'] == segment].info())
#     print('\n')


1981-2010
1981-1990
1991-2000
2001-2010


In [11]:
# Show c_df ordered by period (ascending), then by kci (descending).
c_df.sort_values(by=[f'{ar}_{year_style}_period', 'kci'], ascending=[True, False])

Unnamed: 0,app_year_period,right_person_name,schmoch35,reg_num,rca,mcp,diversity,ubiquity,kci,tci,...,kh_16,ki_16,kh_17,ki_17,kh_18,ki_18,kh_19,ki_19,kh_20,ki_20
4133,1981-1990,住友化学株式会社,1,16.000,0.066,0,8,22,1.229,-0.522,...,9.294,24.940,25.021,9.249,9.281,24.953,25.002,9.254,9.273,24.960
4134,1981-1990,住友化学株式会社,2,12.500,0.041,0,8,24,1.229,-1.678,...,9.294,24.891,25.021,9.230,9.281,24.922,25.002,9.242,9.273,24.942
4135,1981-1990,住友化学株式会社,6,0.500,0.002,0,8,17,1.229,-2.225,...,9.294,24.866,25.021,9.220,9.281,24.907,25.002,9.236,9.273,24.932
4136,1981-1990,住友化学株式会社,8,9.000,0.050,0,8,21,1.229,-1.663,...,9.294,24.891,25.021,9.230,9.281,24.923,25.002,9.242,9.273,24.942
4137,1981-1990,住友化学株式会社,9,134.500,0.440,0,8,20,1.229,-0.752,...,9.294,24.933,25.021,9.247,9.281,24.948,25.002,9.252,9.273,24.957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11329,2001-2010,株式会社ＮＴＴドコモ,29,0.250,0.003,0,6,33,-2.039,1.197,...,9.457,25.991,25.981,9.477,9.464,25.989,25.983,9.475,9.468,25.988
11330,2001-2010,株式会社ＮＴＴドコモ,30,0.500,0.007,0,6,16,-2.039,0.497,...,9.457,25.987,25.981,9.472,9.464,25.987,25.983,9.472,9.468,25.987
11331,2001-2010,株式会社ＮＴＴドコモ,32,2.000,0.009,0,6,21,-2.039,1.630,...,9.457,25.988,25.981,9.473,9.464,25.988,25.983,9.473,9.468,25.988
11332,2001-2010,株式会社ＮＴＴドコモ,33,2.000,0.020,0,6,8,-2.039,-0.467,...,9.457,25.983,25.981,9.468,9.464,25.985,25.983,9.470,9.468,25.986


In [12]:
# Inspect the rows for class 22 in the 1981-1990 period.
c_df[(c_df[classification]==22)&(c_df[f'{ar}_{year_style}_period']=='1981-1990')]

Unnamed: 0,app_year_period,right_person_name,schmoch35,reg_num,rca,mcp,diversity,ubiquity,kci,tci,...,kh_16,ki_16,kh_17,ki_17,kh_18,ki_18,kh_19,ki_19,kh_20,ki_20
5104,1981-1990,株式会社クラレ,22,0.5,217.495,1,14,1,0.857,1.292,...,9.283,25.026,25.004,9.283,9.275,25.004,24.991,9.275,9.269,24.991


<a href="#top">先頭に戻る</a>

---


<a id=output></a>

## **5. ファイルに出力**

<a id=rightperson></a>

### **5.1. 特許権者**


In [13]:

# Per (period, right_person_name): total reg_num and number of distinct classes.
right_person_df = pd.merge(
    c_df.groupby([f'{ar}_{year_style}_period', 'right_person_name'])[['reg_num']]
        .sum()
        .reset_index(drop=False),
    c_df.groupby([f'{ar}_{year_style}_period', 'right_person_name'])[[classification]]
        .nunique()
        .reset_index(drop=False),
    on=[f'{ar}_{year_style}_period', 'right_person_name'],
    how='inner',
)

# Take the kh_* columns from c_df itself instead of hard-coding range(1, 20+1),
# so this cell keeps working if kh_ki() is called with a different n.
kh_cols = [col for col in c_df.columns if str(col).startswith('kh_')]
assert kh_cols, 'c_df has no kh_* columns; run the kh_ki() cell first'

# Attach the per-right-holder indicators (diversity, kci, kh_*), one row per
# (period, right_person_name) after dropping the per-class duplicates.
right_person_df = pd.merge(
    right_person_df,
    c_df[[f'{ar}_{year_style}_period', 'right_person_name', 'diversity', 'kci'] + kh_cols]
        .drop_duplicates(keep='first'),
    on=[f'{ar}_{year_style}_period', 'right_person_name'],
    how='inner',
)

# Disabled experiment, kept for reference (per-period standardisation of kh_i
# plus inspection output).
# NOTE(review): the assignment below uses chained indexing, so re-enabling it
# as written would not modify right_person_df; use .loc if it is revived.
# for period in right_person_df[f'{ar}_{year_style}_period'].unique():
#     for i in range(1, 20+1):
#         value = right_person_df[right_person_df[f'{ar}_{year_style}_period']==period]
#         right_person_df[right_person_df[f'{ar}_{year_style}_period']==period][f'kh_{i}'] = (value[f'kh_{i}'] - value[f'kh_{i}'].mean()) / value[f'kh_{i}'].std()
#     display(right_person_df[right_person_df[f'{ar}_{year_style}_period'] == period].head())
#     display(right_person_df[right_person_df[f'{ar}_{year_style}_period'] == period].describe())
#     print(right_person_df[right_person_df[f'{ar}_{year_style}_period'] == period].info())
#     print('\n')
# right_person_df['reg_num'] = right_person_df['reg_num'].astype(np.int64)

In [14]:
# Write the per-right-holder table under output_dir/firms/; the file name is
# built from ar, year_style and top_p_or_num.
right_person_df.to_csv(
    f'{output_dir}firms/{ar}_{year_style}_{top_p_or_num[0]}_{top_p_or_num[1]}.csv',
    encoding='utf-8',
    sep=',',
    index=False,
)


<a href=#top>先頭に戻る</a>

---


<a id=ipc></a>

### **5.2. IPC**


In [15]:
# 各期間
classification_df = pd.merge(c_df.groupby([f'{ar}_{year_style}_period', classification])[['reg_num']].sum().reset_index(drop=False), 
                        c_df.groupby([f'{ar}_{year_style}_period', classification])[['right_person_name']].nunique().reset_index(drop=False), 
                        on=[f'{ar}_{year_style}_period', classification], 
                        how='inner')
classification_df = pd.merge(classification_df, 
                      c_df[[f'{ar}_{year_style}_period', classification, 'ubiquity', 'tci']\
                          +[f'ki_{i}' for i in range(1, 20+1)]]\
                          .drop_duplicates(keep='first'), 
                      on=[f'{ar}_{year_style}_period', classification], 
                      how='inner')
# classification_df['reg_num'] = classification_df['reg_num'].astype(np.int64)
display(classification_df)


Unnamed: 0,app_year_period,schmoch35,reg_num,right_person_name,ubiquity,tci,ki_1,ki_2,ki_3,ki_4,...,ki_11,ki_12,ki_13,ki_14,ki_15,ki_16,ki_17,ki_18,ki_19,ki_20
0,1981-1990,1,35671.136,97,22,-0.522,8.455,24.223,8.770,24.341,...,9.202,24.884,9.226,24.919,9.241,24.940,9.249,24.953,9.254,24.960
1,1981-1990,2,44438.826,94,24,-1.678,7.042,22.863,8.226,23.511,...,9.126,24.756,9.178,24.840,9.210,24.891,9.230,24.922,9.242,24.942
2,1981-1990,3,19821.649,86,18,-2.469,7.111,21.206,7.926,22.779,...,9.064,24.661,9.140,24.781,9.187,24.855,9.216,24.901,9.234,24.928
3,1981-1990,4,3674.627,56,12,-2.611,8.167,20.701,8.063,22.568,...,9.053,24.643,9.133,24.771,9.183,24.849,9.214,24.897,9.232,24.926
4,1981-1990,5,10094.417,71,23,-2.362,6.957,21.679,7.987,22.828,...,9.071,24.672,9.145,24.789,9.190,24.860,9.218,24.903,9.235,24.930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2001-2010,31,12699.383,94,22,1.610,8.136,25.256,9.011,25.808,...,9.468,25.991,9.473,25.990,9.474,25.989,9.474,25.988,9.473,25.988
136,2001-2010,32,36925.528,90,21,1.630,6.857,24.976,8.758,25.690,...,9.461,25.988,9.469,25.989,9.472,25.988,9.473,25.988,9.473,25.988
137,2001-2010,33,16207.750,85,8,-0.467,7.500,20.357,8.703,24.267,...,9.436,25.965,9.454,25.978,9.463,25.983,9.468,25.985,9.470,25.986
138,2001-2010,34,7859.833,86,16,-0.451,9.250,23.883,9.308,25.488,...,9.462,25.980,9.466,25.984,9.469,25.985,9.470,25.986,9.471,25.986


In [16]:
# Write the per-class table under output_dir/technology/; the file name is
# built from ar, year_style and top_p_or_num.
classification_df.to_csv(
    f'{output_dir}technology/{ar}_{year_style}_{top_p_or_num[0]}_{top_p_or_num[1]}.csv',
    encoding='utf-8',
    sep=',',
    index=False,
)


In [27]:
# Read only the field number and English field name from 35.csv in ex_dir,
# then collapse repeated (number, name) pairs to one row each.
schmoch_df = pd.read_csv(
    f'{ex_dir}35.csv',
    encoding='utf-8',
    sep=',',
    usecols=['Field_number', 'Field_en'],
).drop_duplicates()

In [28]:
# Show the field number / English field name lookup table.
schmoch_df

Unnamed: 0,Field_number,Field_en
0,1,"Electrical machinery, apparatus, energy"
30,2,Audio-visual technology
48,3,Telecommunications
58,4,Digital communication
61,5,Basic communication processes
71,6,Computer technology
88,7,IT methods for management
89,8,Semiconductors
91,9,Optics
101,10,Measurement


<a href=#top>先頭に戻る</a>

---


<a id=network></a>

### **5.3. 二部グラフ用**


In [34]:
# Rows of c_df for the '{year_start}-{year_end}' period whose right_person_name
# contains 'ＥＮＥＯＳ' and whose mcp flag is 1.
eneos_df = c_df[
    (c_df[f'{ar}_{year_style}_period'] == f'{year_start}-{year_end}')
    & (c_df['right_person_name'].str.contains('ＥＮＥＯＳ'))
    & (c_df['mcp'] == 1)
].copy()

# Attach the English field name by matching schmoch35 to Field_number.
eneos_df = pd.merge(
    eneos_df,
    schmoch_df.rename(columns={'Field_number': 'schmoch35'}).drop_duplicates(keep='first'),
    on=['schmoch35'],
    how='inner',
)
eneos_df[['ubiquity', 'Field_en', 'ki_1']]

['Analysis of biological materials',
 'Organic fine chemistry',
 'Biotechnology',
 'Pharmaceuticals',
 'Macromolecular chemistry, polymers',
 'Food chemistry',
 'Basic materials chemistry',
 'Materials, metallurgy',
 'Surface technology, coating',
 'Chemical engineering',
 'Other special machines']

In [None]:
# FIXME: unfinished filter. As written, `c_df[c_df['']]` looks up a column
# named '' that c_df does not have, so it raises KeyError and stops a
# Restart & Run All. Disabled until the intended condition is filled in.
# c_df[c_df['']]

In [17]:
# graph_df = pd.concat([c_df, c_df], axis='index')
# graph_df = graph_df[graph_df['mcp']==1][[f'{ar}_{year_style}', 'right_person_name', 'ipc_class', 'mcp']]
# graph_df

In [18]:
# graph_df.to_csv(f'../Data/0_Graph/{ar}_{year_start}_{year_end}.csv', 
#                 encoding='utf-8', 
#                 sep=',', 
#                 index=False)
# graph_df
