In [6]:
import pandas as pd
import numpy as np
from ecomplexity import ecomplexity

# Render floats with six digits after the decimal point.
pd.set_option('display.float_format', '{:.6f}'.format)


In [7]:
# Relative paths to the interim data folders (pre- and post-aggregation,
# going by the folder names). Plain module-level assignment is enough here.
data_dir, output_dir = (
    '../../data/interim/internal/filtered_before_agg/',
    '../../data/interim/internal/filtered_after_agg/',
)


In [8]:
def weight_by_ipc(reg_num_df: pd.DataFrame) -> pd.DataFrame:
    """Fractionally count patents per (right holder, IPC class).

    For every (``right_person_name``, ``reg_num``) pair the weight is
    ``1 / (number of distinct ipc_class values on that pair)``, rounded to two
    decimals. The weights are then summed per
    (``right_person_name``, ``ipc_class``).

    Parameters
    ----------
    reg_num_df : pd.DataFrame
        Must contain the columns ``right_person_name``, ``reg_num`` and
        ``ipc_class``. Any other columns are ignored. The frame is not
        modified.

    Returns
    -------
    pd.DataFrame
        Columns ``right_person_name``, ``ipc_class`` and ``reg_num``, where
        ``reg_num`` holds the summed fractional weight. Rows are sorted by
        that weight in descending order.
    """
    pair_cols = ['right_person_name', 'reg_num']

    # Work on exactly one row per (holder, patent, class). De-duplicating on
    # these three columns only (instead of on every column of the input) keeps
    # a triple from being summed twice when some unrelated column differs,
    # which would make a patent contribute more than its intended share.
    triples = reg_num_df[pair_cols + ['ipc_class']].drop_duplicates()

    ipc_weight_df = (
        triples.groupby(pair_cols)[['ipc_class']].nunique()
               .reset_index(drop=False)
               .rename(columns={'ipc_class': 'weight'})
    )
    # Two-decimal rounding is kept from the original implementation
    # (e.g. three classes -> 0.33 each).
    ipc_weight_df['weight'] = (1 / ipc_weight_df['weight']).round(2)

    weighted_triples = pd.merge(triples, ipc_weight_df, on=pair_cols, how='left')

    return (
        weighted_triples.groupby(['right_person_name', 'ipc_class'])[['weight']].sum()
                        .sort_values('weight', ascending=False)
                        .reset_index(drop=False)
                        .rename(columns={'weight': 'reg_num'})
    )


In [9]:
# Analysis window: first year, last year (inclusive) and period length in years.
year_start, year_end, year_range = 1981, 2010, 10

# Run options.
ar = 'app'                      # prefix of the CSV file name and of the '<ar>_year' column
ipc_digit = 4                   # number of leading IPC characters to keep
weight = 'fraction'             # fraction or duplication
how_extract = 'all'             # all or sep_year
top_p_or_num = ('num', 1000)    # (p or num, int)


In [10]:
# Select the rows belonging to the top right holders (by percentage or by count).
def extract_top(reg_num_df: pd.DataFrame,
                top_p_or_num: tuple) -> pd.DataFrame:
    """Keep only the rows of the top right holders ranked by total ``reg_num``.

    Parameters
    ----------
    reg_num_df : pd.DataFrame
        Must contain the columns ``right_person_name`` and ``reg_num``.
        The frame is not modified.
    top_p_or_num : tuple
        ``(mode, value)`` where ``mode`` is ``'p'`` or ``'num'``:

        * ``('p', p)``   keeps the top ``n_holders * p // 100 + 1`` holders.
        * ``('num', n)`` keeps the top ``n`` holders.

        A bare number is also accepted and treated as ``('p', number)`` so
        that callers passing a plain percentage keep working.

    Returns
    -------
    pd.DataFrame
        A copy of the rows of ``reg_num_df`` whose ``right_person_name`` is
        among the selected holders.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'p'`` nor ``'num'``, if the tuple does not
        have exactly two elements, or if ``value`` is negative.
    """
    if isinstance(top_p_or_num, (tuple, list)):
        if len(top_p_or_num) != 2:
            raise ValueError(
                f"top_p_or_num must be (mode, value), got {top_p_or_num!r}")
        mode, value = top_p_or_num
    else:
        # Legacy call style: a plain number means "top value percent".
        mode, value = 'p', top_p_or_num

    if value < 0:
        raise ValueError(f"top_p_or_num value must be non-negative, got {value!r}")

    # Total per holder, largest first.
    holder_totals = (reg_num_df.groupby('right_person_name')['reg_num'].sum()
                               .sort_values(ascending=False))

    if mode == 'p':
        # Same formula as before: the floor of p percent of the holders, plus one.
        n_keep = int(len(holder_totals) * value // 100) + 1
    elif mode == 'num':
        n_keep = int(value)
    else:
        raise ValueError(
            f"top_p_or_num mode must be 'p' or 'num', got {mode!r}")

    top_right_person_list = holder_totals.head(n_keep).index.to_list()
    return reg_num_df[reg_num_df['right_person_name'].isin(top_right_person_list)].copy()
    

In [11]:
# Whole study window: read only the columns the analysis needs.
_year_col = f'{ar}_year'
all_df = pd.read_csv(
    f'{data_dir}{ar}.csv',
    encoding='utf-8',
    sep=',',
    usecols=['reg_num', 'right_person_name', _year_col, 'ipc_class'],
    dtype={'reg_num': str,
           'right_person_name': str,
           _year_col: np.int64,
           'ipc_class': str},
)

# Truncate IPC codes to the configured number of characters, keep the rows
# inside [year_start, year_end] and drop rows that became exact duplicates.
all_df['ipc_class'] = all_df['ipc_class'].str.slice(0, ipc_digit)
_in_window = all_df[_year_col].between(year_start, year_end)
all_df = all_df[_in_window].drop_duplicates()

# (label, column) pairs for the distinct-count summary printed below.
_summary_cols = (
    ('特許数（次数削減前）:', 'reg_num'),
    ('特許権者（次数削減前）:', 'right_person_name'),
    ('IPCクラス（次数削減前）:', 'ipc_class'),
)
for _label, _col in _summary_cols:
    print(_label, all_df[_col].nunique())
display(all_df.head())


# Per period: split the window into consecutive blocks of `year_range` years,
# keyed by a 'first-last' label.
sep_year_df_dict = {}

for year in range(year_start, year_end+1, year_range):
    _period_label = f'{year}-{year+year_range-1}'
    _period_df = all_df[all_df[_year_col].between(year, year+year_range-1)]
    sep_year_df_dict[_period_label] = _period_df
    print(f'=============={_period_label}==============')
    for _label, _col in _summary_cols:
        print(_label, _period_df[_col].nunique())
    print('=====================================\n')


特許数（次数削減前）: 3189395
特許権者（次数削減前）: 73393
IPCクラス（次数削減前）: 630


Unnamed: 0,reg_num,right_person_name,app_year,ipc_class
0,5684492,ＤＲＣ合同会社,2010,G10H
1,5684512,株式会社ＩＨＩエアロスペース,2010,B62D
2,5684598,株式会社オカムラ,2010,A47C
3,5684620,三井化学株式会社,2010,H01M
12,5684736,シロキ工業株式会社,2010,B60N


特許数（次数削減前）: 832268
特許権者（次数削減前）: 28148
IPCクラス（次数削減前）: 613

特許数（次数削減前）: 1058468
特許権者（次数削減前）: 36819
IPCクラス（次数削減前）: 616

特許数（次数削減前）: 1298659
特許権者（次数削減前）: 38096
IPCクラス（次数削減前）: 617



In [None]:
def _count_by_holder_and_class(patent_df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate patents per (right_person_name, ipc_class).

    Reads the notebook-level ``weight`` option: ``'fraction'`` uses
    ``weight_by_ipc``; any other value counts distinct ``reg_num`` values.
    """
    if weight == 'fraction':
        return weight_by_ipc(patent_df)
    return (patent_df.groupby(['right_person_name', 'ipc_class'])[['reg_num']]
                     .nunique()
                     .reset_index(drop=False))


# Whole window
all_reg_num_df = _count_by_holder_and_class(all_df)
all_reg_num_df['segment'] = f'{year_start}-{year_end}'


# Per period
sep_year_reg_num_df_dict = {}
sep_year_reg_num_top_p_df_dict = {}
for period, sep_year_df in sep_year_df_dict.items():
    # Store the per-period result under its own key. The previous non-fraction
    # branch overwrote `all_reg_num_df` here and left the key missing.
    sep_year_reg_num_df_dict[period] = _count_by_holder_and_class(sep_year_df)
    sep_year_reg_num_df_dict[period]['segment'] = period


In [None]:

# Whole window: keep only the top right holders.
all_reg_num_top_p_df = extract_top(all_reg_num_df,
                                   top_p_or_num)

display(all_reg_num_top_p_df.head())

# Per period: apply the same selection rule to every period. The previous
# single call relied on the `period` variable left over from an earlier loop
# and on an undefined `top_p`.
sep_year_reg_num_top_p_df_dict = {
    period_label: extract_top(period_reg_num_df, top_p_or_num)
    for period_label, period_reg_num_df in sep_year_reg_num_df_dict.items()
}

sep_year_reg_num_top_p_df = pd.concat(list(sep_year_reg_num_top_p_df_dict.values()),
                                      axis='index')

display(sep_year_reg_num_top_p_df)


In [None]:
# Summary of the frame that holds only the selected top right holders.
# NOTE(review): the printed labels still say "before" (次数削減前) although this
# frame is the filtered one - confirm whether they should say "after".
_holder_totals = (all_reg_num_top_p_df.groupby('right_person_name')['reg_num']
                                      .sum()
                                      .reset_index(drop=False))
print('特許数（次数削減前）:', _holder_totals['reg_num'].sum())
print('特許権者（次数削減前）:', all_reg_num_top_p_df['right_person_name'].nunique())
print('IPCクラス（次数削減前）:', all_reg_num_top_p_df['ipc_class'].nunique())
display(all_reg_num_top_p_df.head())


# 各期間
# sep_year_df_dict = {}

# for year in range(year_start, year_end+1, year_range):
#     sep_year_df_dict[f'{year}-{year+year_range-1}'] = all_df[all_df[f'{ar}_year'].isin(range(year, year+year_range))]
#     print(f'=============={year}-{year+year_range-1}==============')
#     print('特許数（次数削減前）:', sep_year_df_dict[f'{year}-{year+year_range-1}']['reg_num'].nunique())
#     print('特許権者（次数削減前）:', sep_year_df_dict[f'{year}-{year+year_range-1}']['right_person_name'].nunique())
#     print('IPCクラス（次数削減前）:', sep_year_df_dict[f'{year}-{year+year_range-1}']['ipc_class'].nunique())
#     print('=====================================\n')
