In [2]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../../src')
from ecomplexity import ecomplexity

# Display floats with 3 digits after the decimal point
pd.options.display.float_format = '{:.3f}'.format


In [3]:
import initial_condition
from analysis import weight


In [4]:
# Directories used by this notebook, relative to the notebook location.
# Each path keeps its trailing slash because file names are appended to it
# directly inside f-strings in later cells.
# (A module-level `global` statement has no effect, so none is used here.)
data_dir = '../../data/interim/internal/filtered_before_agg/'   # input table
filter_dir = '../../data/interim/internal/filter_after_agg/'    # filter list
output_dir = '../../data/interim/internal/filtered_after_agg/'  # filtered result


In [5]:
# Initial conditions, read from the shared initial_condition module.

# The year column of the input table is named f'{ar}_{year_style}'.
ar, year_style = initial_condition.AR, initial_condition.YEAR_STYLE

# Observation window (inclusive) and the step used to split it into periods.
year_start, year_end, year_range = (
    initial_condition.YEAR_START,
    initial_condition.YEAR_END,
    initial_condition.YEAR_RANGE,
)

# Classification column name and the two weighting modes
# ('fraction' or 'duplication' are the values handled in this notebook).
classification = initial_condition.CLASSIFICATION
class_weight, applicant_weight = (
    initial_condition.CLASS_WEIGHT,
    initial_condition.APPLICANT_WEIGHT,
)

# Population selector and the two-element setting that appear in the
# filter / output file names.
extract_population, top_p_or_num = (
    initial_condition.EXTRACT_POPULATION,
    initial_condition.TOP_P_OR_NUM,
)


In [6]:
# Whole window: load the classified table, reading only the columns used below.
year_col = f'{ar}_{year_style}'
all_df = pd.read_csv(f'{data_dir}addedclassification.csv',
                     encoding='utf-8',
                     sep=',',
                     usecols=['reg_num',
                              'right_person_name',
                              year_col,
                              classification],
                     dtype={'reg_num': str,
                            'right_person_name': str,
                            year_col: np.int64,
                            classification: str})

# Keep rows whose year lies in [year_start, year_end] (both inclusive) and
# drop exact duplicate rows.
all_df = (
    all_df[all_df[year_col].between(year_start, year_end)]
    .drop_duplicates()
)

# Per period: split the window into consecutive blocks of `year_range` years,
# keyed by the label 'start-end'.
sep_year_df_dict = {}
for year in range(year_start, year_end + 1, year_range):
    period_last_year = year + year_range - 1
    # Use the configured year column (not a hard-coded '<ar>_year') so the split
    # agrees with the filter above for every year_style.
    in_period = all_df[year_col].between(year, period_last_year)
    # .copy() makes each period frame independent of all_df, so later cells can
    # add columns without a SettingWithCopyWarning.
    sep_year_df_dict[f'{year}-{period_last_year}'] = all_df[in_period].copy()


In [7]:
def _add_weight(frame, mode, weight_column, fraction_func, *fraction_args):
    """Apply one weighting step to ``frame`` according to ``mode``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to weight.
    mode : str
        'fraction' delegates to ``fraction_func``; 'duplication' gives every
        row a weight of 1 in ``weight_column``.
    weight_column : str
        Column written in 'duplication' mode.
    fraction_func : callable
        Called as ``fraction_func(frame, *fraction_args)`` in 'fraction' mode.
        NOTE(review): presumably it returns ``frame`` with ``weight_column``
        added (the aggregation cell reads that column) - confirm in analysis.weight.

    Returns
    -------
    pandas.DataFrame
        The weighted table.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'fraction' nor 'duplication'.
    """
    if mode == 'fraction':
        return fraction_func(frame, *fraction_args)
    if mode == 'duplication':
        # assign() returns a new frame, so slices of all_df are never written to.
        return frame.assign(**{weight_column: 1})
    raise ValueError(
        f"unsupported weighting mode for {weight_column}: {mode!r} "
        "(expected 'fraction' or 'duplication')"
    )


def _apply_weights(frame, period_label):
    """Apply classification weighting, then co-applicant weighting, and tag the period.

    The period column is written after each step, in the same order as the
    original cell, so ``weight.by_applicant`` receives a frame that already
    carries the period label.
    """
    period_col = f'{ar}_{year_style}_period'
    # Weighting by patent classification
    weighted = _add_weight(frame, class_weight, 'class_weight',
                           weight.by_classification, classification)
    weighted[period_col] = period_label
    # Weighting by co-applicants
    weighted = _add_weight(weighted, applicant_weight, 'applicant_weight',
                           weight.by_applicant)
    weighted[period_col] = period_label
    return weighted


# Whole window
all_df = _apply_weights(all_df, f'{year_start}-{year_end}')

# Per period. The row-level frames are kept in both modes (with class_weight = 1
# in 'duplication' mode, exactly like the whole-window frame) so that the
# aggregation cell finds the class_weight / applicant_weight columns it needs.
sep_year_df_dict = {
    period: _apply_weights(sep_year_df, period)
    for period, sep_year_df in sep_year_df_dict.items()
}



In [8]:
def _aggregate_reg_num(weighted_df):
    """Sum weighted counts per (period, applicant, classification).

    Every input row contributes ``1 / class_weight / applicant_weight``; the
    contributions are summed into a ``reg_num`` column and the result is sorted
    by that column in descending order.

    Parameters
    ----------
    weighted_df : pandas.DataFrame
        Must contain the period column f'{ar}_{year_style}_period',
        'right_person_name', the classification column, 'class_weight' and
        'applicant_weight'.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns period, 'right_person_name',
        classification and 'reg_num'. ``weighted_df`` itself is not modified.
    """
    group_keys = [f'{ar}_{year_style}_period', 'right_person_name', classification]
    return (
        weighted_df
        # assign() works on a copy, so the 'reg_num' id column of the source
        # frame is not overwritten with the weighted count.
        .assign(reg_num=lambda d: 1 / d['class_weight'] / d['applicant_weight'])
        .groupby(group_keys)[['reg_num']]
        .sum()
        .reset_index(drop=False)
        .sort_values(['reg_num'], ascending=[False])
    )


# Whole window
all_reg_num_df = _aggregate_reg_num(all_df)

# Per period, then stacked into one table
sep_year_reg_num_df_dict = {
    period: _aggregate_reg_num(period_df)
    for period, period_df in sep_year_df_dict.items()
}
sep_year_reg_num_df = pd.concat(list(sep_year_reg_num_df_dict.values()),
                                axis='index', ignore_index=True)
sep_year_reg_num_df


Unnamed: 0,app_year_period,right_person_name,schmoch35,reg_num
0,1981-1990,パナソニツクホールデイングス株式会社,1,9358.286
1,1981-1990,パナソニツクホールデイングス株式会社,2,8889.486
2,1981-1990,キヤノン株式会社,9,7159.833
3,1981-1990,富士通株式会社,6,6335.750
4,1981-1990,コニカミノルタ株式会社,9,5721.250
...,...,...,...,...
230618,2001-2010,日本原子力発電株式会社,9,0.071
230619,2001-2010,北海道電力株式会社,31,0.071
230620,2001-2010,日本原燃株式会社,9,0.071
230621,2001-2010,有限会社テーエスピー,31,0.071


In [9]:
# Filtering: load the (period, applicant) pairs that the next cell keeps.
reg_num_filter_path = f'{filter_dir}{ar}_{year_style}_{extract_population}_reg_num_top_{top_p_or_num[0]}_{top_p_or_num[1]}.csv'
filter_columns = [f'{ar}_{year_style}_period', 'right_person_name']

# Both columns are read as strings.
reg_num_filter_df = pd.read_csv(
    reg_num_filter_path,
    encoding='utf-8',
    sep=',',
    usecols=filter_columns,
    dtype=str,
)
reg_num_filter_df

Unnamed: 0,app_year_period,right_person_name
0,1981-2010,パナソニツクホールデイングス株式会社
1,1981-2010,キヤノン株式会社
2,1981-2010,株式会社東芝
3,1981-2010,日本電気株式会社
4,1981-2010,三菱電機株式会社
...,...,...
1924,1981-2010,大成ロテツク株式会社
1925,1981-2010,アーキヤマデ株式会社
1926,1981-2010,大成プラス株式会社
1927,1981-2010,長田電機工業株式会社


In [10]:

# Keep only the applicants listed in the filter table.
if extract_population == 'all':
    filter_keys = [f'{ar}_{year_style}_period', 'right_person_name']
    # Whole window: match on period and applicant name. The filter keys are
    # de-duplicated first so a repeated filter row cannot multiply rows in the
    # inner join.
    all_reg_num_top_df = pd.merge(
        all_reg_num_df,
        reg_num_filter_df[filter_keys].drop_duplicates(),
        on=filter_keys,
        how='inner',
    )
    # Per period: match on applicant name only, ignoring the filter's period
    # column (the rows shown in the filter output all carry the whole-window
    # label, which never equals a per-period label).
    sep_year_reg_num_top_df = sep_year_reg_num_df[
        sep_year_reg_num_df['right_person_name'].isin(reg_num_filter_df['right_person_name'])
    ]
else:
    # Fail loudly instead of hitting a NameError below or silently reusing
    # variables left over from an earlier kernel run.
    raise NotImplementedError(
        f"filtering is only implemented for extract_population == 'all', "
        f"got {extract_population!r}"
    )

# Stack the whole-window rows on top of the per-period rows.
reg_num_top_df = pd.concat([all_reg_num_top_df, sep_year_reg_num_top_df],
                           axis='index', ignore_index=True)
reg_num_top_df


Unnamed: 0,app_year_period,right_person_name,schmoch35,reg_num
0,1981-2010,キヤノン株式会社,9,23609.000
1,1981-2010,キヤノン株式会社,28,9938.833
2,1981-2010,キヤノン株式会社,2,9424.667
3,1981-2010,キヤノン株式会社,6,7389.333
4,1981-2010,キヤノン株式会社,3,6325.500
...,...,...,...,...
105079,2001-2010,四国電力株式会社,9,0.071
105080,2001-2010,五洋建設株式会社,31,0.071
105081,2001-2010,九州電力株式会社,31,0.071
105082,2001-2010,日本原子力発電株式会社,9,0.071


In [11]:
# Write the filtered table (whole window + per period) to the output directory.
reg_num_top_path = f'{output_dir}{ar}_{year_style}_{extract_population}_{top_p_or_num[0]}_{top_p_or_num[1]}.csv'
reg_num_top_df.to_csv(
    reg_num_top_path,
    encoding='utf-8',
    sep=',',
    index=False,  # the row index is not written
)


In [12]:
# Display the co-applicant weighting mode that was in effect for this run
applicant_weight

'fraction'

In [13]:
# Display the top_p_or_num setting; its two elements appear in the filter and output file names
top_p_or_num

('p', 3)