In [None]:
import pandas as pd
import numpy as np
import sys

# Extend the import search path before the imports below run.
# NOTE(review): presumably the project-local modules (and possibly ecomplexity)
# live in ../../src — confirm.
sys.path.append('../../src')
from ecomplexity import ecomplexity
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
import io

import matplotlib.ticker as ptick
import networkx as nx
import networkx.algorithms.bipartite as bip

# Default plot font; the Meiryo alternative is kept for reference.
# plt.rcParams['font.family'] = 'Meiryo'
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['font.size'] = 20

# Show floats with 3 decimal places in pandas output
pd.options.display.float_format = '{:.3f}'.format


In [None]:
import initial_condition
from process import weight
from visualize import rank as vr


In [None]:
# Data locations, relative to this notebook.
# (A `global` statement is not needed here: assignments in a notebook cell are
# already module-level.)
data_dir = '../../data/interim/internal/filtered_before_agg/'    # raw patent table (japan.csv)
filter_dir = '../../data/interim/internal/filter_after_agg/'     # filter CSV keyed by filter_condition
output_dir = '../../data/interim/internal/filtered_after_agg/'
ex_dir = '../../data/processed/external/schmoch/'                # Schmoch field lookup (35.csv)


In [None]:
# Initial conditions: shared settings read from the initial_condition module
ar = initial_condition.AR
year_style = initial_condition.YEAR_STYLE

# Study window: first year, last year, and the step used to cut it into periods
year_start = initial_condition.YEAR_START
year_end = initial_condition.YEAR_END
year_range = initial_condition.YEAR_RANGE

extract_population = initial_condition.EXTRACT_POPULATION
# Indexable pair; elements [0] and [1] are spliced into the file keys below
top_p_or_num = initial_condition.TOP_P_OR_NUM
# top_p_or_num = ('p', 100)
# Name of the column that identifies the unit being filtered
region_corporation = initial_condition.REGION_CORPORATION
# region_corporation = 'right_person_addr'
# Weighting mode for co-applicants; the weighting cell handles 'fraction' and 'duplication'
applicant_weight = initial_condition.APPLICANT_WEIGHT

# Name of the classification column and its weighting mode ('fraction' / 'duplication')
classification = initial_condition.CLASSIFICATION
class_weight = initial_condition.CLASS_WEIGHT

# File stem of the filter CSV that is read from filter_dir
filter_condition = f'{ar}_{year_style}_{extract_population}_reg_num_top_{top_p_or_num[0]}_{top_p_or_num[1]}_{region_corporation}'
# File stem of the TCI CSV that is read from ../../data/processed/internal/tech/
input_condition = f'{ar}_{year_style}_{extract_population}_{top_p_or_num[0]}_{top_p_or_num[1]}_{region_corporation}_{applicant_weight}_{classification}_{class_weight}'



In [None]:
# Show the file stem used to locate the filter CSV.
filter_condition

In [None]:
# Whole study window: load the patent table with only the columns this
# notebook needs.
all_df = pd.read_csv(f'{data_dir}japan.csv',
                     encoding='utf-8',
                     sep=',',
                     usecols=['reg_num',
                              region_corporation, 'right_person_addr',
                              f'{ar}_{year_style}',
                              f'{classification}'],
                     dtype={'reg_num': str,
                            region_corporation: str,
                            f'{ar}_{year_style}': np.int64,
                            f'{classification}': str})

# Keep rows whose year lies in [year_start, year_end] and drop exact duplicate
# rows. The chain is parenthesized rather than joined with trailing
# backslashes, so a comment or blank line after it cannot break the statement.
all_df = (
    all_df[all_df[f'{ar}_{year_style}'].isin(range(year_start, year_end + 1))]
    .drop_duplicates()
)
# display(all_df.head())


# Per-period slices, keyed by a 'first-last' year label. These are taken
# before any weight columns are added to all_df.
sep_year_df_dict = {}

for year in range(year_start, year_end + 1, year_range):
    sep_year_df_dict[f'{year}-{year + year_range - 1}'] = all_df[all_df[f'{ar}_{year_style}'].isin(range(year, year + year_range))]


In [None]:
# Inspect the loaded, year-filtered table.
all_df

In [None]:
# Weighting by patent classification (whole study window).
# The aggregation cell divides by a 'class_weight' column, so an unrecognized
# mode is rejected here instead of surfacing later as a KeyError.
# NOTE(review): weight.by_classification is presumably what adds
# 'class_weight' in 'fraction' mode — confirm.
if class_weight == 'fraction':
    all_df = weight.by_classification(all_df, region_corporation, classification)
elif class_weight == 'duplication':
    all_df['class_weight'] = 1
else:
    raise ValueError(
        f"unsupported class_weight {class_weight!r}; expected 'fraction' or 'duplication'"
    )
all_df[f'{ar}_{year_style}_period'] = f'{year_start}-{year_end}'


# Weighting for joint applications (whole study window).
# The aggregation cell also divides by 'applicant_weight'; same fail-fast rule.
if applicant_weight == 'fraction':
    all_df = weight.by_applicant(all_df, region_corporation)
elif applicant_weight == 'duplication':
    all_df['applicant_weight'] = 1
else:
    raise ValueError(
        f"unsupported applicant_weight {applicant_weight!r}; expected 'fraction' or 'duplication'"
    )
# Set the period label again after the helper call.
# NOTE(review): the first assignment is kept in case weight.by_applicant relies
# on the column being present — confirm whether both are needed.
all_df[f'{ar}_{year_style}_period'] = f'{year_start}-{year_end}'
all_df


In [None]:
# Filtering: load the (period, region_corporation) pairs listed in the filter
# file; every column is read as a string.
reg_num_filter_df = pd.read_csv(
    f'{filter_dir}{filter_condition}.csv',
    sep=',',
    encoding='utf-8',
    dtype=str,
    usecols=[f'{ar}_{year_style}_period', region_corporation],
)
reg_num_filter_df

In [None]:
# Keep only the rows whose (period, region_corporation) pair appears in the
# filter file. The key column is `region_corporation` — the same column the
# filter CSV was loaded with — rather than a hard-coded name.
df = pd.merge(all_df, reg_num_filter_df,
              on=[f'{ar}_{year_style}_period', region_corporation],
              how='inner')

# Replace the registration id with each row's weighted patent count, then total
# it per (period, prefecture address, classification), largest first.
all_reg_num_df = df.copy()
all_reg_num_df['reg_num'] = 1 / all_reg_num_df['class_weight'] / all_reg_num_df['applicant_weight']
all_reg_num_df = (
    all_reg_num_df
    .groupby([f'{ar}_{year_style}_period', 'right_person_addr', classification])[['reg_num']]
    .sum()
    .reset_index(drop=False)
    .sort_values(['reg_num'], ascending=[False])
)
all_reg_num_df
# Per-period variant (currently disabled):
# sep_year_reg_num_df_dict = sep_year_df_dict.copy()
# for period, sep_year_reg_num_df in sep_year_reg_num_df_dict.items():
#     sep_year_reg_num_df['reg_num'] = 1 / sep_year_reg_num_df['class_weight'] / sep_year_reg_num_df['applicant_weight']
#     sep_year_reg_num_df = sep_year_reg_num_df.groupby([f'{ar}_{year_style}_period', region_corporation, classification])[['reg_num']]\
#                                              .sum().reset_index(drop=False)\
#                                              .sort_values(['reg_num'], ascending=[False])
#     sep_year_reg_num_df_dict[period] = sep_year_reg_num_df
# sep_year_reg_num_df = pd.concat([sep_year_reg_num_df for sep_year_reg_num_df in sep_year_reg_num_df_dict.values()], axis='index', ignore_index=True)
# sep_year_reg_num_df

In [None]:
# Column roles passed to ecomplexity as `cols_input`.
trade_cols = dict(
    time=f'{ar}_{year_style}_period',
    loc='right_person_addr',
    prod=classification,
    val='reg_num',
)
# Output columns are renamed from eci/pci to kci/tci.
rename_col_dict = dict(eci='kci', pci='tci')
# Columns kept from the ecomplexity result, in display order.
col_order_list = [
    f'{ar}_{year_style}_period',
    'right_person_addr',
    classification,
    'reg_num',
    'rca',
    'mcp',
    'diversity',
    'ubiquity',
    'kci',
    'tci',
]


In [None]:
def kh_ki(c_df, classification, n=19):
    """Append iterated averages ``kh_1..kh_{n+1}`` and ``ki_1..ki_{n+1}`` to ``c_df``.

    Only rows with ``mcp == 1`` contribute to the sums.

    * ``kh_1``: per ``right_person_addr``, sum of ``ubiquity`` divided by ``diversity``.
    * ``ki_1``: per ``classification`` value, sum of ``diversity`` divided by ``ubiquity``.
    * ``kh_{k+1}``: per ``right_person_addr``, sum of ``ki_k`` divided by ``diversity``.
    * ``ki_{k+1}``: per ``classification`` value, sum of ``kh_k`` divided by ``ubiquity``.

    Parameters
    ----------
    c_df : pandas.DataFrame
        Must contain ``right_person_addr``, the ``classification`` column,
        ``mcp``, ``diversity`` and ``ubiquity``. It is not modified.
    classification : str
        Name of the classification column.
    n : int, default 19
        Number of iterations performed after the first pair of columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by
        ``kh_1, ki_1, kh_2, ki_2, ...`` in that order.
    """
    region_key = 'right_person_addr'

    def _reflect(frame, group_key, source_col, target_col, scale_col):
        # Sum `source_col` over the mcp == 1 rows of each group, attach the
        # total to every row of that group, then divide it by `scale_col`.
        totals = (
            frame.loc[frame['mcp'] == 1]
            .groupby([group_key])[[source_col]]
            .sum()
            .reset_index(drop=False)
            .rename(columns={source_col: target_col})
        )
        joined = frame.merge(totals, on=[group_key], how='left')
        joined[target_col] = joined[target_col] / joined[scale_col]
        return joined

    result = _reflect(c_df, region_key, 'ubiquity', 'kh_1', 'diversity')
    result = _reflect(result, classification, 'diversity', 'ki_1', 'ubiquity')
    for step in range(1, n + 1):
        # Each side is built from the other side's previous-step column.
        result = _reflect(result, region_key, f'ki_{step}', f'kh_{step + 1}', 'diversity')
        result = _reflect(result, classification, f'kh_{step}', f'ki_{step + 1}', 'ubiquity')
    return result


In [None]:
# Run the complexity computation on the aggregated counts.
c_df = ecomplexity(all_reg_num_df, cols_input=trade_cols, rca_mcp_threshold=1)

# Keep positive counts only, rename eci/pci to kci/tci, and fix the column order.
c_df = (
    c_df[c_df['reg_num'] > 0]
    .rename(columns=rename_col_dict)
    [col_order_list]
)

# Add the kh_* / ki_* columns one period at a time, then stack the periods.
c_df = pd.concat(
    [
        kh_ki(c_df[c_df[f'{ar}_{year_style}_period'] == period_label], classification)
        for period_label in c_df[f'{ar}_{year_style}_period'].unique()
    ],
    axis='index',
    ignore_index=True,
)

# Swap the numeric field code for its English name from the Schmoch lookup.
c_df[classification] = c_df[classification].astype(int)
schmoch_df = pd.read_csv(
    f'{ex_dir}35.csv',
    sep=',',
    encoding='utf-8',
    usecols=['Field_number', 'Field_en'],
).drop_duplicates()

c_df = (
    c_df.merge(schmoch_df, left_on=[classification], right_on=['Field_number'], how='left')
    .drop(columns=['Field_number', classification])
    .rename(columns={'Field_en': classification})
)

# Prefecture-side TCI table used by the comparison below.
pre_df = c_df[[f'{ar}_{year_style}_period', classification, 'tci']].copy()

In [None]:
# Load the previously computed TCI table and keep the three columns needed for
# the comparison (period, field, tci).
co_df = pd.read_csv(
    f'../../data/processed/internal/tech/{input_condition}.csv',
    sep=',',
    encoding='utf-8',
)
co_df = co_df[[f'{ar}_{year_style}_period', classification, 'tci']]
co_df

In [None]:
# Load the EU reference table; its 'schmoch35' and 'schmoch5' columns are used
# later as a field-to-sector lookup.
eu_df = pd.read_csv(
    '../../data/processed/external/abroad/eu.csv',
    sep=',',
    encoding='utf-8',
)
eu_df

In [None]:
# Put the prefecture-based and corporation-based TCI side by side per
# (period, field).
pre_co_df = pd.merge(pre_df.rename(columns={'tci': 'pre_tci'}),
                     co_df.rename(columns={'tci': 'co_tci'}),
                     on=[f'{ar}_{year_style}_period', classification],
                     how='inner')

# Attach the 5-sector label. The lookup is de-duplicated first so repeated
# rows in eu_df do not multiply the rows of the result.
# NOTE: this join assumes the classification column is named 'schmoch35'.
pre_co_df = pd.merge(pre_co_df,
                     eu_df[['schmoch35', 'schmoch5']].drop_duplicates(),
                     on='schmoch35', how='left')

# Harmonize sector labels with the keys used for plot colours, and pin the
# sector of two fields explicitly.
pre_co_df['schmoch5'] = pre_co_df['schmoch5'].replace('Mechanical engineering', 'Mechanical engineering, machinery')
pre_co_df['schmoch5'] = pre_co_df['schmoch5'].replace('Chemistry', 'Chemistry, pharmaceuticals')
pre_co_df['schmoch5'] = np.where(pre_co_df['schmoch35']=='Machine tools',
                                 'Mechanical engineering, machinery',
                                 np.where(pre_co_df['schmoch35']=='Analysis of biological materials',
                                          'Instruments',
                                          pre_co_df['schmoch5']))

# De-duplicate only AFTER the labels are harmonized: rows that differed solely
# by their original sector label are identical now, and must collapse to one
# row before ranks (and the correlation in the plots) are computed.
pre_co_df = pre_co_df.drop_duplicates().reset_index(drop=True)

# Min-max scale both indices to 0-100 (duplicates do not affect min/max, so the
# scaled values are the same as when scaling before de-duplication).
pre_co_df['co_tci'] = (pre_co_df['co_tci'] - pre_co_df['co_tci'].min()) / (pre_co_df['co_tci'].max() - pre_co_df['co_tci'].min()) * 100
pre_co_df['pre_tci'] = (pre_co_df['pre_tci'] - pre_co_df['pre_tci'].min()) / (pre_co_df['pre_tci'].max() - pre_co_df['pre_tci'].min()) * 100

# Rank 1 = highest index; ties share the smallest rank.
pre_co_df['co_tci_rank'] = pre_co_df['co_tci'].rank(ascending=False, method='min')
pre_co_df['pre_tci_rank'] = pre_co_df['pre_tci'].rank(ascending=False, method='min')
pre_co_df


In [None]:
# Rows of all_df whose right_person_name does NOT appear in the filtered
# frame `df`.
sample_df = all_df[~(all_df['right_person_name'].isin(df['right_person_name']))].copy()

# Swap the numeric field code for its English name, then attach the sector label.
sample_df[classification] = sample_df[classification].astype(int)
sample_df = pd.merge(sample_df, schmoch_df, left_on=[classification], right_on=['Field_number'], how='left').drop(columns=['Field_number', classification]).rename(columns={'Field_en': classification})
sample_df = pd.merge(sample_df, eu_df[['schmoch35', 'schmoch5']], on='schmoch35', how='left')\
                [[classification, 'schmoch5', 'reg_num', 'right_person_name', 'right_person_addr']]

# Same sector-label harmonization as for the comparison table.
sample_df['schmoch5'] = sample_df['schmoch5'].replace('Mechanical engineering', 'Mechanical engineering, machinery')
sample_df['schmoch5'] = sample_df['schmoch5'].replace('Chemistry', 'Chemistry, pharmaceuticals')
sample_df['schmoch5'] = np.where(sample_df['schmoch35']=='Machine tools',
                                 'Mechanical engineering, machinery',
                                 np.where(sample_df['schmoch35']=='Analysis of biological materials',
                                          'Instruments',
                                          sample_df['schmoch5']))
print(sample_df['schmoch35'].unique())

# Only a cell's last expression is rendered automatically, so each summary
# table is shown explicitly with display() (provided by the Jupyter kernel).
# 1) 'Digital communication' only: distinct counts per (field, prefecture).
display(sample_df[sample_df['schmoch35']=='Digital communication'].groupby([classification, 'right_person_addr'], as_index=False).nunique().sort_values(['right_person_name'], ascending=[False]))
# 2) Distinct counts per field.
display(sample_df.groupby([classification], as_index=False).nunique().sort_values(['right_person_name'], ascending=[False]))
# 3) For every field, the (field, prefecture) row with the most distinct reg_num.
display(sample_df.groupby([classification, 'right_person_addr'], as_index=False).nunique().sort_values(['reg_num', classification], ascending=False).drop_duplicates(subset=[classification], keep='first'))

In [None]:
# Spot-check the rows of a single technology field.
c_df[(c_df[classification]=='Civil engineering')]

In [None]:
# a: number of distinct prefectures that appear for each field, largest first.
a = (
    c_df.groupby([classification])[['right_person_addr']]
    .nunique()
    .sort_values(['right_person_addr'], ascending=[False])
)
# b: one (field, ubiquity, tci) row per distinct combination, highest tci first.
b = (
    c_df[[classification, 'ubiquity', 'tci']]
    .drop_duplicates()
    .sort_values(['tci'], ascending=[False])
)
# c: both side by side; decrease_rate is 1 minus ubiquity over the prefecture count.
c = a.merge(b, on=[classification], how='inner').sort_values(['tci'], ascending=[False])
c['decrease_rate'] = 1 - (c['ubiquity'] / c['right_person_addr'])
c

In [None]:
# Review the comparison table ordered by co_tci, highest first.
# The de-duplicated, sorted view is only displayed; pre_co_df itself is not changed.
pre_co_df.drop_duplicates().reset_index(drop=True).sort_values(['co_tci'], ascending=[False])

In [None]:
df_dict = {}
# Marker colour for each sector label found in pre_co_df['schmoch5'].
tech_color = {
        'Chemistry, pharmaceuticals': 'red',
        'Electrical engineering': 'blue',
        'Instruments': 'green',
        'Mechanical engineering, machinery': 'orange',
        'Other fields': 'gray'
    }
# One scatter plot per entry.
combi_dict = {  # ind: [x, y, title, xlabel, ylabel, legend_loc]
    1: ["co_tci", "pre_tci", "relation between the TCIs in Japanese Corporations and Prefectures", "Corporations（period：1981-2010 fiscal year）", "Prefectures（period：1981-2010 fiscal year）", "center", ],
    2: ["co_tci_rank", "pre_tci_rank", "relation between the TCI rankings in Japanese Corporations and Prefectures", "Corporations（period：1981-2010 fiscal year）", "Prefectures（period：1981-2010 fiscal year）", "center", ],
}
# Note: these rcParams changes are global and stay in effect for later cells.
plt.rcParams['font.size'] = 24
plt.rcParams['font.family'] = 'Meiryo'
for i, combi in combi_dict.items():
    x_col, y_col, title, xlabel, ylabel, legend_loc = combi
    fig, ax = plt.subplots(figsize=(8, 8))

    # Pearson correlation between the two plotted columns, shown in the title.
    period = f"{year_start}-{year_end}"
    corr_num = round(pre_co_df[x_col].corr(pre_co_df[y_col]), 3)
    print(period, corr_num)

    # Dashed guide lines at the mean of each axis.
    ax.axvline(x=pre_co_df[x_col].mean(), color="gray", linestyle="--", )
    ax.axhline(y=pre_co_df[y_col].mean(), color="gray", linestyle="--", )
    ax.set_title(title+'(corr=' + r"$\bf{" + str(corr_num)+ "}$" +')\n')
    if x_col in ["reg_num"]: ax.set_xscale("log")
    if y_col in ["reg_num"]: ax.set_yscale("log")

    # Entry 5 (not currently defined in combi_dict) is drawn without the
    # per-sector points; this guard is kept from the original cell.
    if i != 5:
        # One scatter call per sector so that each gets its own legend entry.
        for tech_color_key in tech_color.keys():
            sector_rows = pre_co_df[pre_co_df['schmoch5']==tech_color_key]
            ax.scatter(sector_rows[x_col], sector_rows[y_col],
                       color=tech_color[tech_color_key], label=tech_color_key,
                       s=60)

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    # The legend ignores `fontsize` whenever `prop` is passed, so the size is
    # given inside `prop` to make the intended 20 pt bold text take effect.
    ax.legend(loc=legend_loc, bbox_to_anchor=(1.65, 0.5), borderaxespad=0,
              prop={'weight': 'bold', 'size': 20})
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)
