In [1]:
import copy
import numpy as np
import os
import pandas as pd
pd.options.display.max_rows = 8

## 使い方
- pathを変更する

In [2]:
paths = ['tfidf_text/0thLayer','tfidf_text/1stLayer','tfidf_text/2ndLayer']

In [3]:
all_nodes = set()
link_dict = dict()
link_df = pd.DataFrame(columns=['from','to','value'])

querys = [os.path.splitext(x)[0] for x in os.listdir('tfidf_text/0thLayer')]
for path in paths:
    next_querys = []
    for query in querys:
        if not query.startswith('.'):
            all_nodes.add(query)
            link_dict[query] = []
            records = []
            try:
                file = open('{0}/{1}.txt'.format(path,query),'r')
            except FileNotFoundError:
                continue
            lines = file.readlines()
            for i in range(5): #TF値上位5個
                to_query = lines[i].split(',')[0][2:-1]
                value = float(lines[i].split(',')[1][1:-2])
                if not query == to_query:  #セルフノード削除(例外処理)
                    all_nodes.add(to_query)
                    link_dict[query].append(to_query)
                    records.append([query, to_query, value])
                    next_querys.append(to_query)
            df_ = pd.DataFrame(records,columns=link_df.columns)
            link_df = link_df.append(df_)
            
    querys = next_querys

link_df = link_df.reset_index(drop=True)

In [4]:
# link_df.to_csv('links.csv',encoding='utf-8',index=False)

In [5]:
#link_df = pd.read_csv('links.csv')

In [6]:
link_df

Unnamed: 0,from,to,value
0,慶應義塾大学,研究,0.025952
1,慶應義塾大学,学部,0.022282
2,慶應義塾大学,月,0.020959
3,慶應義塾大学,学生,0.020782
...,...,...,...
443,学科,社会,0.041647
444,学科,専門,0.032136
445,学科,福祉,0.030243
446,学科,紹介,0.028814


初期化

In [7]:
len(link_dict)

84

In [8]:
len(all_nodes)

214

In [9]:
len(link_df)

447

In [10]:
tmp = pd.DataFrame()
tmp['from'] = link_df['to']
tmp['to'] = link_df['from']
tmp['value'] = link_df['value']
link_df = link_df.append(tmp)
link_df.reset_index(drop=True)

new_link_dict = copy.deepcopy(link_dict)
for key,value in link_dict.items():
    for val in value:
        if val in link_dict.keys():
            new_link_dict[val].append(key)
        else:
            new_link_dict[val] = [key]
link_dict = new_link_dict

In [11]:
univs = {
    "東京大学":"from_ut",
    "京都大学":"from_ky",
    "早稲田大学":"from_wa",
    "慶應義塾大学":"from_ko",
    "明治大学":"from_me"
}

df版

In [12]:
# node_df = pd.DataFrame(
#     index=range(len(all_nodes)), 
#     columns=[
#         'query',
#         'from_ut',
#         'from_ky',
#         'from_wa',
#         'from_ko',
#         'from_me',
# #         'label',
# #         'shortest_step'
#     ])
# node_df['query'] = list(all_nodes)
# node_df = node_df.fillna(float(99))

In [13]:
# for key, value in univs.items():
#     node_df.loc[node_df['query'] == key, value] = 0

In [14]:
# from_querys = univs.keys()

In [15]:
# next_step = link_df[link_df['from'].isin(from_querys)]

In [16]:
# for from_query in from_querys:
#     value = node_df.loc[node_df['query'] == from_query,univs.values()]
#     for to_query in link_dict[from_query]:
#         for column in univs.values():
#             node_df.loc[node_df['query'] == to_query, column] = value

dict版

In [17]:
def check_layer(from_querys,node_dict,link_dict,univs):
    for from_query in from_querys:
        distance_dict = copy.deepcopy(node_dict[from_query])
        for key in distance_dict.keys():
            distance_dict[key] += 1
        try:
            for to_query in link_dict[from_query]:
                for key in univs.values():
                    node_dict[to_query][key] = min(distance_dict[key],node_dict[to_query][key])
        except KeyError:
            pass
    
    return node_dict

In [18]:
def get_next_from_querys(link_df,from_querys):
    next_step = link_df[link_df['from'].isin(from_querys)]
    
    return list(next_step['to'])

In [19]:
node_dict = {}
for query in all_nodes:
    node_dict[query] = {
    "from_ut":99,
    "from_ky":99,
    "from_wa":99,
    "from_ko":99,
    "from_me":99
}

#初期化
node_dict["東京大学"]["from_ut"] = 0
node_dict["京都大学"]["from_ky"] = 0
node_dict["早稲田大学"]["from_wa"] = 0
node_dict["慶應義塾大学"]["from_ko"] = 0
node_dict["明治大学"]["from_me"] = 0

In [20]:
#1層目
from_querys = list(univs.keys())

In [21]:
for x in range(30):
    node_dict = check_layer(from_querys,node_dict,link_dict,univs)
    from_querys = get_next_from_querys(link_df,from_querys)

In [22]:
list_of_list = []
for key, value in node_dict.items():
    list_of_list.append([key,value['from_ut'],value['from_ky'],value['from_wa'],value['from_ko'],value['from_me']])
node_df = pd.DataFrame(list_of_list,columns=['query','from_ut','from_ky','from_wa','from_ko','from_me'])

In [23]:
node_df['shortest_step'] = np.min(node_df[list(univs.values())],axis=1)

In [24]:
for column in ['ut','ky','wa','ko','me']:
    node_df['label_{}'.format(column)] = node_df['from_{}'.format(column)] == node_df['shortest_step']

In [25]:
univ_df = pd.DataFrame()
univ_df = univ_df.append(node_df[node_df['query'] == '東京大学']).append(node_df[node_df['query'] == '京都大学'].append(node_df[node_df['query'] == '早稲田大学'])).append(node_df[node_df['query'] == '慶應義塾大学']).append(node_df[node_df['query'] == '明治大学'])

In [26]:
univ_df

Unnamed: 0,query,from_ut,from_ky,from_wa,from_ko,from_me,shortest_step,label_ut,label_ky,label_wa,label_ko,label_me
36,東京大学,0,2,2,2,4,0,True,False,False,False,False
162,京都大学,2,0,2,2,3,0,False,True,False,False,False
91,早稲田大学,2,2,0,2,2,0,False,False,True,False,False
171,慶應義塾大学,2,2,2,0,2,0,False,False,False,True,False
41,明治大学,4,3,2,2,0,0,False,False,False,False,True


In [27]:
node_df = node_df.drop(univ_df.index)

In [28]:
node_df = univ_df.append(node_df).reset_index(drop=True)

In [29]:
node_df['id'] = node_df.index

In [30]:
node_df

Unnamed: 0,query,from_ut,from_ky,from_wa,from_ko,from_me,shortest_step,label_ut,label_ky,label_wa,label_ko,label_me,id
0,東京大学,0,2,2,2,4,0,True,False,False,False,False,0
1,京都大学,2,0,2,2,3,0,False,True,False,False,False,1
2,早稲田大学,2,2,0,2,2,0,False,False,True,False,False,2
3,慶應義塾大学,2,2,2,0,2,0,False,False,False,True,False,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,気象,5,3,4,4,4,3,False,True,False,False,False,210
211,掲載,3,5,5,5,7,3,True,False,False,False,False,211
212,問い合わせ,4,4,3,3,2,2,False,False,False,False,True,212
213,試合,3,3,2,2,2,2,False,False,True,True,True,213


In [31]:
def get_node_index(x):
    number = node_df[node_df['query']==x]['id']
    return int(number)

In [32]:
link_df['from_id'] = link_df['from'].map(get_node_index)
link_df['to_id'] = link_df['to'].map(get_node_index)

In [33]:
link_df

Unnamed: 0,from,to,value,from_id,to_id
0,慶應義塾大学,研究,0.025952,3,69
1,慶應義塾大学,学部,0.022282,3,147
2,慶應義塾大学,月,0.020959,3,109
3,慶應義塾大学,学生,0.020782,3,8
...,...,...,...,...,...
443,社会,学科,0.041647,44,33
444,専門,学科,0.032136,134,33
445,福祉,学科,0.030243,102,33
446,紹介,学科,0.028814,178,33


In [34]:
# node_df.to_csv('nodes5.csv')
# # link_df = link_df.iloc[:int(len(link_df)/2)]
link_df.to_csv('links_final.csv')

In [35]:
# node_df = pd.read_csv('nodes5.csv', index_col=0)
# node_df

In [36]:
import random

def set_label(x):
    label = []
    univ = ['ut','ky','wa','ko','me']
    for i in range(len(univ)):
        if x['label_{}'.format(univ[i])]:
            label.append(i)
    return random.choice(label)

In [37]:
shortest_steps = sorted(node_df.shortest_step.unique())
if shortest_steps[-1] == 99:
    shortest_steps = shortest_steps[:-1]
shortest_steps

[0, 1, 2, 3, 4, 5]

In [38]:
label_dict = {}
# link_dict = {}
for shortest_step in shortest_steps:
    step_node_df = node_df[node_df.shortest_step == shortest_step]
    for _, row in step_node_df.iterrows():
        label = []
        for i in range(len(univ)):
            if row['label_{}'.format(univ[i])]:
                label.append(i)
        if shortest_step == 0:
            r = random.choice(label)
            label_dict[row['id']] = r
        elif shortest_step == 1:
            r = random.choice(label)
            label_dict[row['id']] = r
        else:
            if len(label) == 1:
                label_dict[row['id']] = label[0]
            else:
                from_ids = link_df[link_df.to_id == row['id']].from_id
                decided_nodes = set(label_dict.keys())
                min_shortest_step_id = -1
                min_step = -1
                for from_id in from_ids:
                    if from_id in decided_nodes:
                        s = node_df[node_df.id == from_id].shortest_step.values[0]
                        if min_shortest_step_id == -1:
                            min_shortest_step_id = from_id
                            min_step = s
                        else:
                            if s < min_step:
                                min_shortest_step_id = from_id
                                min_step = s
                label_dict[row['id']] = label_dict[min_shortest_step_id]

NameError: name 'univ' is not defined

In [39]:
label_df = pd.DataFrame.from_dict(label_dict, orient='index').reset_index().rename(columns={'index':'id', 0:'label'})
label_df

Unnamed: 0,id


In [40]:
node_with_label_df = pd.merge(node_df, label_df, on='id')
node_with_label_df

Unnamed: 0,query,from_ut,from_ky,from_wa,from_ko,from_me,shortest_step,label_ut,label_ky,label_wa,label_ko,label_me,id


In [328]:
node_with_label_df.to_csv('nodes_final.csv', index=False)

## 以下不要

In [None]:
node_df['label'] = 

In [86]:
node_df['label'] = node_df.apply(set_label,axis=1)

In [87]:
node_df.head(9)

Unnamed: 0,query,from_ut,from_ky,from_wa,from_ko,from_me,shortest_step,label_ut,label_ky,label_wa,label_ko,label_me,id,label
0,東京大学,0,2,2,2,4,0,True,False,False,False,False,0,0
1,京都大学,2,0,2,2,3,0,False,True,False,False,False,1,1
2,早稲田大学,2,2,0,2,2,0,False,False,True,False,False,2,2
3,慶應義塾大学,2,2,2,0,2,0,False,False,False,True,False,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,卒業生,4,4,2,3,3,2,False,False,True,False,False,5,2
6,受験,5,5,3,5,5,3,False,False,True,False,False,6,2
7,校舎,4,4,2,4,4,2,False,False,True,False,False,7,2
8,円,5,5,3,5,5,3,False,False,True,False,False,8,2


In [88]:
node_df.to_csv('nodes5labels.csv', index=False)

In [40]:
labeled_df = node_df.loc[:,['index','label']]

In [41]:
labeled_df

Unnamed: 0,index,label
0,0,0
1,1,1
2,2,2
3,3,3
...,...,...
210,210,2
211,211,2
212,212,2
213,213,2


In [42]:
labeled_df.to_csv('labels.csv')