In [1]:
import networkx as nx
from tqdm import tqdm
from leafer import Leafer

import pandas as pd

In [2]:
pwd

'/home/jovyan/work/DataInterface'

In [3]:
G = nx.read_edgelist("IsA_Graph2.edgelist", delimiter=" ", create_using=nx.DiGraph)

l = Leafer(G)
iterator = l.leafs_generator()

In [4]:
next(iterator)

('allograph',
 {0: {'character', 'signature'},
  1: {'adult',
   'attribute',
   'character_string_token_inscribed',
   'hand_written_document',
   'icon',
   'name',
   'portrayal',
   'primitive_type',
   'recommendation',
   'repute',
   'sheet',
   'spiritual_strength',
   'trait',
   'tune',
   'what_in_dark',
   'work_of_aesthetic_interest',
   'written_symbol'}})

### DataSet

In [5]:
def get_hyper(word, relation):

    zero_lvl = [hyper.replace('_', ' ') for hyper in relation[0]]
    one_lvl = [hyper.replace('_', ' ') for hyper in relation[1]]
    
    output_one = [[word.replace('_', ' '), '\t'.join(zero_lvl + one_lvl)]]
    return pd.DataFrame(output_one, columns=['word', 'hypernym'])

In [6]:
DATASET = pd.DataFrame(columns=['word', 'hypernym'])

In [7]:
G = nx.read_edgelist("IsA_Graph2.edgelist", delimiter=" ", create_using=nx.DiGraph)

l = Leafer(G)
iterator = l.leafs_generator()

In [None]:
while 1:
    try:
        word, relation = next(iterator)
        DATASET = pd.concat([DATASET, get_hyper(word, relation)])
        DATASET.to_csv('custom_dataset.csv', index=False)
                
    except StopIteration:
        break

In [14]:
df = pd.read_csv('custom_dataset.csv')
df

Unnamed: 0,word,hypernym
0,allograph,character\tsignature\tportrayal\trecommendatio...
1,asterisk,character\tsoftware\tlabour intensive enterpri...
2,check character,character\tportrayal\trecommendation\tattribut...
3,double dagger,character\tportrayal\trecommendation\tattribut...
4,every person,character\tchild of two people\tportrayal\trec...
...,...,...
89345,xml schema datatype int,xml schema datatype long\tinteger extent
89346,yemeni fils,yemeni monetary unit\tmonetary unit
89347,yugo gv,yugo\tyugoslavian car\tcar
89348,zip code nine digit,zip code\tnumber string\tcode\tpostal code


In [37]:
import re 
patter = re.compile('[A-Za-z ]')

bad_words_mask = df.word.apply(lambda x: len(re.sub(r"[A-Za-z0-9',. ]+", '', x)) > 0)

In [38]:
df.loc[bad_words_mask, :]

Unnamed: 0,word,hypernym
2074,dihomo Î³ linolenic acid,chemical compound\tfacet collection\tchemical ...
2276,frÃ©my's salt,chemical compound\tfacet collection\tchemical ...
4264,Ã©lysÃ©e palace,building\twhere people might live\tcreation\tr...
4463,dÃ©classÃ©e,film\twebsite\tway of recording moving images\...
6905,15 Â° c calorie,calorie\twork unit\tunit of energy
...,...,...
82448,drÃšents,dutch low saxon\tlow german
82580,dÃ¶hle body,inclusion body\tbody
86682,kÃ¶lsch,ripuarian\tcentral franconian
88097,Ï,ordinal\tlimit ordinal\tinfinity\tnon gradable...


In [43]:
bad_hyper_mask = df.hypernym.apply(lambda x: len(re.sub(r"[A-Za-z0-9',.\t ]+", '', x)) > 0)

In [44]:
df.loc[bad_hyper_mask, :]

Unnamed: 0,word,hypernym
880,onion bulb,living thing\tonion\tlayered thing\tbulb\ttang...
3952,peppermint,plant kingdom\tplant\tmint\talbum\tactor\tplan...
14551,coriander,spice\therb\tplant\therb plant\therbaceous pla...
16153,spearmint,plant\tmint\tcandy\tactor\tbuilding complex\tf...
17977,chaste tree,tree\tè§£è¡šè¥\tfigure\tliving thing\tfixed o...
...,...,...
78800,variegated horsetail,horsetail\tfern ally\therb\tè§£è¡šè¥
78801,wood horsetail,horsetail\tfern ally\therb\tè§£è¡šè¥
83115,mahuang,ephedra\tshrub\tgnetophytes\tè§£è¡šè¥
83777,when engaged significant other,fiance\tfiancÃ©\tbetrothed


In [53]:
final_mask = (bad_words_mask) | (bad_hyper_mask)

In [55]:
df[~final_mask].to_csv('dataset.csv', index=False)

In [58]:
dataset = pd.read_csv('dataset.csv')

In [60]:
dataset

Unnamed: 0,word,hypernym
0,allograph,character\tsignature\tportrayal\trecommendatio...
1,asterisk,character\tsoftware\tlabour intensive enterpri...
2,check character,character\tportrayal\trecommendation\tattribut...
3,double dagger,character\tportrayal\trecommendation\tattribut...
4,every person,character\tchild of two people\tportrayal\trec...
...,...,...
89021,xml schema datatype int,xml schema datatype long\tinteger extent
89022,yemeni fils,yemeni monetary unit\tmonetary unit
89023,yugo gv,yugo\tyugoslavian car\tcar
89024,zip code nine digit,zip code\tnumber string\tcode\tpostal code


In [4]:
dataset = pd.read_csv('dataset.csv')

In [29]:
dataset = dataset.sample(frac=1).reset_index(drop=True)

In [24]:
from sklearn.model_selection import train_test_split

In [34]:
train_val = dataset[:-1000].copy()

In [43]:
X_train, X_test, y_train, y_test = train_test_split(train_val.word, 
                                                    train_val.hypernym, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [51]:
import os
os.mkdir('data')

In [53]:
train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
train.to_csv('data/train.csv', index=False)

In [52]:
validation = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
validation.to_csv('data/validation.csv', index=False)

In [54]:
test = dataset[-1000:].reset_index(drop=True).copy()
test.to_csv('data/test.csv', index=False)