In [1]:
import pandas as pd
import numpy as np
import sys
from tqdm.notebook import tqdm as tqdm_notebook
from transformers import DistilBertTokenizer, logging
import spacy
from spacy import displacy
import torch
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy pipeline named 'en_core_web_lg'; the model package must
# already be installed in the kernel's environment for this call to succeed.
nlp = spacy.load('en_core_web_lg')
# Prepend the project source folders to the import path in one step.
# Resulting order: '../src/features/' is searched first, then '../src/models/'.
sys.path[:0] = ['../src/features/', '../src/models/']

In [2]:
# Restrict the transformers logger to error-level messages.
logging.set_verbosity_error()
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Use the first CUDA device when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Project-local helpers; they resolve through the '../src/models/' and
# '../src/features/' entries added to sys.path in the first cell.
from predict_model import load_simBERT
from build_features import similarity_matrix as vector_values

# NOTE(review): load_simBERT is opaque from here - presumably it returns the
# fine-tuned similarity model; confirm in predict_model.
model = load_simBERT()

In [3]:
# Read the tab-separated, Latin-1 encoded PalmTraits table and key it by the
# 'SpecName' column so rows can be looked up with df.loc[<species name>].
df = (
    pd.read_csv(
        '../data/external/PalmTraits_1.0.txt',
        sep='\t',
        encoding='Latin-1',
    )
    .set_index('SpecName')
)

In [41]:
# Three hand-written stem descriptions of increasing length, used to probe the
# similarity model. Adjacent literals are concatenated by the parser, so each
# variable holds one single-line string.
string1 = 'The stem is erect.'
string2 = (
    'The stem is erect. '
    'Erect slender. '
    'The stem is ringed. '
    'Ringed scar.'
)
string3 = (
    'The stem is erect. '
    'The stem diameter is 9 centimeter. '
    'The stem height is 9 meter. '
    'The stem is solitary. '
    'The stem is not armed. '
    'The stem is not climbing.'
)

In [42]:
# Run the three test sentences through the project's `similarity_matrix`
# helper (imported as `vector_values`), then take the cosine similarity of the
# result against itself to get every pairwise score.
# NOTE(review): assumes the helper returns one vector per input string - confirm.
doc1_tok = vector_values(
    [string1, string2, string3],
    model=model,
)
matrix = cosine_similarity(X=doc1_tok, Y=doc1_tok)

In [43]:
# Display the pairwise cosine-similarity matrix for string1, string2, string3.
matrix

array([[0.99999976, 0.90476704, 0.9101722 ],
       [0.90476704, 1.0000001 , 0.9194746 ],
       [0.9101722 , 0.9194746 , 0.9999999 ]], dtype=float32)

In [15]:
# Load the pickled triples for the palm dataset.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it was produced by a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('../data/description/triples_palm_dataset.pkl', 'rb') as triples_file:
    triples_dict = pickle.load(triples_file)

In [16]:
# Materialise the keys of triples_dict as a list so they can be indexed.
palm_list = [*triples_dict.keys()]

In [17]:
# Use the first key of triples_dict as the running example.
palm = palm_list[0]

In [18]:
# Display the first entry of palm_list.
palm_list[0]

'Actinokentia divaricata'

In [19]:
# Show the PalmTraits row for the selected entry (df is indexed by 'SpecName').
df.loc[palm]

accGenus                       Actinokentia
accSpecies                       divaricata
PalmTribe                           Areceae
PalmSubfamily                    Arecoideae
Climbing                                  0
Acaulescent                               0
Erect                                     1
StemSolitary                              1
StemArmed                                 0
LeavesArmed                               0
MaxStemHeight_m                           9
MaxStemDia_cm                             9
UnderstoreyCanopy                    canopy
MaxLeafNumber                             7
Max_Blade_Length_m                     2.35
Max_Rachis_Length_m                     2.1
Max_Petiole_length_m                    1.3
AverageFruitLength_cm                   2.6
MinFruitLength_cm                       1.8
MaxFruitLength_cm                       3.4
AverageFruitWidth_cm                    1.1
MinFruitWidth_cm                        0.8
MaxFruitWidth_cm                

In [44]:
#triples_dict[palm]

In [51]:
# Toy nested dict for experimenting with lookups: each outer key gets its own
# (separate) inner dict with the same two entries.
d = {outer: {'b1': 'c1', 'b2': 'c2'} for outer in ('a1', 'a2')}

In [52]:
# Nested lookup: .get returns the value stored under 'b1' in d['a1'],
# or None if that key is absent.
d['a1'].get('b1')

'c1'

In [53]:
# Walk the outer keys of `d` whose inner dict has a truthy 'b1' entry.
# print() is required here: a bare expression inside a loop body is evaluated
# and discarded, so the notebook would otherwise show no output for this cell.
for k in [key for key, inner in d.items() if inner.get('b1')]:
    print(k)